Code Example #1
File: gonzales.py Project: MazenAly/datamining
def gonzales(data , k):
    #transform the data numpy array to a data frame, using the id column as the index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #add two columns to the points data frame to store each point's assigned center and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choose the first point as the initial center (random sampling is left commented out below)

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 =     points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times, adding one new center per pass; the extra final center is dropped at the end, leaving k centers
    for k_cycle in range(1,k+1):
        # variables to track the next center, chosen as the point farthest from its assigned center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables to track the closest center and its distance
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
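Note: the example above relies on pandas APIs that have since been deprecated or removed (as_matrix, .ix, set_value, DataFrame.append). Below is a minimal sketch of the inner nearest-center assignment using current pandas and SciPy, assuming the same layout as the snippet (coordinate columns named 0 and 1); the function name is illustrative, not part of the original project.

import pandas as pd
from scipy import spatial

def assign_to_centers(points: pd.DataFrame, centers: pd.DataFrame) -> pd.DataFrame:
    # pairwise Euclidean distances: one row per point, one column per center
    dists = spatial.distance.cdist(points[[0, 1]].to_numpy(), centers[[0, 1]].to_numpy())
    points = points.copy()
    points["distance"] = dists.min(axis=1)                   # distance to the closest center
    points["center"] = centers.index[dists.argmin(axis=1)]   # index label of that center
    return points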
Code Example #2
    def test_setitem_cache_updating(self):
        # GH 5424
        cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']

        for do_ref in [False, False]:
            df = DataFrame({'a': cont,
                            "b": cont[3:] + cont[:3],
                            'c': np.arange(7)})

            # ref the cache
            if do_ref:
                df.loc[0, "c"]

            # set it
            df.loc[7, 'c'] = 1

            assert df.loc[0, 'c'] == 0.0
            assert df.loc[7, 'c'] == 1.0

        # GH 7084
        # not updating cache on series setting with slices
        expected = DataFrame({'A': [600, 600, 600]},
                             index=date_range('5/7/2014', '5/9/2014'))
        out = DataFrame({'A': [0, 0, 0]},
                        index=date_range('5/7/2014', '5/9/2014'))
        df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]})

        # loop through df to update out
        six = Timestamp('5/7/2014')
        eix = Timestamp('5/9/2014')
        for ix, row in df.iterrows():
            out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D']

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out['A'], expected['A'])

        # try via a chain indexing
        # this actually works
        out = DataFrame({'A': [0, 0, 0]},
                        index=date_range('5/7/2014', '5/9/2014'))
        for ix, row in df.iterrows():
            v = out[row['C']][six:eix] + row['D']
            out[row['C']][six:eix] = v

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out['A'], expected['A'])

        out = DataFrame({'A': [0, 0, 0]},
                        index=date_range('5/7/2014', '5/9/2014'))
        for ix, row in df.iterrows():
            out.loc[six:eix, row['C']] += row['D']

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out['A'], expected['A'])
Code Example #3
File: analyser.py Project: Sinderella/OSINT
    def compute_tf_idf_queries(self):
        # Find the total number of documents
        results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, total_word, path FROM documents')
        tmp = results.fetchall()
        documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
        documents_df['tf_idf'] = 0.0

        no_docterm = {}

        for query in self.queries:
            no_docterm[query] = 0

        for index, row in documents_df.iterrows():
            path = row['path']
            with codecs.open(path, 'rt') as f:
                text = f.read()
                for query in self.queries:
                    if query in text.decode('utf-8').lower():
                        no_docterm[query] += 1

        for query in self.queries:
            for index, row in documents_df.iterrows():
                total_word = row['total_word']
                path = row['path']

                with codecs.open(path, 'rt') as f:
                    text = f.read()

                tf_idf = self._compute_tf_idf_queries(text, total_word, total_doc, no_docterm[query])
                cur_tf_idf = documents_df.get_value(index, 'tf_idf')
                documents_df.set_value(index, 'tf_idf', cur_tf_idf + tf_idf)

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df['tf_idf'] = 0.0

        for index, row in df.iterrows():
            did = row['did']
            tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
            df.set_value(index, 'tf_idf', tf_idf)

        del df['did']
        df = df.groupby(['e_type', 'entity']).sum().reset_index()
        return df
Code Example #4
File: results.py Project: sernst/airplane_boarding
def create_seated(
        settings: dict,
        passengers: pd.DataFrame) -> pd.DataFrame:
    """
    :param settings:
        Configuration settings for the current trial
    :param passengers:
        The passengers data frame for the trial
    """

    passenger_index = []
    seat_names = []
    seated_time = []

    for index, passenger in passengers.iterrows():
        passenger_index.append(index)
        seat_names.append(
            '{}{}'.format(passenger['aisle'], passenger['letter'])
        )
        seated_time.append(None)

    return pd.DataFrame({
        'passenger': passenger_index,
        'seat': seat_names,
        'time': seated_time
    })
Code Example #5
File: plots.py Project: scienceopen/pyrinex
def receiver_locations(locs: pandas.DataFrame):
    if not isinstance(locs, pandas.DataFrame):
        return

    if cartopy is not None:
        ax = figure().gca(projection=cartopy.crs.PlateCarree())

        ax.add_feature(cpf.LAND)
        ax.add_feature(cpf.OCEAN)
        ax.add_feature(cpf.COASTLINE)
        ax.add_feature(cpf.BORDERS, linestyle=':')
    else:
        ax = figure().gca()

    for name, loc in locs.iterrows():
        if 15 <= loc.interval < 30:
            c = 'g'
        elif 5 <= loc.interval < 15:
            c = 'orange'
        elif loc.interval < 5:
            c = 'r'
        else:  # large or undefined interval
            c = 'b'

        if np.isfinite(loc.interval):
            ax.scatter(loc.lon, loc.lat, s=1000*1/loc.interval, c=c, label=name)
        else:
            ax.scatter(loc.lon, loc.lat, c=c, label=name)
Code Example #6
File: Aggregator.py Project: PWuJuveOKC/stat_agg
 def predict(self, prediction_data):
   df = DataFrame(prediction_data)
   ret = []
   for row in df.iterrows():
     index, data = row
     ret += [self.agg(data.tolist())]
   return(ret)
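The same per-row aggregation can be written without iterrows(); a sketch assuming, as in the snippet above, that self.agg accepts a plain list of values:

import pandas as pd

def predict(self, prediction_data):
    df = pd.DataFrame(prediction_data)
    # apply the aggregation function to each row (axis=1) instead of iterating
    return df.apply(lambda row: self.agg(row.tolist()), axis=1).tolist()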
Code Example #7
    def test_pivot_index_with_nan(self):
        # GH 3588
        nan = np.nan
        df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
                        'b': ['C1', 'C2', 'C3', 'C4'],
                        'c': [10, 15, 17, 20]})
        result = df.pivot('a', 'b', 'c')
        expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
                              [nan, 15, nan, nan], [nan, nan, nan, 20]],
                             index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
                             columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T)

        # GH9491
        df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'),
                        'c': 100 + np.arange(6)})
        df['b'] = df['a'] - pd.Timestamp('2014-02-02')
        df.loc[1, 'a'] = df.loc[3, 'a'] = nan
        df.loc[1, 'b'] = df.loc[4, 'b'] = nan

        pv = df.pivot('a', 'b', 'c')
        self.assertEqual(pv.notnull().values.sum(), len(df))

        for _, row in df.iterrows():
            self.assertEqual(pv.loc[row['a'], row['b']], row['c'])

        tm.assert_frame_equal(df.pivot('b', 'a', 'c'), pv.T)
Code Example #8
File: agg_tasks.py Project: vhpgomes/rhizome
    def sum_of_parts(self):
        '''
        For more info on this see:
        https://github.com/unicef/rhizome/blob/master/docs/spec.rst#aggregation-and-calculation

        '''

        ## get the indicator_ids we need to make the calculation ##
        initial_calc_df = self.build_calc_df(['PART_TO_BE_SUMMED'])

        ## handle recursive calculations ( see spec.rst link above ) ##
        calc_df = self.build_recursive_sum_calc_df(initial_calc_df)

        self_join_calc_df = calc_df.merge(calc_df, left_on =\
            'indicator_component_id',right_on='calc_indicator_id',how='left')

        ## get the datapoints for the above indicator_ids ##
        dp_df = self.build_dp_df(calc_df['indicator_component_id'])

        ## now join the above dataframe on itself to set up the calculation ##
        dp_df_with_calc = self.join_dp_to_calc(calc_df, dp_df)

        ## take the sum of all of the component indicators ##
        grouped_df = DataFrame(dp_df_with_calc.merge(dp_df_with_calc)\
            .groupby(['location_id','calc_indicator_id','campaign_id',])\
            ['value'].sum())

        for ix, row_data in grouped_df.iterrows():
            self.dwc_tuple_dict[ix] = row_data.value
Code Example #9
File: S01-first.py Project: sernst/cauldron
def create_unified_column(data_frame: pd.DataFrame) -> pd.Series:
    unified = [
        '-'.join(to_strings(row.to_dict().values()))
        for _, row in data_frame.iterrows()
    ]

    return pd.Series(unified)
Code Example #10
def resolve(dataset, m):
    t = dataset.y
    phis = DataFrame()
    for i in range(0,m+1):
        p = dataset.x**i
        p.name="x**%d" % i
        phis = pd.concat([phis,p], axis=1)

    for index, line in phis.iterrows():
        phi = DataFrame(line)
        if index == 0:
            phiphi = np.dot(phi,phi.T)
        else:
            phiphi += np.dot(phi,phi.T)
    s_inv = alpha * DataFrame(np.identity(m+1)) + beta * phiphi
    s = np.linalg.inv(s_inv)

    # mean m(x)
    def mean_fun(x0):
        phi_x0 = DataFrame([x0 ** i for i in range(0,m+1)])
        for index, line in phis.iterrows():
            if index == 0:
                tmp = t[index] * line
            else:
                tmp += t[index] * line
        return (beta * np.dot(np.dot(phi_x0.T, s), DataFrame(tmp))).flatten()

    # standard deviation s(x)
    def deviation_fun(x0):
        phi_x0 = DataFrame([x0 ** i for i in range(0,m+1)])
        deviation = np.sqrt(1.0/beta + np.dot(np.dot(phi_x0.T, s), phi_x0))
        return deviation.diagonal()

    return mean_fun, deviation_fun
Code Example #11
def test_age(df: DataFrame):
    sub = 0
    for index, row in df.iterrows():
        name = row['Name']
        age = row['Age']
        if not math.isnan(age):
            if age <= 8:
                res = 'kid'
            elif age <= 30:
                res = 'young'
            elif age <= 45:
                res = 'middle'
            else:
                res = 'old'
        else:
            if match_name(name, r".*Master\..*"):
                res = 'kid'
            elif match_name(name, r".*Miss\..*"):
                res = 'young'
            elif match_name(name, r".*Mr(s)?\..*"):
                res = 'middle'
            else:
                res = 'young'
        df.loc[sub, 'Age'] = res
        sub += 1
    return df
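A vectorized sketch of the same age bucketing, assuming the usual Titanic 'Age' and 'Name' columns; the function name and the use of pd.cut/np.select are illustrative replacements for the explicit loop above, not part of the original.

import numpy as np
import pandas as pd

def bucket_age(df: pd.DataFrame) -> pd.DataFrame:
    # numeric ages fall into fixed bins; missing ages are inferred from the title in 'Name'
    by_age = pd.cut(df['Age'], bins=[-np.inf, 8, 30, 45, np.inf],
                    labels=['kid', 'young', 'middle', 'old']).astype(object)
    by_title = np.select(
        [df['Name'].str.contains(r'Master\.', na=False),
         df['Name'].str.contains(r'Miss\.', na=False),
         df['Name'].str.contains(r'Mrs?\.', na=False)],
        ['kid', 'young', 'middle'],
        default='young')
    df['Age'] = by_age.where(df['Age'].notna(), by_title)
    return df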
Code Example #12
File: frame_methods.py Project: changhiskhan/pandas
class Iteration(object):

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
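For context on why iterrows and itertuples are benchmarked separately, here is a small standalone timing sketch (sizes and repeat counts are assumptions, not part of the benchmark file); itertuples is typically much faster because it avoids building a Series for every row.

import timeit
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10_000, 10))
t_rows = timeit.timeit(lambda: list(df.iterrows()), number=3)
t_tups = timeit.timeit(lambda: list(df.itertuples()), number=3)
print(f"iterrows: {t_rows:.3f}s  itertuples: {t_tups:.3f}s")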
Code Example #13
File: Aggregator.py Project: PWuJuveOKC/stat_agg
 def predict(self, prediction_data):
   preds = DataFrame(prediction_data)
   ret = []
   for row in preds.iterrows():
     index, data = row
     ret.append(mean(data))
   return(ret)
Code Example #14
File: draw.py Project: sernst/tracksim-analysis
def make_lines(tracks: pd.DataFrame, transformation: dict) -> list:
    def make_line(start: pd.Series, end: pd.Series) -> str:
        return create_tag('line', {
            'stroke': 'rgba(0, 0, 0, 0.2)',
            'stroke-width': '2',
            'stroke-dasharray': '5,5',
            'x1': transform_x(start['x'], transformation),
            'y1': transform_y(start['y'], transformation),
            'x2': transform_x(end['x'], transformation),
            'y2': transform_y(end['y'], transformation)
        })

    previous = pd.Series(dict(
        x=transformation['x_min'],
        y=tracks.iloc[0]['y']
    ))
    lines = []

    for index, row in tracks.iterrows():
        lines.append(make_line(previous, row))
        previous = row

    if previous['x'] < transformation['x_max']:
        lines.append(make_line(previous, pd.Series(dict(
            x=transformation['x_max'],
            y=previous['y']
        ))))

    return lines
Code Example #15
File: test_pivot.py Project: ChristopherShort/pandas
    def test_pivot_index_with_nan(self):
        # GH 3588
        nan = np.nan
        df = DataFrame({"a": ["R1", "R2", nan, "R4"], "b": ["C1", "C2", "C3", "C4"], "c": [10, 15, 17, 20]})
        result = df.pivot("a", "b", "c")
        expected = DataFrame(
            [[nan, nan, 17, nan], [10, nan, nan, nan], [nan, 15, nan, nan], [nan, nan, nan, 20]],
            index=Index([nan, "R1", "R2", "R4"], name="a"),
            columns=Index(["C1", "C2", "C3", "C4"], name="b"),
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df.pivot("b", "a", "c"), expected.T)

        # GH9491
        df = DataFrame({"a": pd.date_range("2014-02-01", periods=6, freq="D"), "c": 100 + np.arange(6)})
        df["b"] = df["a"] - pd.Timestamp("2014-02-02")
        df.loc[1, "a"] = df.loc[3, "a"] = nan
        df.loc[1, "b"] = df.loc[4, "b"] = nan

        pv = df.pivot("a", "b", "c")
        self.assertEqual(pv.notnull().values.sum(), len(df))

        for _, row in df.iterrows():
            self.assertEqual(pv.loc[row["a"], row["b"]], row["c"])

        tm.assert_frame_equal(df.pivot("b", "a", "c"), pv.T)
Code Example #16
File: waiting.py Project: sernst/airplane_boarding
def calculate(settings: dict, progress: pd.DataFrame):
    """

    :param settings:
    :param progress:
    :return:
    """

    passenger_count = settings['passenger_count']

    waiting = []

    previous_row = None
    for elapsed_time, row in progress.iterrows():
        waiting.append(0)
        for passenger_index in range(passenger_count):
            if previous_row is None:
                continue

            position = row[str(passenger_index)]
            last_position = previous_row[str(passenger_index)]

            if position == last_position:
                waiting[-1] += 1

        previous_row = row
        waiting[-1] = 100.0 * waiting[-1] / passenger_count
Code Example #17
File: flightgear.py Project: csindle/gps2fgfp
def convert2flightplan(df: pd.DataFrame):
    """
    Convert dataframe to Flight Gear Flight Plan.
    """

    # Feet above sea level
    df['fasl'] = df['masl'] * 3.28084

    # Knots are nm (1852 m) per hour.
    df['knots'] = (df['dm'] / 1852) / (df['dt_sec'] / 60 / 60)

    rv = HEADER
    prev_state = STOP
    # Take off at Vr (stop ignoring altitude).
    df['state'] = df.apply(lambda x: flight_state(x['knots']), axis=1)

    for index, row in df.iterrows():
        state = row['state']
        if state in (TAXI, RUNWAY, AERO,):
            # Only output when really moving.

            rv += WPT.format(**row, ground='true' if state in (STOP, TAXI, RUNWAY,) else 'false')

            if state == TAXI and prev_state == STOP:
                # Landed new flight.
                rv += FOOTER + HEADER + "<!-- -lat={lat} -lon={lon} -->".format(**row)
        prev_state = state

    rv += FOOTER
    return rv
Code Example #18
File: repository.py Project: rlugojr/git-pandas
    def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                                  workers=1, ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. The index is a datetime, with one column per
        committer; the data are the number of lines blamed to each committer at each timestamp.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optional, default=None) a list of globs to include, default of None includes everything.
        :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core.
        :return: DataFrame

        """

        if not _has_joblib:
            raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use
            cumulative_blame() instead.''')

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        revisions = json.loads(revs.to_json(orient='index'))
        revisions = [revisions[key] for key in revisions]

        ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
            delayed(_parallel_cumulative_blame_func)
            (self, x, committer, ignore_globs, include_globs) for x in revisions
        )

        revs = DataFrame(ds)
        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        revs = revs.ix[keep_idx]
        revs.sort_index(ascending=False, inplace=True)

        return revs
Code Example #19
File: Utilities.py Project: benrifkind/Integration
def write_dialer(filepath: str, dialer: pd.DataFrame):
    """
    Write to fixed-width dialer format - expects each column of the data frame to be left-justified with no need for character padding
    line endings are carriage returns in windows - FIX??
    """
    with open(filepath, 'w') as f:
        for i, row in dialer.iterrows():
            f.write(''.join(row.tolist()) + "\n")
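A sketch of the same fixed-width write without iterrows(), assuming every column already holds left-justified strings as the docstring states; opening the file with newline='\n' is one possible answer to the carriage-return question noted above.

import pandas as pd

def write_dialer(filepath: str, dialer: pd.DataFrame) -> None:
    # concatenate each row's fields and write one line per record
    lines = dialer.astype(str).apply(''.join, axis=1)
    with open(filepath, 'w', newline='\n') as f:
        f.write('\n'.join(lines) + '\n')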
Code Example #20
def set_cabin(df: DataFrame):
    for index, row in df.iterrows():
        # is nan
        if isinstance(row['Cabin'], float):
            df.loc[index, ['Cabin']] = 'X'
        else:
            df.loc[index, ['Cabin']] = row['Cabin'][0]
    df['Cabin'] = df['Cabin'].astype('object')
    return df
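An equivalent vectorized sketch, assuming the usual Titanic 'Cabin' column in which missing values are NaN floats (which is why the original checks isinstance(row['Cabin'], float)):

import pandas as pd

def set_cabin(df: pd.DataFrame) -> pd.DataFrame:
    # keep the first letter of the cabin code, or 'X' when the value is missing
    df['Cabin'] = df['Cabin'].str[0].fillna('X').astype('object')
    return df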
Code Example #21
    def fillna_dict(cls, prop):
        """
        Use trade history, then fill empty values with the value from the row above
        """
        df = DataFrame(prop)
        df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
        df = df.fillna(method='ffill')

        return [r.to_dict() for k, r in df.iterrows()]
Code Example #22
File: ground_truth.py Project: t7reyeslua/NILM-Loc
 def save_to_file(self, fn):
     gg = DataFrame(self.power_series_apps_table)
     try:
         del gg['diff1']
         del gg['diff2']
     except Exception:
         print('')
         
     gg['Loc Events'] = self.loc.events_apps_1min['Apps']
     apps = self.loc.metadata.get_channels()
     sd = {}
     #Initialize series with 0s
     for app in apps:
         sd[app] = Series(0, index=gg.index)
         
     #Count location events for each appliance
     for index, row in gg.iterrows():
         try:
             if len(row['Loc Events']) > 0:
                 for app in apps:
                     n = row['Loc Events'].count(app)
                     sd[app][index] = n
         except Exception:
             continue
     
     if self.loc.name == 'REDD':
         sd[(3,4)] = sd[3]
         sd[(10,20)] = sd[10]
         del sd[3]
         del sd[4]
         del sd[10]
         del sd[20]
       
      #Change column names and append them to the general table
     locevents = DataFrame(sd)
     locevents.columns = [(str(col) + ' locEv') for col in locevents]        
     for locEv in locevents:
         gg[locEv] = locevents[locEv]
         
     
     #Get power values of each appliance and resample for 1min
     act = DataFrame(self.loc.appliances_consuming_times)
     act = act.resample('1Min')
            
     if self.loc.name == 'REDD':
         del act[3]
         del act[10]
         act.columns = [(3,4), 5,6,7,8,9,11,12,13,14,15,16,17,18,19,(10,20)]
     act.columns = [(str(col) + ' conEv') for col in act]
     
     for app in act:
         gg[app] = act[app]        
     gg.columns = [str(col) for col in gg]
     gg = gg[sorted(gg.columns)]
     gg.to_csv(fn)   
     return
Code Example #23
File: draw.py Project: sernst/tracksim-analysis
def make_circles(tracks: pd.DataFrame, transformation: dict) -> list:
    def make_circle(track: pd.Series):
        return create_tag('circle', {
            'r': 16,
            'cx': transform_x(track['x'], transformation),
            'cy': transform_y(track['y'], transformation),
            'style': 'fill:{}'.format(get_color(track, tracks))
        })

    return [make_circle(row) for index, row in tracks.iterrows()]
Code Example #24
File: test_api.py Project: bwignall/pandas
 def test_iterrows_corner(self):
     # gh-12222
     df = DataFrame(
         {'a': [datetime.datetime(2015, 1, 1)], 'b': [None], 'c': [None],
          'd': [''], 'e': [[]], 'f': [set()], 'g': [{}]})
     expected = Series(
         [datetime.datetime(2015, 1, 1), None, None, '', [], set(), {}],
         index=list('abcdefg'), name=0, dtype='object')
     _, result = next(df.iterrows())
     tm.assert_series_equal(result, expected)
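A quick standalone illustration of the behaviour this test checks: with mixed column types, iterrows() yields object-dtype rows, so the values come back uncoerced.

import datetime
import pandas as pd

df = pd.DataFrame({'a': [datetime.datetime(2015, 1, 1)], 'b': [None], 'd': ['']})
_, row = next(df.iterrows())
print(row.dtype)                         # object: mixed columns are not coerced to a common dtype
print(row['b'] is None, repr(row['d']))  # True ''  (None and the empty string survive unchanged)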
Code Example #25
def word_freq(file_name, suffix='_wordfreq', sep='\t', threshold=.5):
	print "start word_freq"
	# start = datetime.datetime.now()
	# print start
	reviews = pd.read_csv(file_name, error_bad_lines=False, sep=sep)
	cb = reviews['stopword_body']
	rate = reviews['Rating']
	# label all words with the rating
	cb_temp = []
	for i, c in enumerate(cb):
		cb_temp.append([(w, rate[i]) for w in ast.literal_eval(c)])
	reviews['stopword_body'] = cb_temp
	# calculate_time(start)
	# get the corpus of all reviews, lists of all words with label
	'''--------------------------------------------------------'''
	cop_wl = []
	for b in cb_temp:
		# change the unicode data to the raw string
		# cop_wl += [(unicodedata.normalize('NFKD', w[0]).encode('utf-8','replace'), w[1]) for w in b if type(w[0])==unicode]
		cop_wl += b
	'''--------------------------------------------------------'''
	# calculate_time(start)
	# word frequency of the corpus with label
	wfq = nltk.FreqDist(cop_wl)
	# calculate_time(start)
	# get the word list of all reviews without label
	cop = [w[0] for w in cop_wl]
	cop = set(cop)
	cop_len = len(cop)
	# calculate_time(start)
	# get freq of all words in one list
	wfq_l = []
	for w in cop:
		for i in range(1, 6):
			wfq_l.append(wfq[(w, i)])

	# calculate_time(start)
	# reshape the list to a matrix
	wfq_mx = DataFrame(np.array(wfq_l).reshape((cop_len,5)), index=pd.Index(cop), columns=pd.Index([1,2,3,4,5]))
	# calculate_time(start)
	# calculate the prob of each rating
	w_s = []
	w_sum = []
	for i, r in wfq_mx.iterrows():
		word_sum = wfq_mx.ix[i].sum()
		# wfq_mx.ix[i] = wfq_mx.ix[i]/word_sum
		w_s.append(word_useful_score(list(wfq_mx.ix[i]), word_sum))
		w_sum.append(word_sum)

	wfq_mx['score'] = w_s
	wfq_mx['sum'] = w_sum
	wfq_mx = wfq_mx.sort(columns='sum').ix[-int(len(w_s) * threshold):,:]
	print wfq_mx
	wfq_mx.to_csv(file_name.split('.')[0] + suffix + '.' + file_name.split('.')[1], sep='\t')
Code Example #26
def record_match_data(min_seq):
    matches = db.match.find({"match_seq_num": { '$gt': min_seq } })
    # import pdb; pdb.set_trace()
    for match in matches:
        if match["human_players"] == 10 and match["duration"] > 1200:

            data_frame = DataFrame(match["players"])
            radiant_heroes = data_frame[data_frame['player_slot']<128]['hero_id'].tolist()
            dire_heroes = data_frame[data_frame['player_slot']>=128]['hero_id'].tolist()
            for index, row in data_frame.iterrows():
                if (row["player_slot"] < 128):
                    radiant_heroes.remove(row["hero_id"])
                    teammate = radiant_heroes
                    opponent = dire_heroes
                    is_win = bool(match["radiant_win"])
                else:
                    dire_heroes.remove(row["hero_id"])
                    teammate = dire_heroes
                    opponent = radiant_heroes
                    is_win = not bool(match["radiant_win"])
                record_json = json.loads(row.to_json())
                record_json['win'] = is_win
                record_json['match_id'] = match['match_id']
                record_json['match_seq'] = match['match_seq_num']
                record_json['teammate'] = teammate
                record_json['opponent'] = opponent

                item = []

                for x in range(0,6):
                    if record_json["item_{}".format(x)]>0:
                        if "item_{}" in record_json:
                            item.append(record_json["item_{}".format(x)])
                            del record_json["item_{}".format(x)]
                        if "item_{}_name" in record_json:
                            del record_json["item_{}_name".format(x)]

                record_json['item'] = item

                count = statics_db.match_record.find({'$and':[{'hero_id':row['hero_id']},{'match_id':match['match_id']}]}).count()
                if count == 0:
                    statics_db.match_record.insert_one(record_json)
            max_solved_seq_num = max(statics_db.max_solved_seq_num.find({"value_name":"max_solved_seq_num"})[0]["value"],match["match_seq_num"])
            statics_db.max_solved_seq_num.update_one(
                {"value_name":"max_solved_seq_num"},
                {
                    "$set":
                    {
                        "value":max_solved_seq_num
                    },
                    "$currentDate": {"lastModified": True}
                }
            )
            logging.info("match handle:"+str(max_solved_seq_num))
Code Example #27
File: nba_stats.py Project: ltiao/nba_stats_bot_old
 def request_player_info_1(self, response):
     r_json = json.loads(response.body_as_unicode())
     result_set = r_json[u'resultSets'][0]
     df = DataFrame(data=result_set[u'rowSet'], columns=result_set[u'headers']).set_index('PERSON_ID')
     for id_, data in df.iterrows():
         p = PlayerItem()
         p['nba_player_id'] = id_
         p['nba_player_code'] = data['PLAYERCODE']
         p['is_active'] = bool(data['ROSTERSTATUS'])
         yield FormRequest(
             url = 'http://stats.nba.com/stats/commonplayerinfo/',
             method = 'GET',
             formdata = {'PlayerID': str(id_)},
             meta = dict(player=p),
             callback = self.request_player_info_2
         )
Code Example #28
File: Aggregator.py Project: PWuJuveOKC/stat_agg
 def train(self, training_data):
   preds = DataFrame(training_data['prediction'])
   preds['actual'] = training_data['actual']
   pred_cols = len(training_data['prediction'].keys())
   results = DataFrame()
   for row in preds.iterrows():
    index, data = row 
    results = results.append( data[range(pred_cols)] == data['actual'] )
   for k in preds[range(pred_cols)]:
     self.weights[k] = 1/variance(1-results[k])
   # If we have infinite weights, make them 2x the sum of the others
   if any(x == inf for x in self.weights.values()):
     tot_weight = sum( [x for x in self.weights.values() if x != inf] )
     for wk in self.weights:
       if self.weights[wk] == inf:
         self.weights[wk] = 2*tot_weight
Code Example #29
    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=['foo', 'bar', 'baz', 'qux'])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df.append(dicts, ignore_index=True)
        expected = df.append(df, ignore_index=True)
        assert_frame_equal(result, expected)

        # different columns
        dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
                 {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
        result = df.append(dicts, ignore_index=True, sort=True)
        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        assert_frame_equal(result, expected)
Code Example #30
File: test_scalar.py Project: changhiskhan/pandas
    def test_mixed_index_at_iat_loc_iloc_dataframe(self):
        # GH 19860
        df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]],
                       columns=['a', 'b', 'c', 1, 2])
        for rowIdx, row in df.iterrows():
            for el, item in row.iteritems():
                assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item

        for row in range(2):
            for i in range(5):
                assert df.iat[row, i] == df.iloc[row, i] == row * 5 + i

        with pytest.raises(KeyError):
            df.at[0, 3]
        with pytest.raises(KeyError):
            df.loc[0, 3]
Code Example #31
File: geo_map.py Project: jonzarecki/coord2vec
    def load_wkt_layer_from_dataframe(self, df: DataFrame, wkt_column_name: str,
                                      color: Union[str, List[str]] = '#0078d7',
                                      fill_color: Union[str, List[str]] = '#0048a7',
                                      fill_alpha: float = 0.2,
                                      group_name=None,
                                      change_bounds_on_click=False,
                                      pop_up: bool = True):
        """
        loads additional layer to the map

        Args:
                df (Pandas dataframe,default=False)
                    dataframe with at least one geographic column

                wkt_column_name (str,default=False)
                    the name of the geometry column to show
                    the geometries should be in the format of wkt string
                    if you are using oracle, select the sdo_geometry with the oracle function sdo_util.to_wktgeometry

                color (str,default=True)
                    the color to use when drawing the geoms on the map
                    examples - blue,white,#0078d7, #9999d9

                fill_color(str,default=True)
                    the color to fill when its complex geometry as polygon

                fill_alpha(float)
                    the opacity of the fill color, between 0 and 1

                group_name: (str,default=None)
                    will take geometries and create a group in the LayerControl, gives a specific name for the group.
                    If None then doesn't group the geometries.

                change_bounds_on_click: whether a mouse-click changes the map bounds to fit the object

                pop_up: whether a mouse-click on the object will open a pop-up
        """
        part_func = lambda x, color_index: {'color': color[color_index] if type(color) is not str else color,
                                            'fillColor': fill_color[color_index] if type(
                                                fill_color) is not str else fill_color,
                                            'fillOpacity': fill_alpha}

        object_to_add = self.map
        if group_name is not None:
            object_to_add = FeatureGroup(name=group_name)

        for index, row in df.iterrows():
            geom_dict = wkt.loads(row[wkt_column_name])
            shp_geom = loads(row[wkt_column_name])
            row_formatted = ""
            for index_c, column in enumerate(row.index.values):
                if column == wkt_column_name:
                    pass
                else:
                    row_formatted += "<b>{}</b>: {} <br/>".format(column, row[column])
            row_formatted += "<b>{}</b>: {} <br/>".format("GEOM CENTROID", shp_geom.centroid)
            feature = (folium.GeoJson if change_bounds_on_click else NoClickGeoJson)(
                            geom_dict, style_function=partial(part_func, color_index=index))
            if pop_up:
                popup = folium.Popup(row_formatted.replace("'", "\""))
                popup.add_to(feature)
            feature.add_to(object_to_add)

        if group_name is not None:
            object_to_add.add_to(self.map)
Code Example #32
class DifferentialFVA(StrainDesignMethod):
    r"""Differential flux variability analysis.

    Compares flux ranges of a reference model to a set of models that
    have been parameterized to lie on a grid of evenly spaced points in the
    n-dimensional production envelope (n being the number of reaction bounds
    to be varied).
    ::
        production
        ^
        |---------.          * reference_model
        | . . . . .\         . design_space_model
        | . . . . . \
        | . . . . . .\
        | . . . . . . \
        o--------------*- >
                     growth

    Overexpression, downregulation, knockout, flux-reversal and other
    strain engineering targets can be inferred from the resulting comparison.

    Parameters
    ----------
    design_space_model : cobra.Model
        A model whose flux ranges will be scanned.
    objective : str or Reaction or Metabolite
        A reaction whose flux or a metabolite whose production should be maximized.
    variables : iterable, optional
        An iterable of n reactions (or IDs) to be scanned (defaults to the current objective in design_space_model).
    reference_model : cobra.Model, optional
        A model whose flux ranges represent the reference state and all calculated
        flux ranges will be compared to. Defaults to design_space_model constrained
        to its maximum objective value.
    exclude : iterable
        An iterable of reactions (or IDs) to be excluded in the analysis (exchange
        reactions will not be analyzed automatically).
    normalize_ranges_by : str or Reaction, optional
        A reaction ID; all calculated flux ranges will be normalized by that
        reaction's flux.
    points : int, optional
        Number of points to lay on the surface of the n-dimensional production envelope (defaults to 10).

    Examples
    --------
    >>> from cameo import models
    >>> from cameo.strain_design.deterministic import DifferentialFVA
    >>> model = models.bigg.e_coli_core
    >>> reference_model = model.copy()
    >>> reference_model.reactions.Biomass_Ecoli_core_w_GAM.lower_bound = reference_model.optimize().objective_value
    >>> diffFVA = DifferentialFVA(design_space_model=model,
                          reference_model=reference_model,
                          objective=model.reactions.EX_succ_e,
                          variables=[model.reactions.Biomass_Ecoli_core_w_GAM],
                          normalize_ranges_by=model.reactions.Biomass_Ecoli_core_w_GAM,
                          points=10)
    >>> result = diffFVA.run(surface_only=True)
    >>> result.plot()
    """
    def __init__(self,
                 design_space_model,
                 objective,
                 variables=None,
                 reference_model=None,
                 exclude=(),
                 normalize_ranges_by=None,
                 points=10):
        super(DifferentialFVA, self).__init__()

        self.design_space_model = design_space_model
        self.design_space_nullspace = nullspace(
            create_stoichiometric_array(self.design_space_model))
        if reference_model is None:
            self.reference_model = self.design_space_model.copy()
            fix_objective_as_constraint(self.reference_model)
            self.reference_nullspace = self.design_space_nullspace
        else:
            self.reference_model = reference_model
            self.reference_nullspace = nullspace(
                create_stoichiometric_array(self.reference_model))

        if isinstance(objective, Reaction):
            self.objective = objective.id
        elif isinstance(objective, Metabolite):
            try:
                self.reference_model.add_boundary(objective, type='demand')
            except ValueError:
                pass
            try:
                self.objective = self.design_space_model.add_boundary(
                    objective, type='demand').id
            except ValueError:
                self.objective = self.design_space_model.reactions.get_by_id(
                    "DM_" + objective.id).id
        elif isinstance(objective, six.string_types):
            self.objective = objective
        else:
            raise ValueError(
                'You need to provide an objective as a Reaction, Metabolite or a reaction id'
            )

        if variables is None:
            # try to establish the current objective reaction
            obj_var_ids = [
                variable.name for variable in
                self.design_space_model.objective.expression.free_symbols
            ]
            obj_var_ids = [re.sub('_reverse.*', '', id) for id in obj_var_ids]
            if len(set(obj_var_ids)) != 1:
                raise ValueError(
                    "The current objective in design_space_model is not a single reaction objective. "
                    "DifferentialFVA does not support composite objectives.")
            else:
                self.variables = [
                    self.design_space_model.reactions.get_by_id(
                        obj_var_ids[0]).id
                ]
        else:
            self.variables = list()
            for variable in variables:
                if isinstance(variable, Reaction):
                    self.variables.append(variable.id)
                else:
                    self.variables.append(variable)

        self.exclude = list()
        for elem in exclude:
            if isinstance(elem, Reaction):
                self.exclude.append(elem.id)
            else:
                self.exclude.append(elem)

        design_space_blocked_reactions = find_blocked_reactions_nullspace(
            self.design_space_model, self.design_space_nullspace)
        self.exclude += [
            reaction.id for reaction in design_space_blocked_reactions
        ]

        reference_blocked_reactions = find_blocked_reactions_nullspace(
            self.reference_model, self.reference_nullspace)
        self.exclude += [
            reaction.id for reaction in reference_blocked_reactions
        ]

        self.exclude += [
            reaction.id for reaction in self.design_space_model.exchanges
        ]
        self.exclude += [
            reaction.id for reaction in self.reference_model.exchanges
        ]

        self.exclude += [
            reaction.id for reaction in self.design_space_model.reactions
            if _BIOMASS_RE_.match(reaction.id)
        ]

        self.exclude = set(self.exclude)

        self.points = points
        self.envelope = None
        self.grid = None
        self.reference_flux_ranges = None
        self.reference_flux_dist = None

        if isinstance(normalize_ranges_by, Reaction):
            self.normalize_ranges_by = normalize_ranges_by.id
        else:
            self.normalize_ranges_by = normalize_ranges_by

    @staticmethod
    def _interval_overlap(interval1, interval2):
        return min(interval1[1] - interval2[0], interval2[1] - interval1[0])

    @classmethod
    def _interval_gap(cls, interval1, interval2):
        overlap = cls._interval_overlap(interval1, interval2)
        if overlap >= 0:
            return 0
        else:
            if abs(interval1[1]) > abs(interval2[1]):
                return overlap
            else:
                return -1 * overlap

    def _init_search_grid(self, surface_only=False, improvements_only=True):
        """Initialize the grid of points to be scanned within the production envelope."""
        self.envelope = phenotypic_phase_plane(self.design_space_model,
                                               self.variables,
                                               objective=self.objective,
                                               points=self.points)
        intervals = self.envelope[[
            'objective_lower_bound', 'objective_upper_bound'
        ]].copy()
        intervals['objective_lower_bound'] = float_floor(
            intervals.objective_lower_bound, ndecimals)
        intervals['objective_upper_bound'] = float_ceil(
            intervals.objective_upper_bound, ndecimals)
        max_distance = 0.
        max_interval = None
        for i, (lb, ub) in intervals.iterrows():
            distance = abs(ub - lb)
            if distance > max_distance:
                max_distance = distance
                max_interval = (lb, ub)
        step_size = (max_interval[1] - max_interval[0]) / (self.points - 1)
        grid = list()
        minimal_reference_production = self.reference_flux_ranges[
            'lower_bound'][self.objective]
        for i, row in self.envelope.iterrows():
            variables = row[self.variables]
            lb = row.objective_lower_bound
            if improvements_only:
                lb = max(lb, minimal_reference_production) + step_size
            ub = row.objective_upper_bound
            if not surface_only:
                coordinate = lb
                while coordinate < ub:
                    grid.append(list(variables.values) + [coordinate])
                    coordinate += step_size
            if improvements_only and ub <= minimal_reference_production:
                continue
            else:
                grid.append(list(variables.values) + [ub])
        columns = self.variables + [self.objective]
        self.grid = DataFrame(grid, columns=columns)

    def run(self,
            surface_only=True,
            improvements_only=True,
            progress=True,
            view=None):
        """Run the differential flux variability analysis.

        Parameters
        ----------
        surface_only : bool, optional
            If only the surface of the n-dimensional production envelope should be scanned (defaults to True).
        improvements_only : bool, optional
            If only grid points should be scanned that constitute an improvement in production
            over the reference state (defaults to True).
        progress : bool, optional
            If a progress bar should be shown.
        view : SequentialView or MultiprocessingView or ipython.cluster.DirectView, optional
            A parallelization view (defaults to SequentialView).

        Returns
        -------
        pandas.Panel
            A pandas Panel containing a results DataFrame for every grid point scanned.
        """
        with TimeMachine() as tm:
            # Make sure that the design_space_model is restored to its original state afterwards
            for variable in self.variables:
                reaction = self.design_space_model.reactions.get_by_id(
                    variable)
                tm(do=int,
                   undo=partial(setattr, reaction, 'lower_bound',
                                reaction.lower_bound))
                tm(do=int,
                   undo=partial(setattr, reaction, 'upper_bound',
                                reaction.upper_bound))
            target_reaction = self.design_space_model.reactions.get_by_id(
                self.objective)
            tm(do=int,
               undo=partial(setattr, target_reaction, 'lower_bound',
                            target_reaction.lower_bound))
            tm(do=int,
               undo=partial(setattr, target_reaction, 'upper_bound',
                            target_reaction.upper_bound))

            if view is None:
                view = config.default_view
            else:
                view = view

            included_reactions = [
                reaction.id for reaction in self.reference_model.reactions
                if reaction.id not in self.exclude
            ] + self.variables + [self.objective]

            self.reference_flux_dist = pfba(self.reference_model,
                                            fraction_of_optimum=0.99)

            self.reference_flux_ranges = flux_variability_analysis(
                self.reference_model,
                reactions=included_reactions,
                view=view,
                remove_cycles=False,
                fraction_of_optimum=0.75).data_frame

            self._init_search_grid(surface_only=surface_only,
                                   improvements_only=improvements_only)

            func_obj = _DifferentialFvaEvaluator(self.design_space_model,
                                                 self.variables,
                                                 self.objective,
                                                 included_reactions)
            if progress:
                progress = ProgressBar(len(self.grid))
                results = list(
                    progress(view.imap(func_obj, self.grid.iterrows())))
            else:
                results = list(view.map(func_obj, self.grid.iterrows()))

        solutions = dict((tuple(point.iteritems()), fva_result)
                         for (point, fva_result) in results)
        reference_intervals = self.reference_flux_ranges[[
            'lower_bound', 'upper_bound'
        ]].values
        for sol in six.itervalues(solutions):
            intervals = sol[['lower_bound', 'upper_bound']].values
            gaps = [
                self._interval_gap(interval1, interval2) for interval1,
                interval2 in my_zip(reference_intervals, intervals)
            ]
            sol['gaps'] = gaps
            if self.normalize_ranges_by is not None:
                normalizer = sol.lower_bound[self.normalize_ranges_by]
                if normalizer > non_zero_flux_threshold:
                    normalized_intervals = sol[['lower_bound', 'upper_bound'
                                                ]].values / normalizer

                    sol['normalized_gaps'] = [
                        self._interval_gap(interval1, interval2)
                        for interval1, interval2 in my_zip(
                            reference_intervals, normalized_intervals)
                    ]
                else:
                    sol['normalized_gaps'] = [numpy.nan] * len(sol.lower_bound)
            else:
                sol['normalized_gaps'] = gaps

        ref_upper_bound = self.reference_flux_ranges.upper_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)
        ref_lower_bound = self.reference_flux_ranges.lower_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)

        collection = list()
        for key, df in six.iteritems(solutions):
            df['biomass'] = key[0][1]
            df['production'] = key[1][1]

            df['KO'] = False
            df['flux_reversal'] = False
            df['suddenly_essential'] = False
            df['free_flux'] = False

            df.loc[(df.lower_bound == 0) & (df.upper_bound == 0) &
                   (ref_upper_bound != 0) & (ref_lower_bound != 0),
                   'KO'] = True

            df.loc[((ref_upper_bound < 0) & (df.lower_bound > 0) |
                    ((ref_lower_bound > 0) & (df.upper_bound < 0))),
                   'flux_reversal'] = True

            df.loc[((df.lower_bound <= 0) & (df.lower_bound > 0)) |
                   ((ref_lower_bound >= 0) & (df.upper_bound <= 0)),
                   'suddenly_essential'] = True

            is_reversible = numpy.asarray([
                self.design_space_model.reactions.get_by_id(i).reversibility
                for i in df.index
            ],
                                          dtype=bool)
            not_reversible = numpy.logical_not(is_reversible)

            df.loc[((df.lower_bound == -1000) &
                    (df.upper_bound == 1000) & is_reversible) |
                   ((df.lower_bound == 0) &
                    (df.upper_bound == 1000) & not_reversible) |
                   ((df.lower_bound == -1000) &
                    (df.upper_bound == 0) & not_reversible),
                   'free_flux'] = True

            df['reaction'] = df.index
            df['excluded'] = df['reaction'].isin(self.exclude)

            collection.append(df)


#        multi_index = [(key[0][1], key[1][1]) for key in solutions]
#        solutions_multi_index = pandas.concat(list(solutions.values()),
# axis=0, keys=multi_index)#
#        solutions_multi_index.index.set_names(['biomass', 'production',
# 'reaction'], inplace=True)
        total = pandas.concat(collection, ignore_index=True, copy=False)
        total.sort_values(['biomass', 'production', 'reaction'], inplace=True)
        total.index = total['reaction']
        return DifferentialFVAResult(total, self.envelope,
                                     self.reference_flux_ranges,
                                     self.reference_flux_dist)
Code Example #33
File: final.py Project: AbhishekM117/smart_tree
def calcualteAverageSimilarity(similarityDataFrame: DataFrame) -> Series:
    averageSimilarityList = [None] * len(similarityDataFrame)
    for index, row in similarityDataFrame.iterrows():
        averageSimilarityList[index] = row.mean()
    return Series(averageSimilarityList)
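The same row average can be computed without the explicit loop; a sketch assuming a purely numeric similarity DataFrame with a default RangeIndex (which is what the positional list assignment above relies on), with a hypothetical function name:

import pandas as pd

def calculate_average_similarity(similarity_df: pd.DataFrame) -> pd.Series:
    # row-wise mean over all columns
    return similarity_df.mean(axis=1).reset_index(drop=True)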
Code Example #34
def process_movie_keywords(session, data: pd.DataFrame):
    """ Attaches genre keywords to movie records """
    movie_title_index = {}
    movie_index = src.controller.movie.MovieLookupIndex(logger).query()
    genre_index = src.controller.fields.GenreIndexLookup(logger).query()
    keyword_index = src.controller.fields.PlotKeywordIndexLookup(
        logger).query()
    actor_index = src.controller.person.PersonIndexLookup(logger).query()

    print('Updating movie mappings')
    for i, record in data.iterrows():
        record_no = i + 1

        if record_no % 500 == 0:
            print('\tProcessing record #%s' % record_no)

        # Get searchable title+year of movie record
        movie_title = record['movie_title'].strip()
        movie_title_l = movie_title.lower()
        movie_year = record['title_year']

        if pd.isna(movie_title_l):
            continue

        if pd.isna(movie_year):
            movie_year = ''
        else:
            movie_year = str(movie_year)

        movie_record_index = (movie_title_l, movie_year)

        if movie_record_index in movie_title_index:
            continue
        else:
            # Mark movie by record number in dataframe
            movie_title_index[movie_title_l] = record_no

        # Get movies, keywords, and genre ids
        movie_pk = movie_index[movie_record_index]

        if pd.isna(record['genres']):
            genre_names = []
        else:
            genre_names = [
                name.strip().lower() for name in record['genres'].split('|')
            ]
        genre_pks = [genre_index[name] for name in genre_names]

        if pd.isna(record['plot_keywords']):
            keyword_names = []
        else:
            keyword_names = [
                name.strip().lower()
                for name in record['plot_keywords'].split('|')
            ]
        keyword_pks = [keyword_index[name] for name in keyword_names]

        # Get actor ids
        actor_names = []
        for actor_field in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
            if not pd.isna(record[actor_field]):
                actor_names.append(record[actor_field].strip().lower())
        actor_pks = [actor_index[name] for name in actor_names]

        # Add genres and keywords to movie
        src.controller.movie.AttachMovieGenre(logger=logger,
                                              session=session,
                                              commit_enabled=False).execute(
                                                  movie_pk=movie_pk,
                                                  genre_pks=genre_pks)
        src.controller.movie.AttachMoviePlotKeywords(
            logger=logger, session=session,
            commit_enabled=False).execute(movie_pk=movie_pk,
                                          keyword_pks=keyword_pks)

        # Add actors to movie
        src.controller.movie.AttachMovieActors(logger=logger,
                                               session=session,
                                               commit_enabled=False).execute(
                                                   movie_pk=movie_pk,
                                                   actor_pks=actor_pks)

    session.commit()
Code Example #35
class Iteration:
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=["C" + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_items(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, "_item_cache"):
            self.df._item_cache.clear()
        for name, col in self.df.items():
            pass

    def time_items_cached(self):
        for name, col in self.df.items():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
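
The benchmark class above times the different row-iteration APIs. As a quick standalone check (absolute timings depend on the machine; only the relative ordering matters), itertuples() is usually much faster than iterrows() because it avoids building a Series per row:

import timeit
import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(10_000, 10))

t_rows = timeit.timeit(lambda: [r for _, r in df.iterrows()], number=3)
t_tups = timeit.timeit(lambda: [r for r in df.itertuples(index=False, name=None)], number=3)
print(f'iterrows:   {t_rows:.3f} s')
print(f'itertuples: {t_tups:.3f} s')
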
Code example #36
def present(duration=120, eeg=None, save_fn=None):
    n_trials = 2010
    iti = 0.4
    soa = 0.3
    jitter = 0.2
    record_duration = np.float32(duration)
    markernames = [1, 2]

    # Setup trial list
    image_type = np.random.binomial(1, 0.5, n_trials)
    trials = DataFrame(
        dict(image_type=image_type, timestamp=np.zeros(n_trials)))

    def load_image(fn):
        return visual.ImageStim(win=mywin, image=fn)

    # Setup graphics
    mywin = visual.Window([1600, 900],
                          monitor='testMonitor',
                          units="deg",
                          fullscr=True)

    targets = list(map(load_image, glob(os.path.join(CAT_DOG,
                                                     'target-*.jpg'))))
    nontargets = list(
        map(load_image, glob(os.path.join(CAT_DOG, 'nontarget-*.jpg'))))
    stim = [nontargets, targets]

    # start the EEG stream, will delay 5 seconds to let signal settle
    if eeg:
        if save_fn is None:  # If no save_fn passed, generate a new unnamed save file
            save_fn = generate_save_fn(eeg.device_name, 'visual_p300',
                                       'unnamed')
            print(
                f'No path for a save file was passed to the experiment. Saving data to {save_fn}'
            )
        eeg.start(save_fn, duration=record_duration)

    # Show instructions
    show_instructions(duration=duration)

    # Iterate through the events
    start = time()
    for ii, trial in trials.iterrows():
        # Inter trial interval
        core.wait(iti + np.random.rand() * jitter)

        # Select and display image
        label = trials['image_type'].iloc[ii]
        image = choice(targets if label == 1 else nontargets)
        image.draw()

        # Push sample
        if eeg:
            timestamp = time()
            if eeg.backend == 'muselsl':
                marker = [markernames[label]]
            else:
                marker = markernames[label]
            eeg.push_sample(marker=marker, timestamp=timestamp)

        mywin.flip()

        # offset
        core.wait(soa)
        mywin.flip()
        if len(event.getKeys()) > 0 or (time() - start) > record_duration:
            break

        event.clearEvents()

    # Cleanup
    if eeg: eeg.stop()
    mywin.close()
Code example #37
def present(duration=365,
            eeg=None,
            save_fn=None,
            iti=0.,
            soa=1.0,
            jitter=0.,
            n_trials=180,
            cf1=1000,
            amf1=40):

    # Create markers stream outlet
    info = StreamInfo("Markers", "Markers", 1, 0, "int32", "myuidw43536")
    outlet = StreamOutlet(info)

    markernames = [1]
    start = time()

    # Set up trial parameters
    record_duration = np.float32(duration)

    # Set up trial list
    stim_freq = np.zeros((n_trials, ), dtype=int)
    trials = DataFrame(dict(stim_freq=stim_freq, timestamp=np.zeros(n_trials)))

    # Setup graphics
    mywin = visual.Window([1920, 1080],
                          monitor="testMonitor",
                          units="deg",
                          fullscr=True)
    fixation = visual.GratingStim(win=mywin,
                                  size=0.2,
                                  pos=[0, 0],
                                  sf=0,
                                  rgb=[1, 0, 0])
    fixation.setAutoDraw(True)

    # Generate stimuli
    am1 = generate_am_waveform(cf1, amf1, secs=soa, sample_rate=44100)

    aud1 = sound.Sound(am1)
    aud1.setVolume(0.8)

    auds = [aud1]

    mywin.flip()

    # Show the instructions screen
    show_instructions(10)

    # start the EEG stream
    if eeg:
        eeg.start(save_fn, duration=record_duration)

    for ii, trial in trials.iterrows():
        # Intertrial interval
        core.wait(iti + np.random.rand() * jitter)

        # Select stimulus frequency
        ind = trials["stim_freq"].iloc[ii]
        auds[ind].stop()
        auds[ind].play()

        # Push sample
        if eeg:
            timestamp = time()
            if eeg.backend == "muselsl":
                marker = [markernames[ind]]
                marker = list(map(int, marker))
            else:
                marker = markernames[ind]
            eeg.push_sample(marker=marker, timestamp=timestamp)

        # offset
        core.wait(soa)
        mywin.flip()
        if len(event.getKeys()) > 0:
            break
        if (time() - start) > record_duration:
            break

        event.clearEvents()

    # Cleanup
    if eeg:
        eeg.stop()

    mywin.close()
Code example #38
def get_analytical_parameter_table(
        hierarchical_candidate_ids: list, parameter_type: str,
        condition_id_to_index: Dict[str, int], measurement_df: pd.DataFrame,
        observable_ids, condition_map,
        no_preeq_condition_idx: int) -> List[Tuple[int, int, int]]:
    """Generate (scalingIdx, conditionIdx, observableIdx) table for all
    occurrences of the given parameter names.

    Parameters:
        hierarchical_candidate_ids: Ids of optimization parameters for
            hierarchical optimization. This table depends on ordering of
            this list.
        parameter_type:
            'observable' or 'noise'

    Returns:
        list of (scalingIdx, conditionIdx, observableIdx) tuples
    """

    # need list, not ndarray
    condition_map_list = [list(x) for x in condition_map]

    if parameter_type == 'observable':

        def _get_overrides():
            return split_parameter_replacement_list(row.observableParameters)
    elif parameter_type == 'noise':

        def _get_overrides():
            return split_parameter_replacement_list(row.noiseParameters)
    else:
        raise ValueError("parameter_type must be 'noise' or "
                         f"'observable', but got {parameter_type}")

    use = []
    for _, row in measurement_df.iterrows():
        overrides = _get_overrides()

        sim_cond_idx = \
            condition_id_to_index[row.simulationConditionId]
        preeq_cond_idx = no_preeq_condition_idx
        if not isnan(row.preequilibrationConditionId):
            preeq_cond_idx = condition_id_to_index[
                row.preequilibrationConditionId]

        for s in overrides:
            # print(s, parametersForHierarchical)
            try:
                candidate_idx = hierarchical_candidate_ids.index(s)
            except ValueError:
                continue  # current parameter not in list

            condition_idx = condition_map_list.index(
                [preeq_cond_idx, sim_cond_idx])
            observable_idx = observable_ids.index(row.observableId)
            tup = (candidate_idx, condition_idx, observable_idx)

            # Don't add a new line for each timepoint
            # We don't allow separate parameters for individual time-points
            # (Can be implemented via different observables)
            if tup not in use:
                use.append(tup)

    if not len(use):
        raise AssertionError("Candidates were: "
                             f"{hierarchical_candidate_ids} but nothing "
                             "usable found")

    return use
Code example #39
def df_to_hover_text(df: pd.DataFrame):
    return [row_to_hover_text(row) for _, row in df.iterrows()]
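
The helper above relies on a row_to_hover_text defined elsewhere in that project. A self-contained sketch of the same pattern, with a stand-in formatter and made-up column names:

import pandas as pd

def row_to_hover_text(row: pd.Series) -> str:
    # Stand-in for the project's real formatter
    return f"{row['county']}: {row['cases']} cases"

df = pd.DataFrame({'county': ['Alameda', 'Marin'], 'cases': [10, 3]})
print([row_to_hover_text(row) for _, row in df.iterrows()])
# ['Alameda: 10 cases', 'Marin: 3 cases']
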
Code example #40
    def test_unstack_nan_index(self):  # GH7466
        def cast(val):
            val_str = "" if val != val else val
            return f"{val_str:1}"

        def verify(df):
            mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
            rows, cols = df.notna().values.nonzero()
            for i, j in zip(rows, cols):
                left = sorted(df.iloc[i, j].split("."))
                right = mk_list(df.index[i]) + mk_list(df.columns[j])
                right = sorted(map(cast, right))
                assert left == right

        df = DataFrame(
            {
                "jim": ["a", "b", np.nan, "d"],
                "joe": ["w", "x", "y", "z"],
                "jolie": ["a.w", "b.x", " .y", "d.z"],
            }
        )

        left = df.set_index(["jim", "joe"]).unstack()["jolie"]
        right = df.set_index(["joe", "jim"]).unstack()["jolie"].T
        tm.assert_frame_equal(left, right)

        for idx in itertools.permutations(df.columns[:2]):
            mi = df.set_index(list(idx))
            for lev in range(2):
                udf = mi.unstack(level=lev)
                assert udf.notna().values.sum() == len(df)
                verify(udf["jolie"])

        df = DataFrame(
            {
                "1st": ["d"] * 3
                + [np.nan] * 5
                + ["a"] * 2
                + ["c"] * 3
                + ["e"] * 2
                + ["b"] * 5,
                "2nd": ["y"] * 2
                + ["w"] * 3
                + [np.nan] * 3
                + ["z"] * 4
                + [np.nan] * 3
                + ["x"] * 3
                + [np.nan] * 2,
                "3rd": [
                    67,
                    39,
                    53,
                    72,
                    57,
                    80,
                    31,
                    18,
                    11,
                    30,
                    59,
                    50,
                    62,
                    59,
                    76,
                    52,
                    14,
                    53,
                    60,
                    51,
                ],
            }
        )

        df["4th"], df["5th"] = (
            df.apply(lambda r: ".".join(map(cast, r)), axis=1),
            df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1),
        )

        for idx in itertools.permutations(["1st", "2nd", "3rd"]):
            mi = df.set_index(list(idx))
            for lev in range(3):
                udf = mi.unstack(level=lev)
                assert udf.notna().values.sum() == 2 * len(df)
                for col in ["4th", "5th"]:
                    verify(udf[col])

        # GH7403
        df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [
            [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7],
        ]
        vals = list(map(list, zip(*vals)))
        idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B")
        cols = MultiIndex(
            levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
        )

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
        df.iloc[2, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
        cols = MultiIndex(
            levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
        )
        idx = Index([np.nan, 0, 1, 2, 3], name="B")
        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        df = pd.DataFrame(
            {"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}
        )
        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
        cols = MultiIndex(
            levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
        )
        idx = Index([np.nan, 0, 1, 2, 3], name="B")
        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        # GH7401
        df = pd.DataFrame(
            {
                "A": list("aaaaabbbbb"),
                "B": (date_range("2012-01-01", periods=5).tolist() * 2),
                "C": np.arange(10),
            }
        )

        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack()

        vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
        idx = Index(["a", "b"], name="A")
        cols = MultiIndex(
            levels=[["C"], date_range("2012-01-01", periods=5)],
            codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
            names=[None, "B"],
        )

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        # GH4862
        vals = [
            ["Hg", np.nan, np.nan, 680585148],
            ["U", 0.0, np.nan, 680585148],
            ["Pb", 7.07e-06, np.nan, 680585148],
            ["Sn", 2.3614e-05, 0.0133, 680607017],
            ["Ag", 0.0, 0.0133, 680607017],
            ["Hg", -0.00015, 0.0133, 680607017],
        ]
        df = DataFrame(
            vals,
            columns=["agent", "change", "dosage", "s_id"],
            index=[17263, 17264, 17265, 17266, 17267, 17268],
        )

        left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack()

        vals = [
            [np.nan, np.nan, 7.07e-06, np.nan, 0.0],
            [0.0, -0.00015, np.nan, 2.3614e-05, np.nan],
        ]

        idx = MultiIndex(
            levels=[[680585148, 680607017], [0.0133]],
            codes=[[0, 1], [-1, 0]],
            names=["s_id", "dosage"],
        )

        cols = MultiIndex(
            levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]],
            codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
            names=[None, "agent"],
        )

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
        tm.assert_frame_equal(left.unstack(), right)

        # GH9497 - multiple unstack with nulls
        df = DataFrame(
            {
                "1st": [1, 2, 1, 2, 1, 2],
                "2nd": pd.date_range("2014-02-01", periods=6, freq="D"),
                "jim": 100 + np.arange(6),
                "joe": (np.random.randn(6) * 10).round(2),
            }
        )

        df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02")
        df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan
        df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan

        left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"])
        assert left.notna().values.sum() == 2 * len(df)

        for col in ["jim", "joe"]:
            for _, r in df.iterrows():
                key = r["1st"], (col, r["2nd"], r["3rd"])
                assert r[col] == left.loc[key]
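
The test above checks that unstack() keeps NaN labels in the index instead of dropping those rows. A small standalone illustration of the behaviour (plain pandas, nothing project-specific):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": list("aabb"), "B": [0, 1, 0, np.nan], "C": range(4)})
print(df.set_index(["A", "B"]).unstack(0))
# The missing "B" key survives as a NaN row label rather than being dropped.
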
Code example #41
File: caltrack.py Project: scuervo91/reservoirpy
def caltrack(df: pd.DataFrame,
             cali: (list, str) = None,
             bit: (list, str) = None,
             lims: (list) = None,
             cal_lim: list = [5, 20],
             dtick: bool = False,
             fill: bool = False,
             fontsize: int = 8,
             grid_numbers: list = [11, 51],
             steps: list = None,
             correlation: pd.DataFrame = None,
             ax=None,
             cal_kw: dict = {},
             corr_kw: dict = {},
             bit_kw: dict = {},
             depth_ref: str = 'md',
             cal_colormap: str = 'winter',
             bit_colormap: str = 'bone'):
    """caltrack [summary]

    Parameters
    ----------
    df : pd.DataFrame
        [description]
    cali : [type], optional
        [description], by default None
    bit : [type], optional
        [description], by default None
    lims : [type], optional
        [description], by default None
    cal_lim : list, optional
        [description], by default [5,20]
    dtick : bool, optional
        [description], by default False
    fill : bool, optional
        [description], by default False
    fontsize : int, optional
        [description], by default 8
    grid_numbers : list, optional
        [description], by default [11,51]
    steps : list, optional
        [description], by default None
    correlation : pd.DataFrame, optional
        [description], by default None
    ax : [type], optional
        [description], by default None
    cal_kw : dict, optional
        [description], by default {}
    corr_kw : dict, optional
        [description], by default {}
    bit_kw : dict, optional
        [description], by default {}
    depth_ref : str, optional
        [description], by default 'md'
    cal_colormap : str, optional
        [description], by default 'winter'
    bit_colormap : str, optional
        [description], by default 'bone'
    """

    assert isinstance(df, pd.DataFrame)
    assert depth_ref in ['md', 'tvd', 'tvdss'
                         ], "depth_ref can only be one of ['md','tvd','tvdss']"

    cal = ax or plt.gca()

    def_cal_kw = {'color': 'black', 'linestyle': '-', 'linewidth': 1}

    for (k, v) in def_cal_kw.items():
        if k not in cal_kw:
            cal_kw[k] = v

    def_bit_kw = {'color': 'darkred', 'linestyle': '--', 'linewidth': 2}
    for (k, v) in def_bit_kw.items():
        if k not in bit_kw:
            bit_kw[k] = v

    def_corr_kw = {'color': 'red', 'linestyle': '--', 'linewidth': 2}
    for (k, v) in def_corr_kw.items():
        if k not in corr_kw:
            corr_kw[k] = v

    #Set lims
    if lims == None:  #Depth Limits
        lims = [df.index.min(), df.index.max()]

    cal.set_ylim([lims[1], lims[0]])

    #Set the vertical grid spacing
    if steps is None:
        mayor_grid = np.linspace(lims[0], lims[1], grid_numbers[0])
        minor_grid = np.linspace(lims[0], lims[1], grid_numbers[1])
    else:
        mayor_grid = np.arange(lims[0], lims[1], steps[0])
        minor_grid = np.arange(lims[0], lims[1], steps[1])

    depth = df.index if depth_ref == 'md' else df[depth_ref]

    if cali is not None:
        if isinstance(cali, str):
            cal.plot(df[cali], depth, **cal_kw)  #Plotting
        elif isinstance(cali, list):
            cmap = mpl.cm.get_cmap(cal_colormap, len(cali))
            for i, c in enumerate(cali):
                cal_kw['color'] = cmap(i)
                cal.plot(df[c], depth, **cal_kw)

    if bit is not None:
        cal.plot(df[bit], depth, **bit_kw)

    cal.set_xlim(cal_lim)
    cal.set_xlabel("Caliper [in]")
    cal.set_xticks(np.linspace(cal_lim[0], cal_lim[1], 4))
    cal.set_xticklabels(
        np.round(np.linspace(cal_lim[0], cal_lim[1], 4), decimals=1))
    cal.xaxis.tick_top()
    cal.xaxis.set_label_position("top")
    cal.tick_params("both", labelsize=fontsize)
    cal.set_yticks(mayor_grid)
    cal.set_yticks(minor_grid, minor=True)
    if dtick == True:
        cal.set_yticklabels(mayor_grid)
    else:
        cal.set_yticklabels([])

    if fill == True:
        cal.fill_betweenx(depth,
                          df[cali],
                          df[bit],
                          where=(df[cali] > df[bit]),
                          color="orange")
        cal.fill_betweenx(depth,
                          df[cali],
                          df[bit],
                          where=(df[cali] < df[bit]),
                          color="gray")

    #Add Correlation Line
    if correlation is not None:
        cor_ann = corr_kw.pop('ann', False)
        for i in correlation.iterrows():
            cal.hlines(i[1]['depth'], 0, 1, **corr_kw)
            if cor_ann:
                try:
                    cal.annotate(f"{i[1]['depth']} - {i[1]['comment']} ",
                                 xy=(16 - 3, i[1]['depth'] - 1),
                                 xycoords='data',
                                 horizontalalignment='right',
                                 bbox={
                                     'boxstyle': 'roundtooth',
                                     'fc': '0.8'
                                 })
                except:
                    cal.annotate(f"{i[1]['depth']}",
                                 xy=(16 - 3, i[1]['depth'] - 1),
                                 xycoords='data',
                                 horizontalalignment='right',
                                 bbox={
                                     'boxstyle': 'roundtooth',
                                     'fc': '0.8'
                                 })
Code example #42
    def get_ef_dict(self, ef_data: pd.DataFrame):

        ef_data[["Slope", "Load"]] = ef_data[["Slope", "Load"]].fillna(0.0)
        return {(row["VehicleName"], row["Pollutant"], row["Slope"],
                 row["Load"]): row.to_dict()
                for _, row in ef_data.iterrows()}
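
A usage sketch for the dictionary built above, with a tiny made-up emission-factor table; only the column names are taken from the method, the values are invented:

import numpy as np
import pandas as pd

ef_data = pd.DataFrame({
    'VehicleName': ['PC petrol', 'PC petrol'],
    'Pollutant': ['NOx', 'CO'],
    'Slope': [0.0, np.nan],
    'Load': [np.nan, 0.5],
    'EF': [0.03, 0.6],
})

ef_data[['Slope', 'Load']] = ef_data[['Slope', 'Load']].fillna(0.0)
ef_dict = {(row['VehicleName'], row['Pollutant'], row['Slope'], row['Load']): row.to_dict()
           for _, row in ef_data.iterrows()}
print(ef_dict[('PC petrol', 'NOx', 0.0, 0.0)]['EF'])  # 0.03
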
Code example #43
def train_generator(months, overall, label_month, predict=False):
    datas = pd.read_csv("../references.csv", dtype='object')
    datas = datas.fillna(0)
    indexs = datas['U_I_overall_qty_' + overall].as_matrix().tolist()

    vps = []
    for i in indexs:
        if i == 0:
            continue
        tmp = i.split('-')
        # tmp[0] is the vipno, tmp[1] is the pluno
        vps.append([tmp[0], tmp[1]])
    vps = np.array(vps)
    feature_names = feature_name_generator(months, overall, predict)
    train_datas = DataFrame(np.zeros(shape=(len(vps), len(feature_names))),
                            columns=feature_names,
                            dtype='float')
    # tmp = DataFrame(vps, columns=['vipno', 'pluno'], dtype='object')
    # print(tmp)
    train_datas = pd.concat(
        [train_datas, DataFrame(vps, columns=['vipno', 'pluno'])], axis=1)

    # The different feature groups are stored in different formats, so handle them separately
    start = datetime.datetime.now()
    train_datas.set_index(['vipno', 'pluno'], inplace=True, drop=False)
    for f in feature_names[:8]:
        ds = datas[f].as_matrix().tolist()
        # count = 0
        for row in ds:
            if row == 0:
                continue
            tmp = row.split('-')
            # print(count)
            # count += 1
            train_datas.loc[(tmp[0], tmp[1]), f] = float(tmp[2])
    print(datetime.datetime.now() - start)
    print("***************")
    start = datetime.datetime.now()
    train_datas.set_index(['pluno'], inplace=True, drop=False)
    for f in feature_names[8:12]:
        # count = 0
        ds = datas[f].as_matrix().tolist()
        for row in ds:
            if row == 0:
                continue
            tmp = row.split('-')
            # print(count)
            # count += 1
            train_datas.loc[tmp[0], f] = float(tmp[1])
    print(datetime.datetime.now() - start)
    print("***************")
    start = datetime.datetime.now()
    for f in feature_names[24:28]:
        ds = datas[f].as_matrix().tolist()
        # count = 0
        for row in ds:
            if row == 0:
                continue
            tmp = row.split('-')
            # print(count)
            # count += 1
            train_datas.loc[tmp[0], f] = float(tmp[1])
    print(datetime.datetime.now() - start)
    print("***************")
    train_datas.set_index(['vipno'], inplace=True, drop=False)
    # print(train_datas.index)

    start = datetime.datetime.now()
    for f in feature_names[12:24]:
        ds = datas[f].as_matrix().tolist()
        # count = 0
        for row in ds:
            if row == 0:
                continue
            tmp = row.split('-')
            # print(count)
            # count += 1
            # print(tmp[1])
            train_datas.loc[tmp[0], f] = float(tmp[1])
    print(datetime.datetime.now() - start)
    print("***************")

    # months = ['02', '03', '04']
    start = datetime.datetime.now()
    train_datas.set_index(['vipno', 'pluno'], inplace=True, drop=False)
    for index, row in train_datas.iterrows():
        tmp = []
        for m in months:
            tmp.append(row['U_I_month_count_' + m])
        tmp.sort()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[28]] = np.array(tmp).mean()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[29]] = np.array(tmp).std()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[30]] = np.array(tmp).max()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[31]] = tmp[1]

        tmp = []
        for m in months:
            tmp.append(row['I_U_month_penetration_' + m])
        tmp.sort()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[32]] = np.array(tmp).mean()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[33]] = np.array(tmp).std()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[34]] = np.array(tmp).max()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[35]] = tmp[1]

        tmp = []
        for m in months:
            tmp.append(row['U_I_month_diversity_' + m])
        tmp.sort()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[36]] = np.array(tmp).mean()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[37]] = np.array(tmp).std()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[38]] = np.array(tmp).max()
        train_datas.loc[(row['vipno'], row['pluno']),
                        feature_names[39]] = tmp[1]

    print(datetime.datetime.now() - start)
    print("***************")

    if not predict:
        start = datetime.datetime.now()
        labels = datas['U_I_month_qty_' + label_month].as_matrix().tolist()
        indexs = train_datas.index
        for label in labels:
            # 0 stands for a missing value
            if label != 0:
                label = label.split('-')
                if (label[0], label[1]) in indexs:
                    train_datas.loc[(label[0], label[1]),
                                    'label'] = float(label[2])
        print(datetime.datetime.now() - start)
        print("***************")

    return train_datas
Code example #44
image_type = np.random.binomial(1, 0.5, n_trials)
trials = DataFrame(dict(image_type=image_type,
                        timestamp=np.zeros(n_trials)))


# Setup graphics
def load_image(filename):
    return visual.ImageStim(win=mywin, image=filename)


mywin = visual.Window([1920, 1080], monitor='testMonitor', units='deg',
                      fullscr=True)
faces = map(load_image, glob('stimulus_presentation/stim/face_house/faces/*_3.jpg'))
houses = map(load_image, glob('stimulus_presentation/stim/face_house/houses/*.3.jpg'))

for ii, trial in trials.iterrows():
    # Intertrial interval
    core.wait(iti + np.random.rand() * jitter)

    # Select and display image
    label = trials['image_type'].iloc[ii]
    image = choice(faces if label == 1 else houses)
    image.draw()

    # Send marker
    timestamp = local_clock()
    outlet.push_sample([markernames[label]], timestamp)
    mywin.flip()

    # offset
    core.wait(soa)
Code example #45
def plot_view_stock(df: pd.DataFrame, symbol: str, interval: str):
    """
    Plot the loaded stock dataframe
    Parameters
    ----------
    df: Dataframe
        Dataframe of prices and volumes
    symbol: str
        Symbol of ticker
    interval: str
        Stock data resolution for plotting purposes

    """
    df.sort_index(ascending=True, inplace=True)
    bar_colors = [
        "r" if x[1].Open < x[1].Close else "g" for x in df.iterrows()
    ]

    try:
        fig, ax = plt.subplots(
            2,
            1,
            gridspec_kw={"height_ratios": [3, 1]},
            figsize=plot_autoscale(),
            dpi=cfgPlot.PLOT_DPI,
        )
    except Exception as e:
        print(e)
        print(
            "Encountered an error trying to open a chart window. Check your X server configuration."
        )
        return

    # To make a readable volume plot, set the bar width equal to the data interval
    if interval == "1440min":
        bar_width = timedelta(days=1)
        title_string = "Daily"
    else:
        bar_width = timedelta(minutes=int(interval.split("m")[0]))
        title_string = f"{int(interval.split('m')[0])} min"

    ax[0].yaxis.tick_right()
    if "Adj Close" in df.columns:
        ax[0].plot(df.index, df["Adj Close"], c=cfgPlot.VIEW_COLOR)
    else:
        ax[0].plot(df.index, df["Close"], c=cfgPlot.VIEW_COLOR)
    ax[0].set_xlim(df.index[0], df.index[-1])
    ax[0].set_xticks([])
    ax[0].yaxis.set_label_position("right")
    ax[0].set_ylabel("Share Price ($)")
    ax[0].grid(axis="y", color="gainsboro", linestyle="-", linewidth=0.5)

    ax[0].spines["top"].set_visible(False)
    ax[0].spines["left"].set_visible(False)
    ax[1].bar(df.index,
              df.Volume / 1_000_000,
              color=bar_colors,
              alpha=0.8,
              width=bar_width)
    ax[1].set_xlim(df.index[0], df.index[-1])
    ax[1].yaxis.tick_right()
    ax[1].yaxis.set_label_position("right")
    ax[1].set_ylabel("Volume (1M)")
    ax[1].grid(axis="y", color="gainsboro", linestyle="-", linewidth=0.5)
    ax[1].spines["top"].set_visible(False)
    ax[1].spines["left"].set_visible(False)
    ax[1].set_xlabel("Time")
    fig.suptitle(
        symbol + " " + title_string,
        size=20,
        x=0.15,
        y=0.95,
        fontfamily="serif",
        fontstyle="italic",
    )
    if gtff.USE_ION:
        plt.ion()
    fig.tight_layout(pad=2)
    plt.setp(ax[1].get_xticklabels(), rotation=20, horizontalalignment="right")

    plt.show()
    print("")
Code example #46
def load_terms(df: DataFrame, term_creation_mode: str = 'ignore') -> int:
    """Creates Term objects from an input Pandas DataFrame and adds the newly created Terms to the database.
    
    Args:
        df (pandas.DataFrame): Input data. Contains columns:

            * 'Term Locale'
            * 'Term Category'
            * 'Term'
            * 'Case Sensitive'.

        term_creation_mode (str): A logical flag for handling duplicate Term conflicts.
            Can be 'add', 'ignore', or 'replace'. Defaults to 'ignore'.
            Only relevant if a Term already exists in the database.

            * 'add': adds the new Terms while keeping the existing Terms.
            * 'ignore': skips adding new Terms.
            * 'replace': deletes the old Term(s) and then adds the new Term.

    Returns:
        int: The number of Terms added to the database.

    Raises:
        ValueError: invalid `term_creation_mode` argument.
    """
    @unique
    class _TermCreationMode(Enum):
        ADD = 'add'
        IGNORE = 'ignore'
        REPLACE = 'replace'

        @classmethod
        def get_modes(cls, *args) -> list:
            def _ga(mode, *args) -> tuple:
                return tuple([getattr(mode, arg) for arg in args])

            if len(args) == 1:
                return [getattr(mode, *args) for mode in cls]
            elif len(args) == 0:
                args = ('name', 'value')
            return [_ga(mode, *args) for mode in cls]

    def _prepare_term(
            dataframe_row: Series,
            term_creation_mode: _TermCreationMode) -> Union[Term, None]:
        """Instantiates and returns a new Term objects based on input data.

        Args:
            dataframe_row (pandas.Series): Input data for Term creation.
            term_creation_mode (_TermCreationMode): A logical flag; determines return behavior.

        Returns:
            new_db_term (Term): A new Term object from the input row.
            None: if term_creation_mode is IGNORE.
        """
        term = dataframe_row['Term'].strip()
        qs_terms_of_this_term = Term.objects.filter(term=term)

        # instantiate a new Term
        new_db_term = Term(term=term,
                           source=dataframe_row['Term Category'],
                           definition_url=dataframe_row['Term Locale'])

        # handle term creation modes
        if qs_terms_of_this_term.exists():
            if term_creation_mode == _TermCreationMode.REPLACE:
                qs_terms_of_this_term.delete()
            elif term_creation_mode == _TermCreationMode.IGNORE:
                return None
        return new_db_term

    try:  # set term creation mode
        term_creation_mode = _TermCreationMode(term_creation_mode)
    except ValueError:  # raise custom argument error
        raise ValueError(
            f"Argument `term_creation_mode` must be one of: {_TermCreationMode.get_modes('values')}"
        )

    # preprocess DataFrame
    df.drop_duplicates(inplace=True)
    df.loc[df['Case Sensitive'] == False,
           'Term'] = df.loc[df['Case Sensitive'] == False, 'Term'].str.lower()
    df = df.drop_duplicates(subset='Term').dropna(subset=['Term'])

    # create new Terms. Filter: if _prepare_term() returns None, it is not added to this list
    new_db_terms = list(
        filter(None, (_prepare_term(row, term_creation_mode)
                      for _, row in df.iterrows())))

    # cache "global" term stems step - should be cached here via model manager
    Term.objects.bulk_create(new_db_terms)

    return len(new_db_terms)
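
A possible usage sketch for load_terms; it assumes a configured Django project in which the Term model above is importable, and the column values are invented:

import pandas as pd

terms_df = pd.DataFrame({
    'Term Locale': ['en-us', 'en-us'],
    'Term Category': ['legal', 'legal'],
    'Term': ['Force Majeure', 'force majeure'],
    'Case Sensitive': [False, False],
})

# The case-insensitive duplicates above collapse to a single Term;
# with mode 'ignore', Terms already in the database are skipped.
added = load_terms(terms_df, term_creation_mode='ignore')
print(f'{added} term(s) added')
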
Code example #47
def process_movie_records(session, data: pd.DataFrame):
    """ Adds list of movie records to database from data input """
    # Category lookup fields
    movie_color_lookup = src.controller.fields.MovieColorIndexLookup(
        logger).query()
    country_lookup = src.controller.fields.CountryIndexLookup(logger).query()
    language_lookup = src.controller.fields.LanguageIndexLookup(logger).query()
    rating_lookup = src.controller.fields.ContentRatingIndexLookup(
        logger).query()
    person_lookup = src.controller.person.PersonIndexLookup(logger).query()

    # Lookup of movie record number by title+year
    movie_title_index = {}
    print('Updating movie records')
    for i, record in data.iterrows():
        record_no = i + 1

        if record_no % 500 == 0:
            print('\tProcessing record #%s' % record_no)

        # Get searchable title+year of movie record
        if pd.isna(record['movie_title']):
            logger.warning('Movie with no title on record #%s' % record_no)
            continue

        movie_title = record['movie_title'].strip()
        movie_title_l = movie_title.lower()
        movie_year = record['title_year']

        if pd.isna(movie_year):
            movie_year = ''
        else:
            movie_year = str(movie_year)

        movie_record_index = (movie_title_l, movie_year)

        if movie_record_index in movie_title_index:
            logger.warning(
                'Duplicate movie "%s" (#%s, #%s)' %
                (movie_title, movie_title_index[movie_record_index], record_no))
            continue
        else:
            # Mark movie by record number in dataframe, keyed by (title, year)
            movie_title_index[movie_record_index] = record_no

        # Get category ids by name
        movie_color_pk = lookup_category_id(record['color'],
                                            movie_color_lookup)
        country_pk = lookup_category_id(record['country'], country_lookup)
        language_pk = lookup_category_id(record['language'], language_lookup)
        rating_pk = lookup_category_id(record['content_rating'], rating_lookup)
        director_pk = lookup_category_id(record['director_name'],
                                         person_lookup)

        # Get movie's imdb id
        imdb_link = record['movie_imdb_link']
        if imdb_link:
            search_results = IMDB_URL_ID_RE.search(imdb_link)
            if search_results is None:
                imdb_id = None
            else:
                imdb_id = search_results.group(1)
        else:
            imdb_id = None

        # Get movie's numerical stats
        aspect_ratio = src.utils.nan_to_none(record['aspect_ratio'])
        budget = src.utils.nan_to_none(record['budget'])
        cast_likes = src.utils.nan_to_none(record['cast_total_facebook_likes'])
        duration = src.utils.nan_to_none(record['duration'])
        facenum = src.utils.nan_to_none(record['facenumber_in_poster'])
        gross = src.utils.nan_to_none(record['gross'])
        imdb_score = src.utils.nan_to_none(record['imdb_score'])
        facebook_likes = src.utils.nan_to_none(record['movie_facebook_likes'])
        num_critic = src.utils.nan_to_none(record['num_critic_for_reviews'])
        num_user = src.utils.nan_to_none(record['num_user_for_reviews'])
        num_voted = src.utils.nan_to_none(record['num_voted_users'])

        # Add movie record. Manually take over the session and commit at the end
        src.controller.movie.AddMovie(logger=logger,
                                      session=session,
                                      commit_enabled=False).execute(
                                          movie_title=movie_title,
                                          title_year=movie_year,
                                          content_rating_pk=rating_pk,
                                          color_pk=movie_color_pk,
                                          country_pk=country_pk,
                                          director_pk=director_pk,
                                          language_pk=language_pk,
                                          aspect_ratio=aspect_ratio,
                                          budget=budget,
                                          cast_facebook_likes=cast_likes,
                                          duration=duration,
                                          facenum=facenum,
                                          gross=gross,
                                          imdb_id=imdb_id,
                                          imdb_score=imdb_score,
                                          movie_facebook_likes=facebook_likes,
                                          num_critic_for_reviews=num_critic,
                                          num_user_for_reviews=num_user,
                                          num_voted_users=num_voted)

    session.commit()
Code example #48
def df_to_rows(df: pd.DataFrame):
    return [
        ui.table_row(str(row['ID']), [str(row[name]) for name in column_names])
        for i, row in df.iterrows()
    ]
Code example #49
def present(duration=120, eeg=None, save_fn=None):
    n_trials = 2010
    iti = 0.5
    soa = 3.0
    jitter = 0.2
    record_duration = np.float32(duration)
    markernames = [1, 2]

    # Setup trial list
    stim_freq = np.random.binomial(1, 0.5, n_trials)
    trials = DataFrame(dict(stim_freq=stim_freq, timestamp=np.zeros(n_trials)))

    # Set up graphics
    mywin = visual.Window([1600, 900],
                          monitor='testMonitor',
                          units="deg",
                          fullscr=True)
    grating = visual.GratingStim(win=mywin, mask='circle', size=80, sf=0.2)
    grating_neg = visual.GratingStim(win=mywin,
                                     mask='circle',
                                     size=80,
                                     sf=0.2,
                                     phase=0.5)
    fixation = visual.GratingStim(win=mywin,
                                  size=0.2,
                                  pos=[0, 0],
                                  sf=0.2,
                                  color=[1, 0, 0],
                                  autoDraw=True)

    # Generate the possible ssvep frequencies based on monitor refresh rate
    def get_possible_ssvep_freqs(frame_rate, stim_type='single'):
        """Get possible SSVEP stimulation frequencies.
        Utility function that returns the possible SSVEP stimulation
        frequencies and on/off pattern based on screen refresh rate.
        Args:
            frame_rate (float): screen frame rate, in Hz
        Keyword Args:
            stim_type (str): type of stimulation
                'single'-> single graphic stimulus (the displayed object
                    appears and disappears in the background.)
                'reversal' -> pattern reversal stimulus (the displayed object
                    appears and is replaced by its opposite.)
        Returns:
            (dict): keys are stimulation frequencies (in Hz), and values are
                lists of tuples, where each tuple is the number of (on, off)
                periods of one stimulation cycle
        For more info on stimulation patterns, see Section 2 of:
            Danhua Zhu, Jordi Bieger, Gary Garcia Molina, and Ronald M. Aarts,
            "A Survey of Stimulation Methods Used in SSVEP-Based BCIs,"
            Computational Intelligence and Neuroscience, vol. 2010, 12 pages,
            2010.
        """

        max_period_nb = int(frame_rate / 6)
        periods = np.arange(max_period_nb) + 1

        if stim_type == 'single':
            freqs = dict()
            for p1 in periods:
                for p2 in periods:
                    f = frame_rate / (p1 + p2)
                    try:
                        freqs[f].append((p1, p2))
                    except:
                        freqs[f] = [(p1, p2)]
        elif stim_type == 'reversal':
            freqs = {frame_rate / p: [(p, p)] for p in periods[::-1]}

        return freqs

    def init_flicker_stim(frame_rate, cycle, soa):
        """Initialize flickering stimulus.
        Get parameters for a flickering stimulus, based on the screen refresh
        rate and the desired stimulation cycle.
        Args:
            frame_rate (float): screen frame rate, in Hz
            cycle (tuple or int): if tuple (on, off), represents the number of
                'on' periods and 'off' periods in one flickering cycle. This
                supposes a "single graphic" stimulus, where the displayed object
                appears and disappears in the background.
                If int, represents the number of total periods in one cycle.
                This supposes a "pattern reversal" stimulus, where the
                displayed object appears and is replaced by its opposite.
            soa (float): stimulus duration, in s
        Returns:
            (dict): dictionary with keys
                'cycle' -> tuple of (on, off) periods in a cycle
                'freq' -> stimulus frequency
                'n_cycles' -> number of cycles in one stimulus trial
        """
        if isinstance(cycle, tuple):
            stim_freq = frame_rate / sum(cycle)
            n_cycles = int(soa * stim_freq)
        else:
            stim_freq = frame_rate / cycle
            cycle = (cycle, cycle)
            n_cycles = int(soa * stim_freq) / 2

        return {'cycle': cycle, 'freq': stim_freq, 'n_cycles': n_cycles}

    # Set up stimuli
    frame_rate = np.round(mywin.getActualFrameRate())  # Frame rate, in Hz
    freqs = get_possible_ssvep_freqs(frame_rate, stim_type='reversal')
    stim_patterns = [
        init_flicker_stim(frame_rate, 2, soa),
        init_flicker_stim(frame_rate, 3, soa)
    ]

    print(('Flickering frequencies (Hz): {}\n'.format(
        [stim_patterns[0]['freq'], stim_patterns[1]['freq']])))

    # start the EEG stream, will delay 5 seconds to let signal settle
    if eeg:
        if save_fn is None:  # If no save_fn passed, generate a new unnamed save file
            save_fn = generate_save_fn(eeg.device_name, 'visual_ssvep',
                                       'unnamed')
            print(
                f'No path for a save file was passed to the experiment. Saving data to {save_fn}'
            )
        eeg.start(save_fn, duration=record_duration)

    # Iterate through trials
    start = time()
    for ii, trial in trials.iterrows():
        # Intertrial interval
        core.wait(iti + np.random.rand() * jitter)

        # Select stimulus frequency
        ind = trials['stim_freq'].iloc[ii]

        # Push sample
        if eeg:
            timestamp = time()
            if eeg.backend == 'muselsl':
                marker = [markernames[ind]]
            else:
                marker = markernames[ind]
            eeg.push_sample(marker=marker, timestamp=timestamp)

        # Present flickering stim
        for _ in range(int(stim_patterns[ind]['n_cycles'])):
            grating.setAutoDraw(True)
            for _ in range(int(stim_patterns[ind]['cycle'][0])):
                mywin.flip()
            grating.setAutoDraw(False)
            grating_neg.setAutoDraw(True)
            for _ in range(stim_patterns[ind]['cycle'][1]):
                mywin.flip()
            grating_neg.setAutoDraw(False)

        # offset
        mywin.flip()
        if len(event.getKeys()) > 0 or (time() - start) > record_duration:
            break
        event.clearEvents()

    # Cleanup
    if eeg: eeg.stop()
    mywin.close()
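
The flicker arithmetic inside init_flicker_stim is easy to check by hand. A minimal sketch, assuming a 60 Hz monitor and the soa = 3.0 used above (an integer cycle means pattern reversal, so freq = frame_rate / cycle):

frame_rate, soa = 60.0, 3.0
for cycle in (2, 3):
    stim_freq = frame_rate / cycle
    n_cycles = int(soa * stim_freq) / 2
    print(cycle, stim_freq, n_cycles)
# 2 30.0 45.0  -> 30 Hz reversal, 45 on/off cycles per 3 s trial
# 3 20.0 30.0  -> 20 Hz reversal, 30 cycles per trial
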
Code example #50
File: portfolio.py Project: ywuywu/ml_monorepo
def gen_portfolio(model,
                  system,
                  group,
                  tframe,
                  startcap=100000,
                  posby='close'):
    r"""Create a portfolio from a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model with specifications.
    system : str
        Name of the system.
    group : alphapy.Group
        The group of instruments in the portfolio.
    tframe : pandas.DataFrame
        The input trade list from running the system.
    startcap : float
        Starting capital.
    posby : str
        The position sizing column in the price dataframe.

    Returns
    -------
    p : alphapy.Portfolio
        The generated portfolio.

    Raises
    ------
    MemoryError
        Could not allocate Portfolio.

    Notes
    -----

    This function also generates the files required for analysis
    by the *pyfolio* package:

    * Returns File
    * Positions File
    * Transactions File

    """

    logger.info("Creating Portfolio for System %s", system)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Create the portfolio.

    gname = group.name
    gspace = group.space
    gmembers = group.members
    ff = 1.0 / len(gmembers)

    p = Portfolio(gname,
                  system,
                  gspace,
                  startcap=startcap,
                  posby=posby,
                  restricted=False,
                  fixedfrac=ff)
    if not p:
        raise MemoryError("Could not allocate Portfolio")

    # Build pyfolio data from the trades frame.

    start = tframe.index[0]
    end = tframe.index[-1]
    trange = np.unique(
        tframe.index.map(lambda x: x.date().strftime('%Y-%m-%d'))).tolist()
    drange = date_range(start,
                        end).map(lambda x: x.date().strftime('%Y-%m-%d'))

    # Initialize return, position, and transaction data.

    rs = []
    pcols = list(gmembers)
    pcols.extend(['cash'])
    pf = DataFrame(index=drange, columns=pcols).fillna(0.0)
    ts = []

    # Iterate through the date range, updating the portfolio.
    for d in drange:
        # process today's trades
        if d in trange:
            trades = tframe.ix[d]
            if isinstance(trades, Series):
                trades = DataFrame(trades).transpose()
            for t in trades.iterrows():
                tdate = t[0]
                row = t[1]
                tsize = exec_trade(p, row['name'], row['order'],
                                   row['quantity'], row['price'], tdate)
                if tsize != 0:
                    ts.append((d, [tsize, row['price'], row['name']]))
                else:
                    logger.info("Trade could not be executed for %s",
                                row['name'])
        # iterate through current positions
        positions = p.positions
        pfrow = pf.ix[d]
        for key in positions:
            pos = positions[key]
            if pos.quantity > 0:
                value = pos.value
            else:
                value = -pos.value
            pfrow[pos.name] = value
        pfrow['cash'] = p.cash
        # update the portfolio returns
        p = valuate_portfolio(p, d)
        rs.append((d, [p.netreturn]))

    # Create systems directory path

    system_dir = SSEP.join([directory, 'systems'])

    # Create and record the returns frame for this system.

    logger.info("Recording Returns Frame")
    rspace = Space(system, 'returns', gspace.fractal)
    rf = DataFrame.from_items(rs, orient='index', columns=['return'])
    rfname = frame_name(gname, rspace)
    write_frame(rf,
                system_dir,
                rfname,
                extension,
                separator,
                index=True,
                index_label='date')
    del rspace

    # Record the positions frame for this system.

    logger.info("Recording Positions Frame")
    pspace = Space(system, 'positions', gspace.fractal)
    pfname = frame_name(gname, pspace)
    write_frame(pf,
                system_dir,
                pfname,
                extension,
                separator,
                index=True,
                index_label='date')
    del pspace

    # Create and record the transactions frame for this system.

    logger.info("Recording Transactions Frame")
    tspace = Space(system, 'transactions', gspace.fractal)
    tf = DataFrame.from_items(ts,
                              orient='index',
                              columns=['amount', 'price', 'symbol'])
    tfname = frame_name(gname, tspace)
    write_frame(tf,
                system_dir,
                tfname,
                extension,
                separator,
                index=True,
                index_label='date')
    del tspace

    # Return the portfolio.
    return p
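
gen_portfolio uses DataFrame.from_items and the .ix indexer, both removed in pandas 1.0. A hedged sketch of a modern equivalent for the returns frame only (rs is assumed to be the list of (date, [netreturn]) tuples built in the loop above; the values here are illustrative):

import pandas as pd

rs = [('2021-01-04', [0.012]), ('2021-01-05', [-0.004])]
rf = pd.DataFrame.from_dict(dict(rs), orient='index', columns=['return'])
print(rf)
# .ix lookups such as tframe.ix[d] and pf.ix[d] become tframe.loc[d] / pf.loc[d].
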
Code example #51
File: utils.py Project: fossabot/eland
def pandas_to_eland(
    pd_df: pd.DataFrame,
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
    es_dest_index: str,
    es_if_exists: str = "fail",
    es_refresh: bool = False,
    es_dropna: bool = False,
    es_type_overrides: Optional[Mapping[str, str]] = None,
    chunksize: Optional[int] = None,
    use_pandas_index_for_es_ids: bool = True,
) -> DataFrame:
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
    Modifies the elasticsearch destination index

    Parameters
    ----------
    es_client: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance
    es_dest_index: str
        Name of Elasticsearch index to be appended to
    es_if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the index already exists.

        - fail: Raise a ValueError.
        - replace: Delete the index before inserting new values.
        - append: Insert new values to the existing index. Create if does not exist.
    es_refresh: bool, default 'False'
        Refresh es_dest_index after bulk index
    es_dropna: bool, default 'False'
        * True: Remove missing values (see pandas.Series.dropna)
        * False: Include missing values - may cause bulk to fail
    es_type_overrides: dict, default None
        Dict of field_name: es_data_type that overrides default es data types
    chunksize: int, default None
        Number of pandas.DataFrame rows to read before bulk index into Elasticsearch
    use_pandas_index_for_es_ids: bool, default 'True'
        * True: pandas.DataFrame.index fields will be used to populate Elasticsearch '_id' fields.
        * False: Ignore pandas.DataFrame.index when indexing into Elasticsearch

    Returns
    -------
    eland.Dataframe
        eland.DataFrame referencing data in destination_index

    Examples
    --------

    >>> pd_df = pd.DataFrame(data={'A': 3.141,
    ...                            'B': 1,
    ...                            'C': 'foo',
    ...                            'D': pd.Timestamp('20190102'),
    ...                            'E': [1.0, 2.0, 3.0],
    ...                            'F': False,
    ...                            'G': [1, 2, 3],
    ...                            'H': 'Long text - to be indexed as es type text'},
    ...                      index=['0', '1', '2'])
    >>> type(pd_df)
    <class 'pandas.core.frame.DataFrame'>
    >>> pd_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> pd_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
    Overwrite existing Elasticsearch index if it exists `if_exists="replace"`, and sync index so it is
    readable on return `refresh=True`


    >>> ed_df = ed.pandas_to_eland(pd_df,
    ...                            'localhost',
    ...                            'pandas_to_eland',
    ...                            es_if_exists="replace",
    ...                            es_refresh=True,
    ...                            es_type_overrides={'H':'text'}) # index field 'H' as text not keyword
    >>> type(ed_df)
    <class 'eland.dataframe.DataFrame'>
    >>> ed_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> ed_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    See Also
    --------
    eland.read_es: Create an eland.Dataframe from an Elasticsearch index
    eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
    """
    if chunksize is None:
        chunksize = DEFAULT_CHUNK_SIZE

    mapping = FieldMappings._generate_es_mappings(pd_df, es_type_overrides)
    es_client = ensure_es_client(es_client)

    # If table exists, check if_exists parameter
    if es_client.indices.exists(index=es_dest_index):
        if es_if_exists == "fail":
            raise ValueError(
                f"Could not create the index [{es_dest_index}] because it "
                f"already exists. "
                f"Change the if_exists parameter to "
                f"'append' or 'replace' data.")
        elif es_if_exists == "replace":
            es_client.indices.delete(index=es_dest_index)
            es_client.indices.create(index=es_dest_index, body=mapping)
        # elif es_if_exists == "append":
        # TODO: validate that the existing mapping is compatible
    else:
        es_client.indices.create(index=es_dest_index, body=mapping)

    # Now add data
    actions = []
    n = 0
    for idx, row_values in pd_df.iterrows():
        if es_dropna:
            values = row_values.dropna().to_dict()
        else:
            values = row_values.to_dict()

        if use_pandas_index_for_es_ids:
            # Use the pandas index value as the Elasticsearch '_id'
            # (string ids keep results repeatable across runs)
            action = {
                "_index": es_dest_index,
                "_source": values,
                "_id": str(idx)
            }
        else:
            action = {"_index": es_dest_index, "_source": values}

        actions.append(action)

        n = n + 1

        if n % chunksize == 0:
            bulk(client=es_client, actions=actions, refresh=es_refresh)
            actions = []

    bulk(client=es_client, actions=actions, refresh=es_refresh)
    return DataFrame(es_client, es_dest_index)
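
Reading the data back is symmetric; below is a minimal round-trip sketch, assuming the `pandas_to_eland` index created above and a reachable cluster on localhost (the connection details are an assumption, not part of the original snippet).

import eland as ed

# Lazy view over the Elasticsearch index created above (assumed reachable cluster).
ed_df = ed.DataFrame('localhost', 'pandas_to_eland')

# Materialize the index back into an in-memory pandas.DataFrame.
pd_back = ed.eland_to_pandas(ed_df)
print(pd_back.dtypes)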
Code example #52
0
File: scatter.py Project: czbiohub/dotblotr
def plot_hit_grid(
    hit_table: pd.DataFrame,
    results_table: pd.DataFrame,
    sort_by: Union[str, List[str]] = 'n_hits',
    x_label: str = 'dot_name',
    cmap: str = 'inferno',
):
    """ Plot the hits

    Parameters
    ----------
    hit_table : pd.DataFrame
        the hit table output from calc_hit_counts().
        Note that `calc_hit_counts()` returns a list of hit tables,
        so index into the hit table you wish to plot.
    results_table : pd.DataFrame
        the results table output from process_dir() from which to get the strip info
    sort_by : str or list[str]
        the hit_table column(s) to sort the dots (x axis) by.
        ['n_hits', 'dot_name'] sorts by number of hits and then by dot name.
    x_label : str
        the results table column to use for the name of the dot on the x axis
        (e.g., dot_name for the name of the dot)
    cmap : str
        name of the colormap used to colour the labels by number of hits.
        The default value is 'inferno'.
        See the matplotlib documentation for details:
        https://matplotlib.org/3.1.1/tutorials/colors/colormaps.html
    """

    hit_table.sort_values(by=sort_by, axis=0, inplace=True, ascending=False)

    unique_strip_ids = results_table.strip_id.unique()
    strip_ids = {strip_id: i for i, strip_id in enumerate(unique_strip_ids)}

    names = []
    strip_id_indices = []
    counts = []

    for i, row in hit_table.iterrows():
        name = row['dot_name']
        count = row['n_hits']
        dot_table = results_table.loc[results_table['dot_name'] == name]

        for j, dot_row in dot_table.iterrows():
            if dot_row['pos_hit']:
                #names.append(dot_row[x_label])
                names.append(dot_row['dot_name'])
                strip_id_indices.append(dot_row['strip_id'])
                counts.append(count)

    f, ax = plt.subplots(figsize=(50, 2))
    sc = ax.scatter(names, strip_id_indices, c=counts, cmap=cmap)

    ax.tick_params(axis='x', labelsize=5, rotation=90)
    ax.set_xlabel('spot name')
    ax.set_ylabel('strip id')

    plt.draw()

    norm = Normalize(vmin=1, vmax=hit_table['n_hits'].max())
    # for t in ax.get_xticklabels():
    #     tick_name = t.get_text()
    #     dot_name = results_table.loc[results_table[x_label] == tick_name].dot_name.values[0]
    #     n_hits = hit_table.loc[hit_table['dot_name'] == dot_name].n_hits.values[0]
    #     c = sc.cmap(norm(n_hits))
    #     t.set_color(c)
    label_names = []
    for t in ax.get_xticklabels():
        dot_name = t.get_text()
        label_names.append(dot_name)
        n_hits = hit_table.loc[hit_table['dot_name'] ==
                               dot_name].n_hits.values[0]
        c = sc.cmap(norm(n_hits))
        t.set_color(c)

    new_labels = [
        results_table.loc[results_table['dot_name'] == n][x_label].values[0]
        for n in label_names
    ]
    ax.set_xticklabels(new_labels)
    plt.draw()

    return f, ax
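
A hedged usage sketch follows; the `process_dir` / `calc_hit_counts` call shapes are assumptions inferred from the docstring above and are not verified against dotblotr.

import matplotlib.pyplot as plt

# Hypothetical inputs: per-dot results from process_dir() and the list of
# hit tables from calc_hit_counts(), as described in the docstring.
results_table = process_dir('blots/')
hit_tables = calc_hit_counts(results_table)

f, ax = plot_hit_grid(hit_tables[0], results_table,
                      sort_by=['n_hits', 'dot_name'])
f.savefig('hit_grid.png', dpi=300, bbox_inches='tight')
plt.close(f)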
Code example #53
0
File: plots.py Project: koukyo1994/streamlit-audio
def specshow(y: np.ndarray,
             sr: int,
             y_processed=None,
             tp: pd.DataFrame = None,
             fp: pd.DataFrame = None):
    plot_spectrogram = st.checkbox("Spectrogram plot")
    if plot_spectrogram:
        st.sidebar.markdown("#### Spectrogram plot settings")
        start_second = st.sidebar.number_input("start second",
                                               min_value=0,
                                               max_value=len(y) // sr,
                                               value=0,
                                               step=1,
                                               key="specshow_start")
        end_second = st.sidebar.number_input("end second",
                                             min_value=0,
                                             max_value=len(y) // sr,
                                             value=len(y) // sr,
                                             step=1,
                                             key="specshow_end")
        start_index = start_second * sr
        if end_second == len(y) // sr:
            end_index = len(y)
        else:
            end_index = end_second * sr
        y_plot = y[start_index:end_index]
        if y_processed is not None:
            y_plot_processed = y_processed[start_index:end_index]

        st.sidebar.markdown("##### (Mel)spectrogram parameters")
        mel = st.sidebar.checkbox("Mel scale", value=True)

        n_fft = st.sidebar.number_input("n_fft",
                                        min_value=64,
                                        max_value=8192,
                                        value=1024,
                                        step=64)
        hop_length = st.sidebar.number_input("hop_length",
                                             min_value=1,
                                             max_value=2048,
                                             value=320,
                                             step=10)
        if mel:
            n_mels = st.sidebar.number_input("n_mels",
                                             min_value=1,
                                             max_value=512,
                                             value=64,
                                             step=16)
            fmin = st.sidebar.number_input("fmin",
                                           min_value=1,
                                           max_value=8192,
                                           value=20,
                                           step=100)
            fmax = st.sidebar.number_input("fmax",
                                           min_value=4000,
                                           max_value=44100,
                                           value=14000,
                                           step=100)
        log = st.sidebar.checkbox("apply log", value=True)

        if mel:
            melspec_params = {
                "n_fft": n_fft,
                "hop_length": hop_length,
                "n_mels": n_mels,
                "fmin": fmin,
                "fmax": fmax,
                "sr": sr
            }
        else:
            spec_params = {"n_fft": n_fft, "hop_length": hop_length}

        if st.button("Show melspectrogram"):
            with st.spinner("Calculating melspectrogram"):
                if mel:
                    spec = melspectrogram(y_plot, melspec_params, log)
                else:
                    spec = spectrogram(y_plot, spec_params, log)
                if y_processed is not None:
                    if mel:
                        spec_processed = melspectrogram(
                            y_plot_processed, melspec_params, log)
                    else:
                        spec_processed = spectrogram(y_plot_processed,
                                                     spec_params, log)

            height, width = spec.shape
            st.write(f"{height} x {width} matrix")
            if y_processed is not None:
                with st.spinner("Plotting"):
                    fig = plt.figure(figsize=(12, 8))
                    ax1 = fig.add_subplot(2, 1, 1)
                    if mel:
                        display.specshow(spec,
                                         sr=sr,
                                         hop_length=hop_length,
                                         x_axis="time",
                                         y_axis="mel",
                                         fmin=fmin,
                                         fmax=fmax,
                                         ax=ax1)
                    else:
                        display.specshow(spec,
                                         sr=sr,
                                         hop_length=hop_length,
                                         x_axis="time",
                                         y_axis="linear",
                                         ax=ax1)

                    ax2 = fig.add_subplot(2, 1, 2)
                    if mel:
                        display.specshow(spec_processed,
                                         sr=sr,
                                         hop_length=hop_length,
                                         x_axis="time",
                                         y_axis="mel",
                                         fmin=fmin,
                                         fmax=fmax,
                                         ax=ax2)
                    else:
                        display.specshow(spec_processed,
                                         sr=sr,
                                         hop_length=hop_length,
                                         x_axis="time",
                                         y_axis="linear",
                                         ax=ax2)
            else:
                with st.spinner("Plotting"):
                    fig = plt.figure(figsize=(12, 4))
                    ax = plt.axes()
                    if mel:
                        display.specshow(spec,
                                         sr=sr,
                                         hop_length=hop_length,
                                         x_axis="time",
                                         y_axis="mel",
                                         fmin=fmin,
                                         fmax=fmax)
                        plt.colorbar()
                    else:
                        display.specshow(spec,
                                         sr=sr,
                                         hop_length=hop_length,
                                         x_axis="time",
                                         y_axis="linear")
                        plt.colorbar()
                    if tp is not None and len(tp) > 0:
                        for _, row in tp.iterrows():
                            rect = patches.Rectangle(
                                (row["t_min"], row["f_min"]),
                                row["t_max"] - row["t_min"],
                                row["f_max"] - row["f_min"],
                                linewidth=1,
                                edgecolor="g",
                                facecolor="g",
                                alpha=0.5,
                                label="tp")
                            ax.add_patch(rect)
                    if fp is not None and len(fp) > 0:
                        for _, row in fp.iterrows():
                            rect = patches.Rectangle(
                                (row["t_min"], row["f_min"]),
                                row["t_max"] - row["t_min"],
                                row["f_max"] - row["f_min"],
                                linewidth=1,
                                edgecolor="r",
                                facecolor="r",
                                alpha=0.5,
                                label="fp")
                            ax.add_patch(rect)
            st.pyplot(fig)
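
The `melspectrogram` and `spectrogram` helpers called above are not shown in this snippet; here is a plausible sketch of them using librosa (an assumption about their behaviour, not the original streamlit-audio implementation).

import librosa
import numpy as np

def melspectrogram(y: np.ndarray, params: dict, log: bool = True) -> np.ndarray:
    # params carries sr, n_fft, hop_length, n_mels, fmin and fmax as built above.
    spec = librosa.feature.melspectrogram(y=y, **params)
    return librosa.power_to_db(spec, ref=np.max) if log else spec

def spectrogram(y: np.ndarray, params: dict, log: bool = True) -> np.ndarray:
    # params carries n_fft and hop_length as built above.
    spec = np.abs(librosa.stft(y, **params)) ** 2
    return librosa.power_to_db(spec, ref=np.max) if log else spec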
Code example #54
0
File: ti_provider_base.py Project: yushu-liu/msticpy
def _(data: pd.DataFrame, obs_col: str, ioc_type_col: Optional[str] = None):
    for _, row in data.iterrows():
        if ioc_type_col is None:
            yield row[obs_col], TIProvider.resolve_ioc_type(row[obs_col])
        else:
            yield row[obs_col], row[ioc_type_col]
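
A brief usage sketch of the generator above; the DataFrame columns here are illustrative, not taken from msticpy.

import pandas as pd

# Hypothetical observables table; obs_col / ioc_type_col name the columns to read.
iocs = pd.DataFrame({'Ioc': ['1.2.3.4', 'evil.example.com'],
                     'IocType': ['ipv4', 'dns']})

for ioc, ioc_type in _(iocs, obs_col='Ioc', ioc_type_col='IocType'):
    print(ioc, ioc_type)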
Code example #55
0
def create_df(dataframe: pd.DataFrame) -> pd.DataFrame:
    # get lengths of signals for each sample
    lengths = []
    width = dataframe.shape[1]

    for row in dataframe.index.tolist():
        temp_width = width
        for item in dataframe.loc[row][::-1]:
            if not pd.isna(item) and isinstance(item, float):
                temp_width -= 1
                break

            temp_width -= 1

        lengths.append(temp_width)

    """
    README
    
    For the following features we measured: [mean, median, 5 % percentile, 95 % percentile, standard deviation]
    R-peak location were retrieved by nk.ecg_peaks
    Q-peak and S-location were retrieved by nk.ecg_delineate
    
    ?_ampl_*        ?-Peak amplitude
    ?_nr_peaks      number of ?-Peaks
    ?_diff_*        Interval between ?-Peaks
    QRS_diff_*      QRS duration
    len_*           length of signal
    Qual_*          quality of signal measured with nk.ecg_quality
    sign_*          signal
    
    Also the output from nk.hrv_time which contains different measurements for the heart rate variation (HRV*) was added
    
    Additionally one 'typical' heartbeat was greated (all length 180):
    
    MN_*            mean signal
    MD_*            median signal
    P5_*            5 % percentile signal
    P95_*           95 % percentile signal
    SD_*            standard deviation of signal
    """

    names = ['R_ampl_mean', 'R_ampl_median', 'R_ampl_perc5', 'R_ampl_perc95', 'R_ampl_sd', 'R_nr_peaks',
             'len_mean', 'len_median', 'len_perc5', 'len_perc95', 'len_sd',
             'sign_mean', 'sign_median', 'sign_perc5', 'sign_perc95', 'sign_sd',
             'Qual_mean', 'Qual_median', 'Qual_perc5', 'Qual_perc95', 'Qual_sd',
             'Q_ampl_mean', 'Q_ampl_median', 'Q_ampl_perc5', 'Q_ampl_perc95', 'Q_ampl_sd', 'Q_nr_peaks',
             'Q_diff_mean', 'Q_diff_median', 'Q_diff_perc5', 'Q_diff_perc95', 'Q_diff_sd',
             'S_ampl_mean', 'S_ampl_median', 'S_ampl_perc5', 'S_ampl_perc95', 'S_ampl_sd', 'S_nr_peaks',
             'S_diff_mean', 'S_diff_median', 'S_diff_perc5', 'S_diff_perc95', 'S_diff_sd',
             'P_ampl_mean', 'P_ampl_median', 'P_ampl_perc5', 'P_ampl_perc95', 'P_ampl_sd', 'P_nr_peaks',
             'T_ampl_mean', 'T_ampl_median', 'T_ampl_perc5', 'T_ampl_perc95', 'T_ampl_sd', 'T_nr_peaks',
             'QRS_diff_mean', 'QRS_diff_median', 'QRS_diff_perc5', 'QRS_diff_perc95', 'QRS_diff_sd',
             'PR_diff_mean', 'PR_diff_median', 'PR_diff_perc5', 'PR_diff_perc95', 'PR_diff_sd',
             'RT_diff_mean', 'RT_diff_median', 'RT_diff_perc5', 'RT_diff_perc95', 'RT_diff_sd',
             'HRV_RMSSD', 'HRV_MeanNN', 'HRV_SDNN', 'HRV_SDSD', 'HRV_CVNN', 'HRV_CVSD', 'HRV_MedianNN',
             'HRV_MadNN', 'HRV_MCVNN', 'HRV_IQRNN', 'HRV_pNN50', 'HRV_pNN20', 'HRV_TINN', 'HRV_HTI',
             'HRV_ULF','HRV_VLF','HRV_LF','HRV_HF','HRV_VHF','HRV_LFHF','HRV_LFn','HRV_HFn', 	'HRV_LnHF',
             'HRV_SD1','HRV_SD2', 'HRV_SD1SD2','HRV_S','HRV_CSI','HRV_CVI','HRV_CSI_Modified', 'HRV_PIP',
             'HRV_IALS','HRV_PSS','HRV_PAS','HRV_GI','HRV_SI','HRV_AI','HRV_PI','HRV_C1d','HRV_C1a','HRV_SD1d',
             'HRV_SD1a','HRV_C2d','HRV_C2a','HRV_SD2d','HRV_SD2a','HRV_Cd','HRV_Ca','HRV_SDNNd','HRV_SDNNa','HRV_ApEn',
             'HRV_SampEn','J_LF','J_HF','J_L/H']


    template_len = 180

    mean_names = ['MN_' + str(index) for index in range(template_len)]
    median_names = ['MD_' + str(index) for index in range(template_len)]
    perc5_names = ['P5_' + str(index) for index in range(template_len)]
    perc95_names = ['P95_' + str(index) for index in range(template_len)]
    sd_names = ['SD_' + str(index) for index in range(template_len)]

    wavelet = 'db3'

    wl_len = int(np.floor((template_len + pywt.Wavelet(wavelet).dec_len - 1) / 2))

    wl_mean_names = ['WLMN_' + str(index) for index in range(2*wl_len)]
    wl_median_names = ['WLMD_' + str(index) for index in range(2*wl_len)]
    wl_perc5_names = ['WLP5_' + str(index) for index in range(2*wl_len)]
    wl_perc95_names = ['WLP95_' + str(index) for index in range(2*wl_len)]
    wl_sd_names = ['WLSD_' + str(index) for index in range(2*wl_len)]

    typical_signal_names = mean_names + median_names + perc5_names + perc95_names + sd_names + wl_mean_names + \
                           wl_median_names + wl_perc5_names + wl_perc95_names + wl_sd_names

    names += typical_signal_names

    data = np.empty([dataframe.shape[0], len(names)])

    iteration = 0
    for row_index, row in dataframe.iterrows():
        print(row_index)

        # Retrieve ECG data
        ecg_signal = row[:lengths[iteration] + 1]
        ecg_signal = nk.ecg_clean(ecg_signal, sampling_rate=SAMPLING_RATE)

        # Find R-peaks
        peaks, info = nk.ecg_peaks(ecg_signal, sampling_rate=SAMPLING_RATE)

        # R amplitude
        R_amplitudes = ecg_signal[info['ECG_R_Peaks']]

        # Check if the signal is flipped; this needs enough R-peaks to
        # retrieve the additional delineation information
        if len(R_amplitudes) > 4:

            _, waves_peak = nk.ecg_delineate(ecg_signal, info, sampling_rate=300, show=False)

            # Q amplitude

            # remove nan values
            Q_amplitudes = [ecg_signal[peak_index] if str(peak_index) != 'nan' else - np.infty for peak_index in
                            waves_peak['ECG_Q_Peaks']]

            if np.sum([1 if np.abs(rpeak) > np.abs(Q_amplitudes[index]) else -1 for index, rpeak in
                       enumerate(R_amplitudes)]) < 0:
                print("flip", row_index)

                ecg_signal = -ecg_signal

                peaks, info = nk.ecg_peaks(ecg_signal, sampling_rate=300)

                # R amplitude
                R_amplitudes = ecg_signal[info['ECG_R_Peaks']]

                if len(R_amplitudes) > 4:
                    _, waves_peak = nk.ecg_delineate(ecg_signal, info, sampling_rate=300, show=False)

        data_temp = []
        if len(R_amplitudes) > 0:
            data_temp = [np.mean(R_amplitudes),
                         np.median(R_amplitudes),
                         np.percentile(R_amplitudes, q=5),
                         np.percentile(R_amplitudes, q=95),
                         np.std(R_amplitudes),
                         len(R_amplitudes)]
        else:
            empty = np.empty([6])
            empty[:] = np.NaN
            data_temp += empty.tolist()

        # length of signal
        data_new = [np.mean(lengths[iteration] / SAMPLING_RATE),
                    np.median(lengths[iteration] / SAMPLING_RATE),
                    np.percentile(lengths[iteration] / SAMPLING_RATE, q=5),
                    np.percentile(lengths[iteration] / SAMPLING_RATE, q=95),
                    np.std(lengths[iteration] / SAMPLING_RATE)]

        data_temp += data_new

        # signal
        data_new = [np.mean(ecg_signal),
                    np.median(ecg_signal),
                    np.percentile(ecg_signal, q=5),
                    np.percentile(ecg_signal, q=95),
                    np.std(ecg_signal)]

        data_temp += data_new

        # Check if we have enough peaks to retrieve more information
        if len(R_amplitudes) > 4:

            quality = nk.ecg_quality(ecg_signal, sampling_rate=SAMPLING_RATE)
            data_new = [np.mean(quality),
                        np.median(quality),
                        np.percentile(quality, q=5),
                        np.percentile(quality, q=95),
                        np.std(quality)]

            data_temp += data_new

            # Delineate the ECG signal
            # “ECG_P_Peaks”, “ECG_Q_Peaks”, “ECG_S_Peaks”, “ECG_T_Peaks”, “ECG_P_Onsets”, “ECG_T_Offsets”

            # _, waves_peak = nk.ecg_delineate(ecg_signal, info, sampling_rate=SAMPLING_RATE, show=False)

            # Q amplitude

            # remove nan values
            Q_peaks = [peak for peak in waves_peak['ECG_Q_Peaks'] if str(peak) != 'nan']

            if len(Q_peaks) > 0:
                Q_amplitudes = ecg_signal[Q_peaks]

                data_new = [np.mean(Q_amplitudes),
                            np.median(Q_amplitudes),
                            np.percentile(Q_amplitudes, q=5),
                            np.percentile(Q_amplitudes, q=95),
                            np.std(Q_amplitudes),
                            len(Q_amplitudes)]

                data_temp += data_new
            else:
                empty = np.empty([6])
                empty[:] = np.NaN
                empty[5] = 0
                data_temp += empty.tolist()

            # more than 1 Q-Peak => can build interval[s]
            if len(Q_peaks) > 1:
                Q_peaks_diff = [(Q_peaks[index + 1] - Q_peaks[index]) / SAMPLING_RATE
                                for index, item in enumerate(Q_peaks[:len(Q_peaks) - 1])]

                # QQ interval

                data_new = [np.mean(Q_peaks_diff),
                            np.median(Q_peaks_diff),
                            np.percentile(Q_peaks_diff, q=5),
                            np.percentile(Q_peaks_diff, q=95),
                            np.std(Q_peaks_diff)]

                data_temp += data_new

            # 0 or 1 Q-peak = no interval => return nan
            else:
                empty = np.empty([5])
                empty[:] = np.NaN
                data_temp += empty.tolist()

            # S amplitude

            # remove nan values
            S_peaks = [peak for peak in waves_peak['ECG_S_Peaks'] if str(peak) != 'nan']

            if len(S_peaks) > 0:
                S_amplitudes = ecg_signal[S_peaks]

                data_new = [np.mean(S_amplitudes),
                            np.median(S_amplitudes),
                            np.percentile(S_amplitudes, q=5),
                            np.percentile(S_amplitudes, q=95),
                            np.std(S_amplitudes),
                            len(S_amplitudes)]

                data_temp += data_new

            else:
                empty = np.empty([6])
                empty[:] = np.NaN
                empty[5] = 0
                data_temp += empty.tolist()

            # more than one S-peak
            if len(S_peaks) > 1:
                S_peaks_diff = [(S_peaks[index + 1] - S_peaks[index]) / SAMPLING_RATE
                                for index, item in enumerate(S_peaks[:len(S_peaks) - 1])]

                # SS interval

                data_new = [np.mean(S_peaks_diff),
                            np.median(S_peaks_diff),
                            np.percentile(S_peaks_diff, q=5),
                            np.percentile(S_peaks_diff, q=95),
                            np.std(S_peaks_diff)]

                data_temp += data_new

            # 0 or 1 S-peak = no interval => return nan
            else:
                empty = np.empty([5])
                empty[:] = np.NaN
                data_temp += empty.tolist()

            P_peaks = [peak for peak in waves_peak['ECG_P_Peaks'] if str(peak) != 'nan']

            if len(P_peaks) > 0:
                P_amplitudes = ecg_signal[P_peaks]

                data_new = [np.mean(P_amplitudes),
                            np.median(P_amplitudes),
                            np.percentile(P_amplitudes, q=5),
                            np.percentile(P_amplitudes, q=95),
                            np.std(P_amplitudes),
                            len(P_amplitudes)]

                data_temp += data_new

            else:
                empty = np.empty([6])
                empty[:] = np.NaN
                empty[5] = 0
                data_temp += empty.tolist()

            T_peaks = [peak for peak in waves_peak['ECG_T_Peaks'] if str(peak) != 'nan']

            if len(T_peaks) > 0:
                T_peaks = ecg_signal[T_peaks]

                data_new = [np.mean(T_peaks),
                            np.median(T_peaks),
                            np.percentile(T_peaks, q=5),
                            np.percentile(T_peaks, q=95),
                            np.std(T_peaks),
                            len(T_peaks)]

                data_temp += data_new

            else:
                empty = np.empty([6])
                empty[:] = np.NaN
                empty[5] = 0
                data_temp += empty.tolist()


            # QRS interval

            QRS_peaks_diff = []

            # compute difference between Q and S peak
            for index in range(len(waves_peak['ECG_Q_Peaks'])):
                if not (np.isnan(waves_peak['ECG_Q_Peaks'][index]) or np.isnan(waves_peak['ECG_S_Peaks'][index])):
                    QRS_peaks_diff.append(
                        (waves_peak['ECG_S_Peaks'][index] - waves_peak['ECG_Q_Peaks'][index]) / SAMPLING_RATE)

            if len(QRS_peaks_diff) > 0:
                data_new = [np.mean(QRS_peaks_diff),
                            np.median(QRS_peaks_diff),
                            np.percentile(QRS_peaks_diff, q=5),
                            np.percentile(QRS_peaks_diff, q=95),
                            np.std(QRS_peaks_diff)]

                data_temp += data_new

            else:
                empty = np.empty([5])
                empty[:] = np.NaN
                data_temp += empty.tolist()

            # PR interval

            PR_peaks_diff = []

            # compute difference between P and R peak
            for index in range(len(waves_peak['ECG_P_Peaks'])):
                if not np.isnan(waves_peak['ECG_P_Peaks'][index]):
                    PR_peaks_diff.append(
                        (info['ECG_R_Peaks'][index] - waves_peak['ECG_P_Peaks'][index]) / SAMPLING_RATE)

            if len(PR_peaks_diff) > 0:
                data_new = [np.mean(PR_peaks_diff),
                            np.median(PR_peaks_diff),
                            np.percentile(PR_peaks_diff, q=5),
                            np.percentile(PR_peaks_diff, q=95),
                            np.std(PR_peaks_diff)]

                data_temp += data_new
            else:
                empty = np.empty([5])
                empty[:] = np.NaN
                data_temp += empty.tolist()

            # RT interval

            RT_peaks_diff = []

            # compute difference between R and T peak
            for index in range(len(waves_peak['ECG_T_Peaks'])):
                if not np.isnan(waves_peak['ECG_T_Peaks'][index]):
                    RT_peaks_diff.append(
                        (waves_peak['ECG_T_Peaks'][index] - info['ECG_R_Peaks'][index]) / SAMPLING_RATE)

            if len(RT_peaks_diff) > 0:
                data_new = [np.mean(RT_peaks_diff),
                            np.median(RT_peaks_diff),
                            np.percentile(RT_peaks_diff, q=5),
                            np.percentile(RT_peaks_diff, q=95),
                            np.std(RT_peaks_diff)]

                data_temp += data_new

            else:
                empty = np.empty([5])
                empty[:] = np.NaN
                data_temp += empty.tolist()

            # Extract heart rate variability (HRV) features
            # explanation of features:
            # https://neurokit2.readthedocs.io/en/latest/functions.html?highlight=hrv%20time#neurokit2.hrv.hrv_time

            hrv_time = nk.hrv(peaks, sampling_rate=SAMPLING_RATE, show=False)

            data_new = hrv_time.values.tolist()[0]

            data_temp += data_new

            # Jannik
            # http://www.paulvangent.com/2016/03/21/analyzing-a-discrete-heart-rate-signal-using-python-part-2/
            rpeaks = info['ECG_R_Peaks']
            r_interval = [rpeaks[index+1]-rpeaks[index] for index in range(len(rpeaks)-1)]
            RR_x_new = np.linspace(rpeaks[0],rpeaks[-2],rpeaks[-2])
            f = interp1d(rpeaks[:-1], r_interval, kind='cubic')

            n = lengths[iteration] + 1 # Length of the signal
            frq = np.fft.fftfreq(n, d=(1 / SAMPLING_RATE)) # divide the bins into frequency categories
            frq = frq[range(int(n/2))] # Get single side of the frequency range

            Y = np.fft.fft(f(RR_x_new))/n # Calculate FFT

            try:
                Y = Y[range(int(n / 2))]
                lf = np.trapz(abs(Y[(frq >= 0.04) & (frq <= 0.15)]))

                hf = np.trapz(abs(Y[(frq >= 0.16) & (frq <= 0.5)]))  # Do the same for 0.16-0.5Hz (HF)

                data_new = [lf, hf, lf / hf]

                data_temp += data_new
            except IndexError as err:
                print(err)
                data_temp += [None, None, None]

        # if we don't have enough R peaks return vector of nan's
        else:
            empty = np.empty([len(names) - 16 - len(typical_signal_names)])
            empty[:] = np.NaN
            data_temp += empty.tolist()

        # Create a 'typical' heartbeat

        # Scaler = StandardScaler()
        # ecg_signal = Scaler.fit_transform(X=ecg_signal.reshape(-1, 1)).reshape(1, -1)[0].tolist()

        out = ecg.ecg(signal=ecg_signal, sampling_rate=SAMPLING_RATE, show=False)

        mean = np.mean(out['templates'], axis=0)
        median = np.median(out['templates'], axis=0)
        perc5 = np.percentile(out['templates'].astype(np.float64), axis=0, q=5)
        perc95 = np.percentile(out['templates'].astype(np.float64), axis=0, q=95)
        std = np.std(out['templates'].astype(np.float64), axis=0)

        data_new = np.concatenate((mean, median, perc5, perc95, std)).tolist()

        data_temp += data_new

        (wl_mean_cA, wl_mean_cD) = pywt.dwt(np.mean(out['templates'], axis=0),
                                            'db3', 'periodic')
        (wl_median_cA, wl_median_cD) = pywt.dwt(np.median(out['templates'], axis=0),
                                                'db3', 'periodic')
        (wl_perc5_cA, wl_perc5_cD) = pywt.dwt(np.percentile(out['templates'].astype(np.float64), axis=0, q=5),
                                              'db3', 'periodic')
        (wl_perc95_cA, wl_perc95_cD) = pywt.dwt(np.percentile(out['templates'].astype(np.float64), axis=0, q=95),
                                                'db3', 'periodic')
        (wl_sd_cA, wl_sd_cD) = pywt.dwt(np.std(out['templates'].astype(np.float64), axis=0),
                                        'db3', 'periodic')

        data_new = np.concatenate((wl_mean_cA, wl_mean_cD,
                                   wl_median_cA, wl_median_cD,
                                   wl_perc5_cA, wl_perc5_cD,
                                   wl_perc95_cA, wl_perc95_cD,
                                   wl_sd_cA, wl_sd_cD)).tolist()

        data_temp += data_new

        data[iteration] = data_temp

        iteration += 1

    features = pd.DataFrame(data, columns=names)

    return features
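
A hedged usage sketch; the file names are hypothetical, and the module-level dependencies the function relies on (numpy as np, pandas as pd, neurokit2 as nk, pywt, biosppy's ecg, scipy's interp1d and the SAMPLING_RATE constant) are assumed to be imported or defined elsewhere in the module.

import pandas as pd

SAMPLING_RATE = 300  # Hz; assumed from the sampling_rate=300 calls above

# Hypothetical input: one padded, variable-length ECG signal per row.
raw = pd.read_csv('X_train.csv', index_col=0)
features = create_df(raw)
features.to_csv('ecg_features.csv', index=False)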
Code example #56
0
    def transform(self, X, **transform_params):
        data = DataFrame(X[['completed', 'completed_post']])
        result = []
        for index, row in data.iterrows():
            result.append(self.GetClass(row))
        return DataFrame(result)
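
For context, a sketch of how such a method typically sits inside a scikit-learn style transformer; the class name, the `GetClass` rule and the column semantics are assumptions, not the original code.

from pandas import DataFrame
from sklearn.base import BaseEstimator, TransformerMixin

class CompletionClassTransformer(BaseEstimator, TransformerMixin):
    # Hypothetical enclosing class for the transform method above.
    def fit(self, X, y=None):
        return self

    def GetClass(self, row):
        # Assumed labelling rule: 1 if both stages were completed, else 0.
        return int(bool(row['completed']) and bool(row['completed_post']))

    def transform(self, X, **transform_params):
        data = DataFrame(X[['completed', 'completed_post']])
        return DataFrame([self.GetClass(row) for _, row in data.iterrows()])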
Code example #57
0
def postprocess_predictions(df: pd.DataFrame,
                            opt: argparse.Namespace) -> pd.DataFrame:
    '''
    input:
        + df: input pandas dataframe.
        + opt: configuration.
    output:
        postprocessed pandas dataframe.
    '''
    post_predictions = []
    if opt.has_label:
        list_of_important_tags = []

    if opt.use_multiprocessing:
        import multiprocessing as mp

        # Apply a patch for the multiprocessing module
        import multiprocessing.pool as mpp
        from magneto.utils import istarmap
        mpp.Pool.istarmap = istarmap

        all_rows = [row for idx, row in df.iterrows()]

        inputs = list(
            zip(all_rows, [copy.deepcopy(opt) for _ in range(len(df))]))

        with mp.Pool(opt.num_workers) as pool:
            for result in tqdm(pool.istarmap(postprocess_prediction, inputs),
                               total=len(inputs)):
                if opt.has_label:
                    important_tags, post_prediction = result
                    list_of_important_tags.append(important_tags)
                else:
                    post_prediction = result

                post_predictions.append(post_prediction)

    else:
        for idx, row in tqdm(list(df.iterrows())):
            if opt.has_label:
                important_tags, post_prediction = postprocess_prediction(
                    row, opt)
                list_of_important_tags.append(important_tags)
            else:
                post_prediction = postprocess_prediction(row, opt)

            post_predictions.append(post_prediction)

    list_of_pred_tags = []
    list_of_probs = []

    for post_prediction in post_predictions:
        post_prediction = list(zip(*post_prediction))
        if len(post_prediction) >= 2:
            # TODO we will take care of masks later.
            pred_tags, probs = post_prediction[0], post_prediction[1]

            list_of_pred_tags.append('\n'.join(pred_tags))
            probs = np.round(probs, decimals=3)
            probs = np.array(probs, dtype=str)
            list_of_probs.append('\n'.join(probs))
        else:
            list_of_pred_tags.append('')
            list_of_probs.append('')

    df['pred_tags'] = list_of_pred_tags
    df['probs'] = list_of_probs

    if opt.has_label:
        list_of_important_tags = list(
            map(lambda x: '\n'.join(x), list_of_important_tags))

        df['important_tags'] = list_of_important_tags

    return df
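
A hedged usage sketch; the argparse fields mirror the attributes referenced above, and the input file is hypothetical.

import argparse
import pandas as pd

opt = argparse.Namespace(has_label=False, use_multiprocessing=False, num_workers=4)

# Hypothetical raw predictions to be postprocessed.
preds = pd.read_csv('raw_predictions.csv')
preds = postprocess_predictions(preds, opt)
print(preds[['pred_tags', 'probs']].head())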
Code example #58
0
                        'Server=borismsdn.database.windows.net;'
                        'Database=DemoData;'
                        'uid=readbot;pwd=xxxxxxx')
cursor = conn.cursor()
# How many years of data should be loaded for a new airport (initial load)
years_load_for_new_loc = 1
# Load a dataframe that contains all needed airports plus the start and end dates for loading deltas (or the initial load in the case of new airports)
cur = cursor.execute(
    'select l.airportcode, isnull(max_date, DateAdd(yy, -?, GetDate())) as max_date, DateAdd(dd, -1, GetDate()) as end_date from dbo.Locations l left join (select airportcode, max(date_utc) as max_date from dbo.Weather group by airportcode) w on l.airportcode=w.airportcode',
    (years_load_for_new_loc, ))
df = DataFrame(cur.fetchall())

conn.close()
print(df)
# Parse data for each airport
for index, row in df.iterrows():
    try:
        city = row[0]
        print("Parsing data for " + city)
        start_date = row[1]
        print("Start date: " + str(start_date))
        end_date = row[2]
        print("End date: " + str(end_date))
        df = []
        # Parse each link based on the date
        for single_date in daterange(start_date, end_date):
            datec = single_date.strftime("%Y/%m/%d")
            print(datec)
            url = "https://www.wunderground.com/history/airport/" + city + "/" + datec + "/DailyHistory.html?req_city=&req_statename=&MR=1&format=1"
            # Some empty values appear as -9999; remove them. The last column also ends with <br /> on each row, which has to be stripped separately.
            cur_df = pd.read_table(url, delimiter=',',
Code example #59
0
    df.drop('index', axis=1, inplace=True)
    return df


linalg = np.linalg
np.random.seed(8)
numOfRows = 20000
numOfSensors = 200
numOfClusters = 20
start_sensor = 48
x = np.random.normal(size=numOfSensors)
y = np.random.normal(size=numOfSensors)
map = DataFrame(dict(longitude=x, latitude=y, index=range(0, numOfSensors)))
distance_list = []
map.drop('index', inplace=True, axis=1)
for index, row in map.iterrows():
    distances = pd.DataFrame({}, columns=['sensor', 'distance'])
    curr_lat = row['latitude']
    curr_long = row['longitude']
    for index1, row1 in map.iterrows():
        row_toAdd = pd.Series(
            {
                'sensor':
                index1,
                'distance':
                distance(row['latitude'], row['longitude'], row1['latitude'],
                         row1['longitude'])
            },
            name=index1)
        distances = distances.append(row_toAdd)
    distances = distances.sort_values(by=['distance'])
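
The `distance()` helper used in the loop above is not shown; here is a minimal sketch (plain Euclidean distance is an assumption - the original may well use a great-circle formula for latitude/longitude).

import numpy as np

def distance(lat1, long1, lat2, long2):
    # Assumed helper: straight-line distance between two (lat, long) points.
    return np.sqrt((lat1 - lat2) ** 2 + (long1 - long2) ** 2)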
Code example #60
0
def present(record_duration=120,
            stim_types=None,
            itis=None,
            additional_labels={},
            secs=0.07,
            volume=0.8,
            eeg=None,
            save_fn=None):

    markernames = [1, 2]
    record_duration = np.float32(record_duration)

    ## Initialize stimuli
    #aud1 = sound.Sound('C', octave=5, sampleRate=44100, secs=secs)
    aud1 = sound.Sound(440,
                       secs=secs)  #, octave=5, sampleRate=44100, secs=secs)
    aud1.setVolume(volume)

    #aud2 = sound.Sound('D', octave=6, sampleRate=44100, secs=secs)
    aud2 = sound.Sound(528, secs=secs)
    aud2.setVolume(volume)
    auds = [aud1, aud2]

    # Setup trial list
    trials = DataFrame(dict(sound_ind=stim_types, iti=itis))

    for col_name, col_vec in additional_labels.items():
        trials[col_name] = col_vec

    # Setup graphics
    mywin = visual.Window([1920, 1080],
                          monitor='testMonitor',
                          units='deg',
                          fullscr=True)
    fixation = visual.GratingStim(win=mywin,
                                  size=0.2,
                                  pos=[0, 0],
                                  sf=0,
                                  rgb=[1, 0, 0])
    fixation.setAutoDraw(True)
    mywin.flip()
    iteratorthing = 0

    # start the EEG stream, will delay 5 seconds to let signal settle
    if eeg:
        if save_fn is None:  # If no save_fn passed, generate a new unnamed save file
            save_fn = generate_save_fn(eeg.device_name, 'auditoryaMMN',
                                       'unnamed')
            print(
                f'No path for a save file was passed to the experiment. Saving data to {save_fn}'
            )
        eeg.start(save_fn, duration=record_duration)

    show_instructions(10)

    # Start EEG Stream, wait for signal to settle, and then pull timestamp for start point
    start = time()

    # Iterate through the events
    for ii, trial in trials.iterrows():

        iteratorthing = iteratorthing + 1

        # Inter trial interval
        core.wait(trial['iti'])

        # Select and display image
        ind = int(trial['sound_ind'])
        auds[ind].stop()
        auds[ind].play()

        # Push sample
        if eeg:
            timestamp = time()
            if eeg.backend == 'muselsl':
                marker = [additional_labels['labels'][iteratorthing - 1]]
                marker = list(map(int, marker))
            else:
                marker = additional_labels['labels'][iteratorthing - 1]
            eeg.push_sample(marker=marker, timestamp=timestamp)

        mywin.flip()

        mywin.flip()
        if len(event.getKeys()) > 0:
            break
        if (time() - start) > record_duration:
            break

        event.clearEvents()

        if iteratorthing == 1798:
            sleep(10)

    # Cleanup
    if eeg: eeg.stop()

    mywin.close()
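
A hedged usage sketch for present(); the stimulus and ITI generation below is illustrative and not taken from the original experiment script.

import numpy as np

n_trials = 200
stim_types = np.random.binomial(1, 0.15, n_trials).tolist()  # ~15% deviant tones
itis = np.random.uniform(0.4, 0.6, n_trials).tolist()        # jittered inter-trial intervals

present(record_duration=120,
        stim_types=stim_types,
        itis=itis,
        additional_labels={'labels': [s + 1 for s in stim_types]},
        secs=0.07,
        volume=0.8)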