Example #1
import math
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy import spatial

def gonzales(data, k):
    # Transform the data numpy array to a DataFrame, using the id column as the index.
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    # Add two columns to the points frame to hold each point's assigned center and distance.
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    # Choose the first point as the initial center (the random-sample variant is left commented out).

    #center0 = points_list.sample(n=1, random_state=randint(0, 100), axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    # Loop k times; one extra center is collected and the last one is dropped below,
    # leaving exactly k centers.
    for k_cycle in range(1, k + 1):
        # Track the candidate next center: the point farthest from its closest center.
        max_distance = 0
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            # Track the closest center for this point.
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center[[0, 1]].to_numpy(dtype=float), p[[0, 1]].to_numpy(dtype=float))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = pd.concat([centers_list,
                                  points_list.loc[[next_cluster], points_list.columns[:distance_column_index]]])
        centers_list.at[next_cluster, 'color'] = colors[k_cycle % len(colors)]  # wrap if k exceeds the palette
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list[[0, 1]].to_numpy()
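The nested iterrows loops above compute point-to-center distances one pair at a time. A minimal vectorized sketch of the same farthest-first selection step, assuming plain (n, 2) NumPy arrays rather than the DataFrames used above:

import numpy as np
from scipy.spatial.distance import cdist

def farthest_point(points, centers):
    # Pairwise distances (n, m), then each point's distance to its nearest center.
    nearest = cdist(points, centers).min(axis=1)
    # The point that maximizes that distance becomes the next center.
    return points[np.argmax(nearest)]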
Example #2
    def test_setitem_cache_updating(self):
        # GH 5424
        cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']

        for do_ref in [True, False]:
            df = DataFrame({'a': cont,
                            "b": cont[3:] + cont[:3],
                            'c': np.arange(7)})

            # ref the cache
            if do_ref:
                df.loc[0, "c"]

            # set it
            df.loc[7, 'c'] = 1

            assert df.loc[0, 'c'] == 0.0
            assert df.loc[7, 'c'] == 1.0

        # GH 7084
        # not updating cache on series setting with slices
        expected = DataFrame({'A': [600, 600, 600]},
                             index=date_range('5/7/2014', '5/9/2014'))
        out = DataFrame({'A': [0, 0, 0]},
                        index=date_range('5/7/2014', '5/9/2014'))
        df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]})

        # loop through df to update out
        six = Timestamp('5/7/2014')
        eix = Timestamp('5/9/2014')
        for ix, row in df.iterrows():
            out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D']

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out['A'], expected['A'])

        # try via a chain indexing
        # this actually works
        out = DataFrame({'A': [0, 0, 0]},
                        index=date_range('5/7/2014', '5/9/2014'))
        for ix, row in df.iterrows():
            v = out[row['C']][six:eix] + row['D']
            out[row['C']][six:eix] = v

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out['A'], expected['A'])

        out = DataFrame({'A': [0, 0, 0]},
                        index=date_range('5/7/2014', '5/9/2014'))
        for ix, row in df.iterrows():
            out.loc[six:eix, row['C']] += row['D']

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out['A'], expected['A'])
Example #3
    def compute_tf_idf_queries(self):
        # Find total number of document
        results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, total_word, path FROM documents')
        tmp = results.fetchall()
        documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
        documents_df['tf_idf'] = 0.0

        no_docterm = {}

        for query in self.queries:
            no_docterm[query] = 0

        for index, row in documents_df.iterrows():
            path = row['path']
            with codecs.open(path, 'rt', encoding='utf-8') as f:
                text = f.read()
                for query in self.queries:
                    if query in text.lower():
                        no_docterm[query] += 1

        for query in self.queries:
            for index, row in documents_df.iterrows():
                total_word = row['total_word']
                path = row['path']

                with codecs.open(path, 'rt', encoding='utf-8') as f:
                    text = f.read()

                tf_idf = self._compute_tf_idf_queries(text, total_word, total_doc, no_docterm[query])
                documents_df.at[index, 'tf_idf'] += tf_idf

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df['tf_idf'] = 0.0

        for index, row in df.iterrows():
            did = row['did']
            tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
            df.at[index, 'tf_idf'] = tf_idf

        del df['did']
        df = df.groupby(['e_type', 'entity']).sum().reset_index()
        return df
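The `_compute_tf_idf_queries` helper is not included in this snippet. The sketch below is only a guess at the standard tf-idf convention its arguments suggest (hypothetical, not the project's actual code):

import math

def _compute_tf_idf_queries(self, text, total_word, total_doc, no_docterm):
    # Hypothetical sketch: document-length-normalized term frequency times a
    # smoothed inverse document frequency. The real helper may differ.
    if total_word == 0:
        return 0.0
    tf = 1.0 / total_word
    idf = math.log(total_doc / (1.0 + no_docterm))
    return tf * idf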
Example #4
    def test_pivot_index_with_nan(self):
        # GH 3588
        nan = np.nan
        df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
                        'b': ['C1', 'C2', 'C3', 'C4'],
                        'c': [10, 15, 17, 20]})
        result = df.pivot('a', 'b', 'c')
        expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
                              [nan, 15, nan, nan], [nan, nan, nan, 20]],
                             index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
                             columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T)

        # GH9491
        df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'),
                        'c': 100 + np.arange(6)})
        df['b'] = df['a'] - pd.Timestamp('2014-02-02')
        df.loc[1, 'a'] = df.loc[3, 'a'] = nan
        df.loc[1, 'b'] = df.loc[4, 'b'] = nan

        pv = df.pivot('a', 'b', 'c')
        self.assertEqual(pv.notnull().values.sum(), len(df))

        for _, row in df.iterrows():
            self.assertEqual(pv.loc[row['a'], row['b']], row['c'])

        tm.assert_frame_equal(df.pivot('b', 'a', 'c'), pv.T)
Example #5
def predict(self, prediction_data):
    df = DataFrame(prediction_data)
    ret = []
    for row in df.iterrows():
        index, data = row
        ret.append(self.agg(data.tolist()))
    return ret
Example #6
def predict(self, prediction_data):
    preds = DataFrame(prediction_data)
    ret = []
    for row in preds.iterrows():
        index, data = row
        ret.append(mean(data))  # mean() is assumed from the surrounding module (e.g. statistics.mean)
    return ret
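For a purely numeric frame, the same per-row mean needs no explicit loop; a sketch, assuming prediction_data holds only numeric columns:

from pandas import DataFrame

def predict_vectorized(prediction_data):
    # mean(axis=1) averages across each row in one vectorized call.
    return DataFrame(prediction_data).mean(axis=1).tolist()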
Example #7
def convert2flightplan(df: pd.DataFrame):
    """
    Convert dataframe to Flight Gear Flight Plan.
    """

    # Feet above sea level
    df['fasl'] = df['masl'] * 3.28084

    # Knots are nm (1852 m) per hour.
    df['knots'] = (df['dm'] / 1852) / (df['dt_sec'] / 60 / 60)

    rv = HEADER
    prev_state = STOP
    # Take off at Vr (stop ignoring altitude).
    df['state'] = df.apply(lambda x: flight_state(x['knots']), axis=1)

    for index, row in df.iterrows():
        state = row['state']
        if state in (TAXI, RUNWAY, AERO,):
            # Only output when really moving.

            rv += WPT.format(**row, ground='true' if state in (STOP, TAXI, RUNWAY,) else 'false')

            if state == TAXI and prev_state == STOP:
                # Landed new flight.
                rv += FOOTER + HEADER + "<!-- -lat={lat} -lon={lon} -->".format(**row)
        prev_state = state

    rv += FOOTER
    return rv
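The knots conversion rests on the definition 1 knot = 1852 m per hour; a quick sanity check of the arithmetic:

# 1852 m covered in 3600 s is exactly one knot.
dm, dt_sec = 1852.0, 3600.0
assert abs((dm / 1852) / (dt_sec / 60 / 60) - 1.0) < 1e-9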
Example #8
def resolve(dataset, m):
    t = dataset.y
    phis = DataFrame()
    for i in range(0,m+1):
        p = dataset.x**i
        p.name="x**%d" % i
        phis = pd.concat([phis,p], axis=1)

    for index, line in phis.iterrows():
        phi = DataFrame(line)
        if index == 0:
            phiphi = np.dot(phi,phi.T)
        else:
            phiphi += np.dot(phi,phi.T)
    # alpha and beta are precision hyperparameters defined outside this snippet
    s_inv = alpha * DataFrame(np.identity(m+1)) + beta * phiphi
    s = np.linalg.inv(s_inv)

    # Mean m(x)
    def mean_fun(x0):
        phi_x0 = DataFrame([x0 ** i for i in range(0,m+1)])
        for index, line in phis.iterrows():
            if index == 0:
                tmp = t[index] * line
            else:
                tmp += t[index] * line
        return (beta * np.dot(np.dot(phi_x0.T, s), DataFrame(tmp))).flatten()

    # Standard deviation s(x)
    def deviation_fun(x0):
        phi_x0 = DataFrame([x0 ** i for i in range(0,m+1)])
        deviation = np.sqrt(1.0/beta + np.dot(np.dot(phi_x0.T, s), phi_x0))
        return deviation.diagonal()

    return mean_fun, deviation_fun
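For reference, the function implements the Bayesian predictive distribution for polynomial curve fitting (as in Bishop's PRML, §1.2.6): the code builds S⁻¹ = αI + β Σₙ φ(xₙ)φ(xₙ)ᵀ, and the returned functions evaluate the predictive mean m(x) = β φ(x)ᵀ S Σₙ φ(xₙ) tₙ and standard deviation s(x) = √(1/β + φ(x)ᵀ S φ(x)), with alpha and beta as the prior and noise precisions.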
Example #9
def create_seated(
        settings: dict,
        passengers: pd.DataFrame) -> pd.DataFrame:
    """
    :param settings:
        Configuration settings for the current trial
    :param passengers:
        The passengers data frame for the trial
    """

    passenger_index = []
    seat_names = []
    seated_time = []

    for index, passenger in passengers.iterrows():
        passenger_index.append(index)
        seat_names.append(
            '{}{}'.format(passenger['aisle'], passenger['letter'])
        )
        seated_time.append(None)

    return pd.DataFrame({
        'passenger': passenger_index,
        'seat': seat_names,
        'time': seated_time
    })
Example #10
def calculate(settings: dict, progress: pd.DataFrame):
    """

    :param settings:
    :param progress:
    :return:
    """

    passenger_count = settings['passenger_count']

    waiting = []

    previous_row = None
    for elapsed_time, row in progress.iterrows():
        waiting.append(0)
        for passenger_index in range(passenger_count):
            if previous_row is None:
                continue

            position = row[str(passenger_index)]
            last_position = previous_row[str(passenger_index)]

            if position == last_position:
                waiting[-1] += 1

        previous_row = row
        waiting[-1] = 100.0 * waiting[-1] / passenger_count
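The same per-step waiting percentage can be computed inside calculate without the nested loops; a sketch, assuming the position columns are the stringified passenger indices as above:

cols = [str(i) for i in range(passenger_count)]
# diff() compares each row with the previous one; an unchanged position means waiting.
waiting = ((progress[cols].diff() == 0).mean(axis=1) * 100.0).tolist()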
Example #11
def test_age(df: DataFrame):
    for index, row in df.iterrows():
        name = row['Name']
        age = row['Age']
        if not math.isnan(age):
            if age <= 8:
                res = 'kid'
            elif age <= 30:
                res = 'young'
            elif age <= 45:
                res = 'middle'
            else:
                res = 'old'
        else:
            if match_name(name, r".*Master\..*"):
                res = 'kid'
            elif match_name(name, r".*Miss\..*"):
                res = 'young'
            elif match_name(name, r".*Mr(s)?\..*"):
                res = 'middle'
            else:
                res = 'young'
        df.loc[index, 'Age'] = res
    return df
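A vectorized sketch of the same bucketing, assuming the regex fallbacks keep the same priority as the if-chain above:

import numpy as np
import pandas as pd

binned = pd.cut(df['Age'], [0, 8, 30, 45, np.inf],
                labels=['kid', 'young', 'middle', 'old'])
fallback = np.select(
    [df['Name'].str.contains(r'Master\.'),
     df['Name'].str.contains(r'Miss\.'),
     df['Name'].str.contains(r'Mrs?\.')],
    ['kid', 'young', 'middle'],
    default='young')
df['Age'] = np.where(df['Age'].notna(), binned.astype(object), fallback)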
Example #12
    def sum_of_parts(self):
        '''
        For more info on this see:
        https://github.com/unicef/rhizome/blob/master/docs/spec.rst#aggregation-and-calculation

        '''

        ## get the indicator_ids we need to make the calculation ##
        initial_calc_df = self.build_calc_df(['PART_TO_BE_SUMMED'])

        ## handle recursive calculations ( see spec.rst link above ) ##
        calc_df = self.build_recursive_sum_calc_df(initial_calc_df)

        self_join_calc_df = calc_df.merge(calc_df, left_on='indicator_component_id',
            right_on='calc_indicator_id', how='left')

        ## get the datapoints for the above indicator_ids ##
        dp_df = self.build_dp_df(calc_df['indicator_component_id'])

        ## now join the above dataframe on itself to set up the calculation ##
        dp_df_with_calc = self.join_dp_to_calc(calc_df, dp_df)

        ## take the sum of all of the component indicators ##
        grouped_df = DataFrame(dp_df_with_calc.merge(dp_df_with_calc)\
            .groupby(['location_id','calc_indicator_id','campaign_id',])\
            ['value'].sum())

        for ix, row_data in grouped_df.iterrows():
            self.dwc_tuple_dict[ix] = row_data.value
Example #13
def create_unified_column(data_frame: pd.DataFrame) -> pd.Series:
    unified = [
        '-'.join(to_strings(row.to_dict().values()))
        for _, row in data_frame.iterrows()
    ]

    return pd.Series(unified)
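Assuming to_strings simply stringifies each value, the same column can be built without iterrows (the function name below is hypothetical):

import pandas as pd

def create_unified_column_vectorized(data_frame: pd.DataFrame) -> pd.Series:
    # astype(str) stringifies every cell; agg joins each row with '-'.
    return data_frame.astype(str).agg('-'.join, axis=1).reset_index(drop=True)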
Example #14
def receiver_locations(locs: pandas.DataFrame):
    if not isinstance(locs, pandas.DataFrame):
        return

    if cartopy is not None:
        ax = figure().gca(projection=cartopy.crs.PlateCarree())

        ax.add_feature(cpf.LAND)
        ax.add_feature(cpf.OCEAN)
        ax.add_feature(cpf.COASTLINE)
        ax.add_feature(cpf.BORDERS, linestyle=':')
    else:
        ax = figure().gca()

    for name, loc in locs.iterrows():
        if 15 <= loc.interval < 30:
            c = 'g'
        elif 5 <= loc.interval < 15:
            c = 'orange'  # 'o' is a marker code, not a valid matplotlib color
        elif loc.interval < 5:
            c = 'r'
        else:  # large or undefined interval
            c = 'b'

        if np.isfinite(loc.interval):
            ax.scatter(loc.lon, loc.lat, s=1000*1/loc.interval, c=c, label=name)
        else:
            ax.scatter(loc.lon, loc.lat, c=c, label=name)
Example #15
class Iteration(object):

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
Example #17
def make_lines(tracks: pd.DataFrame, transformation: dict) -> list:
    def make_line(start: pd.Series, end: pd.Series) -> str:
        return create_tag('line', {
            'stroke': 'rgba(0, 0, 0, 0.2)',
            'stroke-width': '2',
            'stroke-dasharray': '5,5',
            'x1': transform_x(start['x'], transformation),
            'y1': transform_y(start['y'], transformation),
            'x2': transform_x(end['x'], transformation),
            'y2': transform_y(end['y'], transformation)
        })

    previous = pd.Series(dict(
        x=transformation['x_min'],
        y=tracks.iloc[0]['y']
    ))
    lines = []

    for index, row in tracks.iterrows():
        lines.append(make_line(previous, row))
        previous = row

    if previous['x'] < transformation['x_max']:
        lines.append(make_line(previous, pd.Series(dict(
            x=transformation['x_max'],
            y=previous['y']
        ))))

    return lines
Example #18
    def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                                  workers=1, ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. Index is a datetime, column per committer, with number of lines
        blamed to each committer at each timestamp as data.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optional, default=None) a list of globs to include, default of None includes everything.
        :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core.
        :return: DataFrame

        """

        if not _has_joblib:
            raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use
            cumulative_blame() instead.''')

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        revisions = json.loads(revs.to_json(orient='index'))
        revisions = [revisions[key] for key in revisions]

        ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
            delayed(_parallel_cumulative_blame_func)
            (self, x, committer, ignore_globs, include_globs) for x in revisions
        )

        revs = DataFrame(ds)
        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        revs = revs.loc[keep_idx]
        revs.sort_index(ascending=False, inplace=True)

        return revs
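This snippet appears to be the parallel_cumulative_blame method of git-pandas' Repository class; a usage sketch under that assumption (the clone path is hypothetical):

from gitpandas import Repository

repo = Repository(working_dir='/path/to/local/clone')  # hypothetical path
blame = repo.parallel_cumulative_blame(branch='master', num_datapoints=25, workers=4)
print(blame.head())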
Example #19
def write_dialer(filepath: str, dialer: pd.DataFrame):
    """
    Write to fixed width dialer format - expect each column to be left justified data frame with no need for character padding
    line endings are carriage returns in windows - FIX??
    """
    with open(filepath, 'w') as f:
        for i, row in dialer.iterrows():
            f.write(''.join(row.tolist()) + "\n")
Example #20
    def fillna_dict(cls, prop):
        """
        Use trade history then fill empty with value row above
        """
        df = DataFrame(prop)
        df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
        df = df.ffill()

        return [r.to_dict() for k, r in df.iterrows()]
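A small usage sketch of the forward-fill behavior (TradeHistory is a hypothetical host class for the classmethod):

# Rows holding '', 'DEBIT' or 'CREDIT' inherit the value from the row above:
rows = TradeHistory.fillna_dict({'amount': [100, '', 'DEBIT']})
# -> [{'amount': 100}, {'amount': 100}, {'amount': 100}]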
Example #21
def set_cabin(df: DataFrame):
    for index, row in df.iterrows():
        # NaN in an object column surfaces as float
        if isinstance(row['Cabin'], float):
            df.loc[index, 'Cabin'] = 'X'
        else:
            df.loc[index, 'Cabin'] = row['Cabin'][0]
    df['Cabin'] = df['Cabin'].astype('object')
    return df
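The loop above is equivalent to a single vectorized expression (a sketch): keep the first cabin character and substitute 'X' for missing values:

df['Cabin'] = df['Cabin'].str[0].fillna('X')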
Example #22
 def save_to_file(self, fn):
     gg = DataFrame(self.power_series_apps_table)
     try:
         del gg['diff1']
         del gg['diff2']
     except KeyError:
         pass
         
     gg['Loc Events'] = self.loc.events_apps_1min['Apps']
     apps = self.loc.metadata.get_channels()
     sd = {}
     #Initialize series with 0s
     for app in apps:
         sd[app] = Series(0, index=gg.index)
         
     #Count location events for each appliance
     for index, row in gg.iterrows():
         try:
             if len(row['Loc Events']) > 0:
                 for app in apps:
                     n = row['Loc Events'].count(app)
                     sd[app][index] = n
         except Exception:
             continue
     
     if self.loc.name == 'REDD':
         sd[(3,4)] = sd[3]
         sd[(10,20)] = sd[10]
         del sd[3]
         del sd[4]
         del sd[10]
         del sd[20]
       
     #Change column names and append them to gral table
     locevents = DataFrame(sd)
     locevents.columns = [(str(col) + ' locEv') for col in locevents]        
     for locEv in locevents:
         gg[locEv] = locevents[locEv]
         
     
     #Get power values of each appliance and resample for 1min
     act = DataFrame(self.loc.appliances_consuming_times)
     act = act.resample('1Min').mean()  # resample() alone no longer returns values; aggregate explicitly
            
     if self.loc.name == 'REDD':
         del act[3]
         del act[10]
         act.columns = [(3,4), 5,6,7,8,9,11,12,13,14,15,16,17,18,19,(10,20)]
     act.columns = [(str(col) + ' conEv') for col in act]
     
     for app in act:
         gg[app] = act[app]        
     gg.columns = [str(col) for col in gg]
     gg = gg[sorted(gg.columns)]
     gg.to_csv(fn)   
     return
Example #23
def make_circles(tracks: pd.DataFrame, transformation: dict) -> list:
    def make_circle(track: pd.Series):
        return create_tag('circle', {
            'r': 16,
            'cx': transform_x(track['x'], transformation),
            'cy': transform_y(track['y'], transformation),
            'style': 'fill:{}'.format(get_color(track, tracks))
        })

    return [make_circle(row) for index, row in tracks.iterrows()]
Example #24
 def test_iterrows_corner(self):
     # gh-12222
     df = DataFrame(
         {'a': [datetime.datetime(2015, 1, 1)], 'b': [None], 'c': [None],
          'd': [''], 'e': [[]], 'f': [set()], 'g': [{}]})
     expected = Series(
         [datetime.datetime(2015, 1, 1), None, None, '', [], set(), {}],
         index=list('abcdefg'), name=0, dtype='object')
     _, result = next(df.iterrows())
     tm.assert_series_equal(result, expected)
Example #25
def record_match_data(min_seq):
    matches = db.match.find({"match_seq_num": { '$gt': min_seq } })
    # import pdb; pdb.set_trace()
    for match in matches:
        if match["human_players"] == 10 and match["duration"] > 1200:

            data_frame = DataFrame(match["players"])
            radiant_heroes = data_frame[data_frame['player_slot']<128]['hero_id'].tolist()
            dire_heroes = data_frame[data_frame['player_slot']>=128]['hero_id'].tolist()
            for index, row in data_frame.iterrows():
                if (row["player_slot"] < 128):
                    radiant_heroes.remove(row["hero_id"])
                    teammate = radiant_heroes
                    opponent = dire_heroes
                    is_win = bool(match["radiant_win"])
                else:
                    dire_heroes.remove(row["hero_id"])
                    teammate = dire_heroes
                    opponent = radiant_heroes
                    is_win = not bool(match["radiant_win"])
                record_json = json.loads(row.to_json())
                record_json['win'] = is_win
                record_json['match_id'] = match['match_id']
                record_json['match_seq'] = match['match_seq_num']
                record_json['teammate'] = teammate
                record_json['opponent'] = opponent

                item = []

                for x in range(0, 6):
                    if record_json["item_{}".format(x)] > 0:
                        if "item_{}".format(x) in record_json:
                            item.append(record_json["item_{}".format(x)])
                            del record_json["item_{}".format(x)]
                        if "item_{}_name".format(x) in record_json:
                            del record_json["item_{}_name".format(x)]

                record_json['item'] = item

                count = statics_db.match_record.count_documents({'$and': [{'hero_id': row['hero_id']}, {'match_id': match['match_id']}]})
                if count == 0:
                    statics_db.match_record.insert_one(record_json)
            max_solved_seq_num = max(statics_db.max_solved_seq_num.find({"value_name":"max_solved_seq_num"})[0]["value"],match["match_seq_num"])
            statics_db.max_solved_seq_num.update_one(
                {"value_name":"max_solved_seq_num"},
                {
                    "$set":
                    {
                        "value":max_solved_seq_num
                    },
                    "$currentDate": {"lastModified": True}
                }
            )
            logging.info("match handle:"+str(max_solved_seq_num))
Example #26
def word_freq(file_name, suffix='_wordfreq', sep='\t', threshold=.5):
	print "start word_freq"
	# start = datetime.datetime.now()
	# print start
	reviews = pd.read_csv(file_name, error_bad_lines=False, sep=sep)
	cb = reviews['stopword_body']
	rate = reviews['Rating']
	# label all words with the rating
	cb_temp = []
	for i, c in enumerate(cb):
		cb_temp.append([(w, rate[i]) for w in ast.literal_eval(c)])
	reviews['stopword_body'] = cb_temp
	# calculate_time(start)
	# get the corpus of all reviews, lists of all words with label
	'''--------------------------------------------------------'''
	cop_wl = []
	for b in cb_temp:
		# change the unicode data to the raw string
		# cop_wl += [(unicodedata.normalize('NFKD', w[0]).encode('utf-8','replace'), w[1]) for w in b if type(w[0])==unicode]
		cop_wl += b
	'''--------------------------------------------------------'''
	# calculate_time(start)
	# word frequency of the corpus with label
	wfq = nltk.FreqDist(cop_wl)
	# calculate_time(start)
	# get the word list of all reviews without label
	cop = [w[0] for w in cop_wl]
	cop = set(cop)
	cop_len = len(cop)
	# calculate_time(start)
	# get freq of all words in one list
	wfq_l = []
	for w in cop:
		for i in range(1, 6):
			wfq_l.append(wfq[(w, i)])

	# calculate_time(start)
	# reshape the list to a matrix
	wfq_mx = DataFrame(np.array(wfq_l).reshape((cop_len,5)), index=pd.Index(cop), columns=pd.Index([1,2,3,4,5]))
	# calculate_time(start)
	# calculate the prob of each rating
	w_s = []
	w_sum = []
	for i, r in wfq_mx.iterrows():
		word_sum = wfq_mx.loc[i].sum()
		# wfq_mx.loc[i] = wfq_mx.loc[i]/word_sum
		w_s.append(word_useful_score(list(wfq_mx.loc[i]), word_sum))
		w_sum.append(word_sum)

	wfq_mx['score'] = w_s
	wfq_mx['sum'] = w_sum
	wfq_mx = wfq_mx.sort_values('sum').iloc[-int(len(w_s) * threshold):, :]
	print(wfq_mx)
	wfq_mx.to_csv(file_name.split('.')[0] + suffix + '.' + file_name.split('.')[1], sep='\t')
Example #27
    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=['foo', 'bar', 'baz', 'qux'])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df.append(dicts, ignore_index=True)
        expected = df.append(df, ignore_index=True)
        assert_frame_equal(result, expected)

        # different columns
        dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
                 {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
        result = df.append(dicts, ignore_index=True, sort=True)
        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        assert_frame_equal(result, expected)
Example #28
def train(self, training_data):
    preds = DataFrame(training_data['prediction'])
    preds['actual'] = training_data['actual']
    pred_cols = len(training_data['prediction'].keys())
    rows = []
    for row in preds.iterrows():
        index, data = row
        # True where this predictor matched the actual outcome.
        rows.append(data[list(range(pred_cols))] == data['actual'])
    results = DataFrame(rows)
    for k in preds[list(range(pred_cols))]:
        self.weights[k] = 1 / variance(1 - results[k])
    # If we have infinite weights, make them 2x the sum of the others
    if any(x == inf for x in self.weights.values()):
        tot_weight = sum(x for x in self.weights.values() if x != inf)
        for wk in self.weights:
            if self.weights[wk] == inf:
                self.weights[wk] = 2 * tot_weight
Example #29
 def request_player_info_1(self, response):
     r_json = json.loads(response.body_as_unicode())
     result_set = r_json[u'resultSets'][0]
     df = DataFrame(data=result_set[u'rowSet'], columns=result_set[u'headers']).set_index('PERSON_ID')
     for id_, data in df.iterrows():
         p = PlayerItem()
         p['nba_player_id'] = id_
         p['nba_player_code'] = data['PLAYERCODE']
         p['is_active'] = bool(data['ROSTERSTATUS'])
         yield FormRequest(
             url = 'http://stats.nba.com/stats/commonplayerinfo/',
             method = 'GET',
             formdata = {'PlayerID': str(id_)},
             meta = dict(player=p),
             callback = self.request_player_info_2
         )
Example #30
    def test_mixed_index_at_iat_loc_iloc_dataframe(self):
        # GH 19860
        df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]],
                       columns=['a', 'b', 'c', 1, 2])
        for rowIdx, row in df.iterrows():
            for el, item in row.iteritems():
                assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item

        for row in range(2):
            for i in range(5):
                assert df.iat[row, i] == df.iloc[row, i] == row * 5 + i

        with pytest.raises(KeyError):
            df.at[0, 3]
        with pytest.raises(KeyError):
            df.loc[0, 3]