def send_to_db(self):
    conn = sqlite3.connect('data2.sqlite', timeout=30)
    c = conn.cursor()
    df = DataFrame(self.__dict__.items(), index=self.__dict__.keys())
    df = df.drop(0, 1)
    df = df.transpose()
    df = df.sort(axis=1)
    df.to_sql('earnings_calendar', conn, if_exists='append', index=False)
def test():
    """DataFrame editor test"""
    from numpy import nan
    df1 = DataFrame([[True, "bool"], [1+1j, "complex"], ['test', "string"],
                     [1.11, "float"], [1, "int"],
                     [np.random.rand(3, 3), "Unknown type"],
                     ["Large value", 100], ["áéí", "unicode"]],
                    index=['a', 'b', nan, nan, nan, 'c', "Test global max", 'd'],
                    columns=[nan, 'Type'])
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(df1.iloc[0])
    print("out:", out)
    df1 = DataFrame(np.random.rand(100001, 10))
    # Sorting large DataFrame takes time
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(TimeSeries(np.arange(10)))
    print("out:", out)
    return out
def calculate_top_10_solutions(self):
    '''calculate all schemes and select the top 10 solutions'''
    columns = ['name', 'rate', 'money']
    if isfile(learning_progres_csv):
        scheme_profit = read_csv(learning_progres_csv)
    else:
        scheme_profit = DataFrame(columns=columns)
    scheme_profit.set_index('name', inplace=True)
    with open(learning_progres_csv, 'w+') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=',', fieldnames=columns)
        writer.writeheader()
        csvfile.flush()
        for sc in self.generate_all_schemes():
            if sc.name not in scheme_profit.index:
                e = evaluator(sc)
                scheme_profit.ix[sc.name] = rate, money = e.calculate()
                writer.writerow({'name': sc.name, 'rate': rate, 'money': money})
                csvfile.flush()
                if self.log:
                    print(sc.name + ' - ' + str(money) + ' \t rate = ' + str(rate))
            else:
                writer.writerow({'name': sc.name,
                                 'rate': scheme_profit.rate[sc.name],
                                 'money': scheme_profit.money[sc.name]})
                if self.log:
                    print(sc.name + ' - ' + str(scheme_profit.money[sc.name]) +
                          ' \t rate = ' + str(scheme_profit.rate[sc.name]))
                csvfile.flush()
    # TODO: write into scheme
    scheme_profit = scheme_profit.sort(['money'], ascending=False)
    return scheme_profit[:10].to_dict()
def test():
    """DataFrame editor test"""
    from numpy import nan
    from pandas.util.testing import assert_frame_equal, assert_series_equal
    df1 = DataFrame(
        [[True, "bool"], [1 + 1j, "complex"], ['test', "string"],
         [1.11, "float"], [1, "int"], [np.random.rand(3, 3), "Unknown type"],
         ["Large value", 100], ["áéí", "unicode"]],
        index=['a', 'b', nan, nan, nan, 'c', "Test global max", 'd'],
        columns=[nan, 'Type'])
    out = test_edit(df1)
    assert_frame_equal(df1, out)

    result = Series([True, "bool"], index=[nan, 'Type'], name='a')
    out = test_edit(df1.iloc[0])
    assert_series_equal(result, out)

    # Sorting large DataFrame takes time
    df1 = DataFrame(np.random.rand(100100, 10))
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    assert_frame_equal(out, df1)

    series = Series(np.arange(10), name=0)
    out = test_edit(series)
    assert_series_equal(series, out)
def analyze():
    data = DataFrame()
    for architecture in performer.ARCHITECTURES:
        performer.initialize('test', architecture, 8, 112)
        print performer.DATASET_TEST
        data1 = read_csv(performer.DATASET_TEST, sep='\t')
        data = concat([data, data1], ignore_index=True)
    data.sort('benchmark', inplace=True)
    print 'analyze:', data.columns.values
    # results = data.ix[data.groupby(['architecture', 'benchmark'])['latency'].idxmin()]
    results = data
    results['graph'] = [performer.string_to_graph(t) for t in results['topology']]
    results['average_hop_count'] = [average_shortest_path_length(g) for g in results['graph']]
    results['link_lengths'] = [get_edge_attributes(g, 'length').values() for g in results['graph']]
    mask = (results['architecture'] == 'small_world') | (results['architecture'] == 'optimum')
    print data[mask][['architecture', 'benchmark', 'latency']]
    performer.plot_figures(results)
    # for normalized_attribute, attribute in zip(performer.NORMALIZED_ATTRIBUTES, performer.ATTRIBUTES):
    #     normalized_values = []
    #     for index, row in results.iterrows():
    #         mesh_index = (results['architecture'] == 'mesh') & (results['benchmark'] == row['benchmark'])
    #         normalized_values.append(row[attribute] / squeeze(results[mesh_index][attribute]))
    #     results[normalized_attribute] = normalized_values
    return
def __parallel_evaluation(self, X):
    from mpi4py import MPI
    from pandas import DataFrame
    comm = MPI.COMM_WORLD
    n_procs, _ = X.shape

    # Spawning processes to test kriging mixture
    comm = MPI.COMM_SELF.Spawn(sys.executable, args=['ego_evaluation.py'],
                               maxprocs=n_procs)

    # scatter the models and data
    comm.bcast(self.fitness, root=MPI.ROOT)
    comm.scatter([(k, X[k, :]) for k in range(n_procs)], root=MPI.ROOT)

    # Synchronization while the child processes are performing
    # heavy computations...
    comm.Barrier()

    # Gather the fitted model from the child processes
    # Note that 'None' is only valid in master-slave working mode
    result = comm.gather(None, root=MPI.ROOT)
    comm.Disconnect()

    # register the measures
    data = DataFrame([[d['index'], d['y']] for d in result],
                     columns=['index', 'y'])
    data.sort('index', inplace=True)

    return array(data['y'])
def test_sort(self):
    frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                      columns=["A", "B", "C", "D"])

    # 9816 deprecated
    with tm.assert_produces_warning(FutureWarning):
        frame.sort(columns="A")
    with tm.assert_produces_warning(FutureWarning):
        frame.sort()
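# Context for the test above (not part of the original snippet): since pandas
# 0.17 the deprecated DataFrame.sort() is split into sort_values() and
# sort_index(), and it was removed entirely in later releases. A minimal
# sketch of the equivalent non-deprecated calls, assuming a modern pandas:
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                     columns=["A", "B", "C", "D"])
by_column = frame.sort_values(by="A")   # was frame.sort(columns="A")
by_index = frame.sort_index()           # was frame.sort()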
def transform_dataframe(self, df):
    """
    Use matplotlib to compute boxplot statistics on timeseries data.
    """
    from pandas import DataFrame
    group = self.get_grouping(len(df.columns))
    serializer = self.get_serializer()
    value_col = serializer.value_field
    series_col = serializer.key_fields[0]
    param_cols = serializer.parameter_fields
    ncols = 1 + len(param_cols)

    if "index" in group:
        # Separate stats for each column in dataset
        groups = {col: df[col] for col in df.columns}
    else:
        # Stats for entire dataset
        df = df.stack().stack().stack()
        df.reset_index(inplace=True)
        index = serializer.get_index(df)
        df.set_index(index[0], inplace=True)
        groups = {(value_col, ) + ('all', ) * ncols: df.value}

    # Compute stats for each column, potentially grouped by year
    all_stats = []
    for g, series in groups.items():
        if g[0] != serializer.value_field:
            continue
        series_info = g[-1]
        param_info = list(reversed(g[1:-1]))
        if "year" in group or "month" in group:
            groupby = "year" if "year" in group else "month"
            dstats = self.compute_boxplots(series, groupby)
            for s in dstats:
                s[series_col] = series_info
                for pname, pval in zip(param_cols, param_info):
                    s[pname] = pval
        else:
            stats = self.compute_boxplot(series)
            stats[series_col] = series_info
            for pname, pval in zip(param_cols, param_info):
                stats[pname] = pval
            dstats = [stats]
        all_stats += dstats

    df = DataFrame(all_stats)
    index = [series_col] + param_cols
    if "year" in group:
        index = ['year'] + index
    elif "month" in group:
        index = ['month'] + index
    df.sort(index, inplace=True)
    df.set_index(index, inplace=True)
    df.columns.name = ""
    df = df.unstack().unstack()
    if "year" in group or "month" in group:
        df = df.unstack()
    return df
def get_runs_df(self):
    """
    Returns the whole table as a dataframe, sorted with the most recent entry
    at the bottom (ascending order).
    """
    df = DataFrame([{k: v for k, v in r.items()} for r in self.table.scan()])
    if df.empty:
        return df
    else:
        df.sort(columns=['dt'], ascending=True, inplace=True)
        # force df to have columns in this order
        return df[['dt', 'start', 'end']]
def filter_tags(tag_pickle='results/material_tags.pickle',
                exclude_tags='results/exclude.csv', n=50):
    exclude_words, duplicate_sets = load_filter_tags(exclude_tags)
    with open(tag_pickle, 'r') as f:
        t = DataFrame(pickle.load(f)['result']).set_index('_id')
    for setn in duplicate_sets:
        t.ix[setn[0]] += sum(map(lambda x: t.ix[x], setn[1:]))
        for tag in setn[1:]:
            t.drop(tag, inplace=True)
    for tag in exclude_words:
        t.drop(tag, inplace=True)
    t = t.sort(ascending=False)
    return t[:n].index
def calculate_accuracy_by_category(y_test, predicted):
    """Calculates the accuracy for each category. This is used for the
    outcome of a classifier.

    Parameters:
        y_test (array): the true labels
        predicted (array): the predicted values

    Returns:
        A series with the accuracy percentage for each category"""
    df = DataFrame({'Target': y_test, 'Predicted': predicted})
    df['Score'] = df.Target == df.Predicted
    df = df.groupby('Target').apply(lambda x: 100.0 * sum(x.Score) / len(x))
    df.sort()
    return df
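# Illustrative only: a hypothetical call to the helper above, with made-up
# labels, assuming the pandas imports used elsewhere in this module.
y_test = ['cat', 'dog', 'cat', 'bird', 'dog']
predicted = ['cat', 'dog', 'dog', 'bird', 'dog']
print(calculate_accuracy_by_category(y_test, predicted))
# One accuracy percentage per target label, e.g. 50.0 for 'cat'
# (1 of 2 correct) and 100.0 for 'bird' and 'dog'.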
def test_outer_join():
    left = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])
    right = [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
    rsym = symbol('rsym', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (4, 'Dennis', 400, 'Moscow')]))

    assert (convert(compute(join(lsym, rsym, how='left'),
                            {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (2, 'Bob', 200, np.nan),
                 (4, 'Dennis', 400, 'Moscow')]))

    df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])
    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)

    df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (2., 'Bob', 200., np.nan),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])
    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)
def test_outer_join():
    left = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])
    right = [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert convert(compute(join(L, R), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    assert convert(compute(join(L, R, how='left'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, np.nan),
         (4, 'Dennis', 400, 'Moscow')])

    df = compute(join(L, R, how='right'), {L: left, R: right})
    expected = DataFrame(
        [(1., 'Alice', 100., 'NYC'),
         (1., 'Alice', 100., 'Boston'),
         (3., np.nan, np.nan, 'LA'),
         (4., 'Dennis', 400., 'Moscow')],
        columns=['id', 'name', 'amount', 'city'])
    assert str(df.sort('id').to_records(index=False)) == \
        str(expected.sort('id').to_records(index=False))

    df = compute(join(L, R, how='outer'), {L: left, R: right})
    expected = DataFrame(
        [(1., 'Alice', 100., 'NYC'),
         (1., 'Alice', 100., 'Boston'),
         (2., 'Bob', 200., np.nan),
         (3., np.nan, np.nan, 'LA'),
         (4., 'Dennis', 400., 'Moscow')],
        columns=['id', 'name', 'amount', 'city'])
    assert str(df.sort('id').to_records(index=False)) == \
        str(expected.sort('id').to_records(index=False))
def plot_bic_ranks(df, group_by, analysis_col, percentage=True, **kwargs):
    COL_NAMES = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth', 'Seventh']
    bic_df = _create_bic_df(df, group_by, analysis_col, **kwargs)
    rank = bic_df.rank(axis=1).filter(regex='_BIC$')
    bic_cols = rank.filter(regex='_BIC$').columns
    rank_counts = {col: rank[col].value_counts() for col in bic_cols}
    rank_counts = DataFrame(rank_counts).transpose().fillna(value=0)
    rank_counts.columns = COL_NAMES[:len(rank_counts)]
    rank_counts = rank_counts.sort(columns=COL_NAMES[:len(rank_counts)],
                                   ascending=False)
    if percentage:
        rank_counts = (rank_counts / rank_counts.sum()) * 100
        ylabel = 'Percentage'
    else:
        ylabel = 'Count'
    ax = rank_counts['First'].plot(kind='bar',
                                   title='Distribution BIC First Place',
                                   rot=-30)
    ax.set_ylabel(ylabel)
    plt.show()
    ax = rank_counts.plot(kind='bar', title='Distribution BIC Ranks', rot=-30)
    ax.set_ylabel(ylabel)
    return rank_counts
def json_to_df(json_file):
    page = ''
    pic = ''
    text = ''
    point = ''
    point_list = []
    with open(json_file) as data_file:
        data = json.load(data_file)
    for i in range(0, len(data)):
        if data[i]['type'] == 'PageTurn':
            page = data[i]
        elif data[i]['type'] == 'Picture':
            pic = data[i]
        elif data[i]['type'] == 'Text':
            text = data[i]
        elif data[i]['type'] == 'SampleGaze':
            point = data[i]
            point.update(page)
            point.update(pic)
            point.update(text)
            point['type'] = u'SampleGaze'
            point_list.append(point)
        elif data[i]['type'] == 'SampleFixation':
            point = data[i]
            point.update(page)
            point.update(pic)
            point.update(text)
            point['type'] = u'SampleFixation'
            point_list.append(point)
    df = DataFrame(point_list)
    start_time = df['timestamp'].min()
    df['timestamp'] = df['timestamp'] - start_time
    df = df.sort('timestamp')
    return df
def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
    Really just does a foreach with each being dfs in a panel.
    """
    d = {}
    for key, df in self.items():
        d[key] = func(df, *args, **kwargs)

    container = PanelDict
    for key, result in list(d.items()):
        if isinstance(result, Series):
            container = DataFrame
            break
        if isinstance(result, DataFrame):
            container = Panel
            break

    index = []
    for key, result in list(d.items()):
        if not isinstance(result, (DataFrame, Series)):
            continue
        result.name = key
        ind = result.index
        index = set(index).union(ind)

    if force_dict:
        return PanelDict(d)

    res = DataFrame(None, index=index)
    for key, result in list(d.items()):
        res = res.join(result)
    res = res.sort()
    return res
def predict(self, tree):
    """
    TODO Should take an array and predict every item. A score can be stored.
    It would follow the guidelines set by scikit-learn.
    """
    tree_rules = self.extract_rules(tree)
    df = DataFrame(columns=['label', 'prob'])
    gb = self.posteriori.groupby('label')
    for key, indexes in gb.groups.items():
        apriori_prob = self.apriori[self.apriori.label == key]['freq'].values[0]
        prob = apriori_prob
        group_df, missing_prob = self.apply_smoothing(self.posteriori.ix[indexes], tree_rules)
        for rule in tree_rules:
            prob_evidence = group_df[group_df.rule == rule]['freq']
            if len(prob_evidence) == 0:
                prob_evidence = missing_prob
            else:
                prob_evidence = prob_evidence.values[0]
            prob *= prob_evidence
        post = DataFrame({'label': [key], 'prob': [prob]})
        df = df.append(post)
    df.index = np.arange(df.index.size)
    df = df.sort(columns='prob', ascending=False)
    return df.ix[df['prob'].idxmax()]
def homer_to_narrow_peaks(self, data, output_file):
    '''
    Given a Homer peak dataframe, extract necessary columns and convert to a
    narrowPeak file.

    From the IDR package description:

    NarrowPeak files are in BED6+4 format. It consists of 10 tab-delimited columns:

    1. chrom        string  Name of the chromosome
    2. chromStart   int     The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
    3. chromEnd     int     The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
    4. name         string  Name given to a region (preferably unique). Use '.' if no name is assigned.
    5. score        int     Indicates how dark the peak will be displayed in the browser (1-1000). If '0', the DCC will assign this based on signal value. Ideally average signalValue per base spread between 100-1000.
    6. strand       char    +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
    7. signalValue  float   Measurement of overall (usually, average) enrichment for the region.
    8. pValue       float   Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
    9. qValue       float   Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
    10. peak        int     Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
    '''
    columns = OrderedDict((
        ('chrom', self.get_first_column(data, ['chr', 'chrom', 'chromosome'])),
        ('chromStart', self.get_first_column(data, ['chromStart', 'start'])),
        ('chromEnd', self.get_first_column(data, ['chromEnd', 'end'])),
        ('name', self.get_first_column(data, ['#PeakID', 'PeakID', 'ID', 'name'])),
        ('score', Series([0] * data.shape[0])),  # Leave zero so that signalValue column is used
        ('strand', self.get_first_column(data, ['strand'])),
        ('signalValue', self.get_first_column(data, self.tag_count_columns)),
        ('pValue', -np.log10(self.get_first_column(data, self.p_value_columns))),
        ('qValue', Series([-1] * data.shape[0])),  # Leave -1 as no individual FDR is called for each peak
        ('peak', Series([-1] * data.shape[0])),  # Leave -1 as no point-source is called for each peak
    ))
    df = DataFrame(columns)
    df = df.sort(['signalValue', 'pValue'], ascending=False)
    df.to_csv(output_file, sep='\t', header=False, index=False)
def _search_by_inchi_fuzzy(self, inchi):
    # TODO: use openbabel if available
    matches = difflib.get_close_matches(inchi, self.data_frame.InChI.dropna(),
                                        n=5, cutoff=.8)
    ranks = dict([(match, i) for i, match in enumerate(matches)])
    selection = DataFrame(self.data_frame[self.data_frame.InChI.isin(matches)])
    selection['search_rank'] = selection.name.map(ranks)
    return selection.sort('search_rank')
def upsert_unique_indices(apps, schema_editor):
    datapoint_values_list = ['id', 'created_at', 'indicator_id', 'location_id',
                             'campaign_id', 'data_date']
    historical_dps = DataFrame(
        list(DataPoint.objects.filter(unique_index=-1)
             .values_list('id', 'created_at', 'indicator_id', 'location_id',
                          'campaign_id', 'data_date')),
        columns=datapoint_values_list)

    # create the unique index
    historical_dps = historical_dps.apply(add_unique_index, axis=1)

    # group by and max on created at, get the most recent upload
    historical_dps = historical_dps.sort("created_at", ascending=False)\
        .groupby("unique_index", as_index=False).first()

    # get the ids into a list and select them
    dps_to_update = DataPoint.objects.filter(id__in=list(historical_dps['id']))
    print 'dps to update'
    print len(dps_to_update)

    # then run a query and update each
    for dp in dps_to_update:
        unique_index = historical_dps[historical_dps['id'] == dp.id].iloc[0]['unique_index']
        dp.unique_index = unique_index
        dp.save()

    # delete all the other duplicates
    dps_to_delete = DataPoint.objects.filter(unique_index=-1)
    print 'dps_to_delete'
    print len(dps_to_delete)
    dps_to_delete.delete()
def sorted_plot(f, set1, k):
    temp_avg = np.array([])
    for i in range(len(set1[f].unique())):
        temp_avg = np.append(
            temp_avg,
            np.mean(set1['click'][set1[f] == (set1[f].unique())[i]]))
    if (k == 2):
        f1 = figure()
        plt.plot(range(len(set1[f].unique())), temp_avg, 'bo',
                 range(len(set1[f].unique())), temp_avg, 'k')
        plt.grid()
        plt.show()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    df_temp = DataFrame({f: set1[f].unique(), 'Avg_click': temp_avg})
    df_temp = df_temp.sort(columns='Avg_click')
    plt.plot(range(len(set1[f].unique())), df_temp['Avg_click'], 'bo',
             range(len(set1[f].unique())), df_temp['Avg_click'], 'k')
    for x, y in zip(range(len(set1[f].unique())), df_temp['Avg_click']):  # <--
        # ax.annotate('(%s, %s)' % xy, xy=xy, textcoords='offset points')
        plt.text(x - 0.004, y + 0.007, df_temp[f][x], fontsize=12)
    plt.grid()
    plt.show()
    print 'total count =', len(df_temp)
    print 'maximum_value =', df_temp['Avg_click'].max(), 'at', df_temp[f][
        df_temp['Avg_click'] == df_temp['Avg_click'].max()]
    print 'minimum_value =', df_temp['Avg_click'].min(), 'at', df_temp[f][
        df_temp['Avg_click'] == df_temp['Avg_click'].min()]
    print 'number of zeroes =', len(df_temp[df_temp['Avg_click'] == 0])
    print 'number of ones =', len(df_temp[df_temp['Avg_click'] == 1])
    print 'Distribution:(including the starting value)'
    for i in np.arange(0, 1.1, 0.1):
        print i, '<->', (i + 0.1), ' = ', len(
            df_temp[(df_temp['Avg_click'] >= i) & (df_temp['Avg_click'] < (i + 0.1))])
def analyze():
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()
    print("got %d signals from %d devices" % (len(signals), len(devices)))

    signals = signals.groupby(["frequency", "id"]).size()
    signals = signals.reindex(MultiIndex.from_product([SPECTRUM, devices],
                                                      names=signals.index.names),
                              fill_value=0)
    signals = signals.unstack("id")

    # let's only keep frequencies with all signals present
    candidates = signals.dropna()

    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total": candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    appropriate_freq = candidates.sort(["weakest", "total"], ascending=False).index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")

    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter
    p = signals.plot(kind="area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
def adjust(data, adjustments: pd.DataFrame):
    """
    IMPORTANT !!! This method supports single index df

    :param data: dataframe with data.
    :param adjustments: adjustments in the form of
        [(date, split_factor/dividend_amount, 'split'/'dividend'), ...]
    :return: adjusted data
    """
    # apply the most recent adjustments first (the date is index level 0)
    adjustments = adjustments.sort_index(ascending=False)
    for (_, row) in adjustments.iterrows():
        if row.name[2] == 'split':
            adjust_split(data=data, split_date=row.name[0], split_factor=row[0])
        elif row.name[2] == 'dividend':
            adjust_dividend(data=data, dividend_date=row.name[0], dividend_amount=row[0])

    return data
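# Illustrative only: one way to build the `adjustments` frame the function
# above iterates over. The (date, symbol, type) index layout is an assumption
# inferred from row.name[0] and row.name[2]; the middle level is never read,
# and the single value column holds the split factor or dividend amount.
import pandas as pd

adjustments = pd.DataFrame(
    {'value': [2.0, 0.5]},
    index=pd.MultiIndex.from_tuples(
        [(pd.Timestamp('2014-06-09'), 'XYZ', 'split'),
         (pd.Timestamp('2014-08-07'), 'XYZ', 'dividend')],
        names=['date', 'symbol', 'type']))
# adjust(data, adjustments) would then walk these rows, newest first.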
def Main():
    import matplotlib.pyplot as pl
    companies = ['AAPL', 'ADSK', 'GOOG', 'MSFT', 'AUY', 'TWTR', 'YHOO', 'CAT',
                 'GE', 'CSCO', 'F']
    url = ('http://finance.yahoo.com/d/quotes.csv?s=' + '+'.join(companies) +
           '&f=nabp')
    response = urllib2.urlopen(url)
    data = list(csv.reader(response))
    columns = ['Name', 'Pricing - Ask', 'Pricing - Bid', 'Previous close']
    data = DataFrame(data, columns=columns)
    data = data.replace(['N/A'], [0])
    data['Previous close'] = data['Previous close'].astype(float)
    data = data.sort(columns=['Previous close'], ascending=False, axis=0)
    pl.plot(data['Pricing - Ask'], label='Pricing Ask')
    pl.plot(data['Pricing - Bid'], label='Pricing Bid')
    pl.plot(data['Previous close'], label='Previous Close')
    pl.title("Stock values for some companies (Sorted by Ask)")
    pl.xticks(np.arange(len(data['Name'])), data['Name'].tolist(), rotation=45)
    pl.legend()
    pl.tight_layout()
    apis_helpers.save_fig(pl, 'yahoo-finance', 'basic')
    pl.close()
def test_sort_values(self):
    # API for 9816

    # sort_index
    frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                      columns=['A', 'B', 'C', 'D'])

    # 9816 deprecated
    with tm.assert_produces_warning(FutureWarning):
        frame.sort(columns='A')
    with tm.assert_produces_warning(FutureWarning):
        frame.sort()

    unordered = frame.ix[[3, 2, 4, 1]]
    expected = unordered.sort_index()
    result = unordered.sort_index(axis=0)
    assert_frame_equal(result, expected)

    unordered = frame.ix[:, [2, 1, 3, 0]]
    expected = unordered.sort_index(axis=1)
    result = unordered.sort_index(axis=1)
    assert_frame_equal(result, expected)
    assert_frame_equal(result, expected)

    # sortlevel
    mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
    df = DataFrame([[1, 2], [3, 4]], mi)
    result = df.sort_index(level='A', sort_remaining=False)
    expected = df.sortlevel('A', sort_remaining=False)
    assert_frame_equal(result, expected)

    df = df.T
    result = df.sort_index(level='A', axis=1, sort_remaining=False)
    expected = df.sortlevel('A', axis=1, sort_remaining=False)
    assert_frame_equal(result, expected)

    # MI sort, but no by
    mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
    df = DataFrame([[1, 2], [3, 4]], mi)
    result = df.sort_index(sort_remaining=False)
    expected = df.sort_index()
    assert_frame_equal(result, expected)
def estrutura_contagem(contagem_votos, contagem_votacoes):
    resultado = DataFrame({"bancada": list(contagem_votos.keys()),
                           "votos_com_governo": list(contagem_votos.values()),
                           "votacoes": list(contagem_votacoes.values())})
    # compute party loyalty (number of pro-government votes divided by the number of valid roll calls)
    resultado["fidelidade"] = 100 * resultado["votos_com_governo"] / resultado["votacoes"]
    # drop the government itself from the dataframe
    resultado = resultado[resultado.bancada != "GOV."]
    resultado = resultado.sort("fidelidade", ascending=False)
    return resultado
def __init__(self, column, baseline, adjustments=None):
    self.column = column
    self.baseline = baseline.values
    self.dates = baseline.index
    self.assets = baseline.columns

    if adjustments is None:
        adjustments = DataFrame(index=DatetimeIndex([]),
                                columns=ADJUSTMENT_COLUMNS)
    else:
        # Ensure that columns are in the correct order.
        adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
        adjustments.sort(["apply_date", "sid"], inplace=True)

    self.adjustments = adjustments
    self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
    self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
    self.adjustment_sids = Int64Index(adjustments.sid)
def merge_variables(self, to_merge):
    """Merges time series variables into new time series variables.

    :param to_merge: dictionary mapping new variable name to list of variables to be merged.
    :return:
    """
    dold = self._data.copy()
    s = Series(data=np.zeros((dold.shape[0],)), index=dold.index).replace(0, np.nan)
    dnew = DataFrame(dict([
        (k, s) for k in to_merge.keys()
        if len(set(to_merge[k]).intersection(dold.columns)) > 0
    ]))
    for newvar in dnew.columns:
        for oldvar in to_merge[newvar]:
            if oldvar in dold.columns:
                dnew[newvar][dold[oldvar].notnull()] = dold[oldvar][dold[oldvar].notnull()]
                del dold[oldvar]
    dnew = dnew.join(dold, how='outer')
    dnew.sort(axis=1, inplace=True)
    dnew.sort(axis=0, inplace=True)
    self._data = dnew
def word_freq(file_name, suffix='_wordfreq', sep='\t', threshold=.5):
    print "start word_freq"
    # start = datetime.datetime.now()
    # print start
    reviews = pd.read_csv(file_name, error_bad_lines=False, sep=sep)
    cb = reviews['stopword_body']
    rate = reviews['Rating']

    # label all words with the rating
    cb_temp = []
    for i, c in enumerate(cb):
        cb_temp.append([(w, rate[i]) for w in ast.literal_eval(c)])
    reviews['stopword_body'] = cb_temp
    # calculate_time(start)

    # get the corpus of all reviews, lists of all words with label
    '''--------------------------------------------------------'''
    cop_wl = []
    for b in cb_temp:
        # change the unicode data to the raw string
        # cop_wl += [(unicodedata.normalize('NFKD', w[0]).encode('utf-8', 'replace'), w[1]) for w in b if type(w[0]) == unicode]
        cop_wl += b
    '''--------------------------------------------------------'''
    # calculate_time(start)

    # word frequency of the corpus with label
    wfq = nltk.FreqDist(cop_wl)
    # calculate_time(start)

    # get the word list of all reviews without label
    cop = [w[0] for w in cop_wl]
    cop = set(cop)
    cop_len = len(cop)
    # calculate_time(start)

    # get freq of all words in one list
    wfq_l = []
    for w in cop:
        for i in range(1, 6):
            wfq_l.append(wfq[(w, i)])
    # calculate_time(start)

    # reshape the list to a matrix
    wfq_mx = DataFrame(np.array(wfq_l).reshape((cop_len, 5)),
                       index=pd.Index(cop),
                       columns=pd.Index([1, 2, 3, 4, 5]))
    # calculate_time(start)

    # calculate the prob of each rating
    w_s = []
    w_sum = []
    for i, r in wfq_mx.iterrows():
        word_sum = wfq_mx.ix[i].sum()
        # wfq_mx.ix[i] = wfq_mx.ix[i]/word_sum
        w_s.append(word_useful_score(list(wfq_mx.ix[i]), word_sum))
        w_sum.append(word_sum)
    wfq_mx['score'] = w_s
    wfq_mx['sum'] = w_sum
    wfq_mx = wfq_mx.sort(columns='sum').ix[-int(len(w_s) * threshold):, :]
    print wfq_mx
    wfq_mx.to_csv(file_name.split('.')[0] + suffix + '.' + file_name.split('.')[1], sep='\t')
def set2df(sets, column_names, index=None, sort=True):
    df = DataFrame(list(sets), columns=column_names, index=index)
    if sort:
        df = df.sort(column_names)
    if index:
        df.index = index
    else:
        df.index = range(len(df))
    return df
def _parse_fits(filepath):
    """Parses a GOES FITS file from http://umbra.nascom.nasa.gov/goes/fits/"""
    fits = pyfits.open(filepath)
    header = fits[0].header
    if len(fits) == 4:
        if is_time_in_given_format(fits[0].header['DATE-OBS'], '%d/%m/%Y'):
            start_time = datetime.datetime.strptime(fits[0].header['DATE-OBS'], '%d/%m/%Y')
        elif is_time_in_given_format(fits[0].header['DATE-OBS'], '%d/%m/%y'):
            start_time = datetime.datetime.strptime(fits[0].header['DATE-OBS'], '%d/%m/%y')
        else:
            raise ValueError("Date not recognized")
        xrsb = fits[2].data['FLUX'][0][:, 0]
        xrsa = fits[2].data['FLUX'][0][:, 1]
        seconds_from_start = fits[2].data['TIME'][0]
    elif 1 <= len(fits) <= 3:
        start_time = parse_time(header['TIMEZERO'])
        seconds_from_start = fits[0].data[0]
        xrsb = fits[0].data[1]
        xrsa = fits[0].data[2]
    else:
        raise ValueError("Don't know how to parse this file")

    times = [start_time + datetime.timedelta(seconds=int(floor(s)),
                                             microseconds=int((s - floor(s)) * 1e6))
             for s in seconds_from_start]

    # remove bad values as defined in header comments
    xrsb[xrsb == -99999] = nan
    xrsa[xrsa == -99999] = nan

    # fix byte ordering
    newxrsa = xrsa.byteswap().newbyteorder()
    newxrsb = xrsb.byteswap().newbyteorder()

    data = DataFrame({'xrsa': newxrsa, 'xrsb': newxrsb}, index=times)
    data.sort(inplace=True)

    return header, data
def compile_predictions(pred):
    '''
    groups predictions made on patches of an image into a set of labels and confidences

    Args:
        pred (array-like): output from call to [some sklearn model].predict

    Returns:
        DataFrame: compiled predictions
    '''
    data = DataFrame()
    data['yhat'] = pred
    data['confidence'] = 1.0
    data = data.groupby('yhat').agg(lambda x: x.sum() / data.shape[0])
    data.sort('confidence', ascending=False, inplace=True)
    data['label'] = data.index
    data.reset_index(drop=True, inplace=True)
    return data
def _search_by_name_fuzzy(self, name):
    matches = difflib.get_close_matches(name, self.data_frame.name.dropna(),
                                        n=5, cutoff=.8)
    ranks = dict([(match, i) for i, match in enumerate(matches)])
    selection = DataFrame(self.data_frame[self.data_frame.name.isin(matches)])
    selection['search_rank'] = selection.name.map(ranks)
    return selection.sort('search_rank')
class WaferRun:
    def __init__(self, run_id, wafer_id, label, measurements):
        self.run_id = int(run_id)
        self.wafer_id = int(wafer_id)
        self.label = int(label)
        self.measurements = DataFrame(measurements)
        self.measurements.sort(axis=1, inplace=True)
        self.measurements.sort_index(inplace=True)

    @staticmethod
    def from_files(path, run_id, wafer_id):
        fn_base = os.path.join(path, '{0}_{1:02}'.format(run_id, wafer_id))
        try:
            df = DataFrame({
                11: DataFrame.from_csv(fn_base + '.11', header=None, sep='\t',
                                       index_col=None, parse_dates=False)[1],
                12: DataFrame.from_csv(fn_base + '.12', header=None, sep='\t',
                                       index_col=None, parse_dates=False)[1],
                15: DataFrame.from_csv(fn_base + '.15', header=None, sep='\t',
                                       index_col=None, parse_dates=False)[1],
                6: DataFrame.from_csv(fn_base + '.6', header=None, sep='\t',
                                      index_col=None, parse_dates=False)[1],
                7: DataFrame.from_csv(fn_base + '.7', header=None, sep='\t',
                                      index_col=None, parse_dates=False)[1],
                8: DataFrame.from_csv(fn_base + '.8', header=None, sep='\t',
                                      index_col=None, parse_dates=False)[1]})
        except:
            return None
        m = re.search('/(normal|abnormal)', path)
        if m is None:
            return None
        label = 1 if m.group(1) == 'abnormal' else -1
        return WaferRun(run_id, wafer_id, label, df)

    def as_nparray(self):
        """Spits out data as a T x D numpy.array (T=# samples, D=# variables)

        Notes:
            Notice what we do here: we start with a pandas.DataFrame where each
            channel is a column (so you can think of it as a T x D matrix). We
            first rename the columns to channel numbers, then sort the columns,
            then sort the index, then transform to numpy.array.
        """
        return self.measurements.sort(axis=1).sort_index().reset_index().as_matrix().astype(float)
def ip_requests(log_list, count_dict):
    '''
    Pandas applications
    '''
    df = DataFrame(log_list)
    ips = df.groupby('clientip').size()
    ips.sort()
    ips_fd = DataFrame({'Number of requests': ips[-10:]})
    ips_fd = ips_fd.sort(columns='Number of requests', ascending=False)
    count_dict['ips_fd'] = ips_fd
    return count_dict
def list_stock_info(self, stock_list):
    keys = stock_list.keys()
    data = DataFrame(self.foundmental_data, index=keys)
    data = data.sort(columns='earn_ratio', ascending=False)
    for code in data.index:
        gross_profit_rate = None  # default value
        if code in self.profit_data.index:
            gross_profit_rate = self.profit_data.ix[code]['gross_profit_rate']
        # 2. skip stocks whose gross profit margin is too low
        if gross_profit_rate and gross_profit_rate < 15:
            continue
        self.print_stock_info(code, data.ix[code], gross_profit_rate)
def homer_to_narrow_peaks(self, data, output_file):
    '''
    Given a Homer peak dataframe, extract necessary columns and convert to a
    narrowPeak file.

    From the IDR package description:

    NarrowPeak files are in BED6+4 format. It consists of 10 tab-delimited columns:

    1. chrom        string  Name of the chromosome
    2. chromStart   int     The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
    3. chromEnd     int     The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
    4. name         string  Name given to a region (preferably unique). Use '.' if no name is assigned.
    5. score        int     Indicates how dark the peak will be displayed in the browser (1-1000). If '0', the DCC will assign this based on signal value. Ideally average signalValue per base spread between 100-1000.
    6. strand       char    +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
    7. signalValue  float   Measurement of overall (usually, average) enrichment for the region.
    8. pValue       float   Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
    9. qValue       float   Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
    10. peak        int     Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
    '''
    # We don't want to require p-value, as Homer doesn't always output it.
    # Prep it here if it exists, or substitute -1.
    pval_col = self.get_first_column(data, self.p_value_columns, required=False)
    if pval_col is not None:
        pvals = -np.log10(pval_col)
    else:
        pvals = [-1] * data.shape[0]

    columns = OrderedDict((
        ('chrom', self.get_first_column(data, ['chr', 'chrom', 'chromosome'])),
        ('chromStart', self.get_first_column(data, ['chromStart', 'start'])),
        ('chromEnd', self.get_first_column(data, ['chromEnd', 'end'])),
        ('name', self.get_first_column(data, ['#PeakID', 'PeakID', 'ID', 'name'])),
        ('score', Series([0] * data.shape[0])),  # Leave zero so that signalValue column is used
        ('strand', self.get_first_column(data, ['strand'])),
        ('signalValue', self.get_first_column(data, self.tag_count_columns)),
        ('pValue', pvals),  # -log10 p-value if it exists, or -1
        ('qValue', Series([-1] * data.shape[0])),  # Leave -1 as no individual FDR is called for each peak
        ('peak', Series([-1] * data.shape[0])),  # Leave -1 as no point-source is called for each peak
    ))
    df = DataFrame(columns)
    df = df.sort(['signalValue', 'pValue'], ascending=False)
    df.to_csv(output_file, sep='\t', header=False, index=False)
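# Illustrative only: the kind of row the function above writes, built from
# made-up values (filename included) and dumped with the same to_csv settings
# (tab-separated, no header, no index).
from collections import OrderedDict
from pandas import DataFrame

example = DataFrame(OrderedDict((
    ('chrom', ['chr1']), ('chromStart', [713990]), ('chromEnd', [714547]),
    ('name', ['peak_1']), ('score', [0]), ('strand', ['+']),
    ('signalValue', [24.5]), ('pValue', [8.31]), ('qValue', [-1]), ('peak', [-1]),
)))
example.to_csv('example.narrowPeak', sep='\t', header=False, index=False)
# -> chr1  713990  714547  peak_1  0  +  24.5  8.31  -1  -1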
def _assert_matrix_of_thesaurus_c_is_as_expected(matrix, rows, cols):
    # rows may come in any order
    assert set(rows) == set(['g/N', 'a/N', 'd/J', 'b/V', 'a/J_b/N'])
    # columns must be in alphabetical order
    assert cols == ['a/N', 'b/V', 'd/J', 'g/N', 'x/X']

    # test the vectors for each entry
    expected_matrix = np.array([
        [0.1, 0., 0.2, 0.8, 0.],   # ab
        [0., 0.1, 0.5, 0.3, 0.],   # a
        [0.1, 0., 0.3, 0.6, 0.],   # b
        [0.5, 0.3, 0., 0.7, 0.],   # d
        [0.3, 0.6, 0.7, 0., 0.9],  # g
    ])
    # put the rows in the matrix in the order in which they are in expected_matrix
    matrix_ordered_by_rows = matrix[np.argsort(np.array(rows)), :]
    assert_array_equal(matrix_ordered_by_rows, expected_matrix)

    vec_df = DataFrame(matrix, columns=cols, index=rows)
    from pandas.util.testing import assert_frame_equal
    expected_frame = DataFrame(expected_matrix,
                               index=['a/J_b/N', 'a/N', 'b/V', 'd/J', 'g/N'],
                               columns=cols)
    assert_frame_equal(vec_df.sort(axis=0), expected_frame.sort(axis=0))
def expandVocab(self, docs):
    print 'expanding vocabulary...'
    freqCounts = self.countTokens(docs)
    tokenList = []
    freqCountList = []
    for token in freqCounts:
        tokenList.append(token)
        freqCountList.append(freqCounts[token])
    expTokenDf = DataFrame({'tokens': tokenList, 'freqCounts': freqCountList})
    expTokenDf = expTokenDf.sort('freqCounts', ascending=False)
    expandableTokensFiltered = set(expTokenDf['tokens'][2000:3000]).difference(ENGLISH_STOP_WORDS)
    batchSize = 10000
    print "%d filtered tokens chosen" % len(expandableTokensFiltered)
    print "Expandable tokens: "
    print expandableTokensFiltered
    newDocs = []
    for i in xrange(0, len(docs)):
        doc = docs[i]
        newDocSplit = doc.split()
        tokenList = doc.split(' ')
        start = 0
        newTokens = set()
        while start < len(tokenList):
            stop = start + batchSize
            tokens = set(tokenList[start:stop])
            start = start + batchSize / 2
            tokensToExpand = tokens.intersection(expandableTokensFiltered)
            newTokens = newTokens.union(self.expandVocabFromSet(tokensToExpand))
        newDocSplit.extend(list(newTokens))
        newDoc = ''
        for token in newDocSplit:
            newDoc += ' ' + token + ' '
        newDocs.append(newDoc)
        if i % 500 == 0:
            print '\nprocessed %d docs' % i
            print '%d new tokens added to document' % len(newTokens)
            print 'new tokens:'
            print newTokens
            print len(tokens)
    return newDocs
def p_adjust(p, method):
    if method == 'bonferroni':
        return np.minimum(p * len(p), 1)
    if method == 'holm':
        temp = DataFrame({'p': p})
        temp.sort(columns='p', inplace=True)
        temp['newID'] = range(1, len(temp) + 1)
        temp['p_adj'] = np.minimum(temp['p'] * (1 + len(temp) - temp['newID']), 1)
        temp.sort(inplace=True)
        return temp['p_adj']
    if method == 'fdr':
        temp = DataFrame({'p': p})
        temp.sort(columns='p', inplace=True, ascending=False)
        temp['newID'] = range(1, len(temp) + 1)
        temp['p_adj'] = np.minimum(1, len(temp) / temp['newID'] * temp['p'])
        temp.sort(inplace=True)
        return np.round(temp['p_adj'], 3)
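# Illustrative only: calling the helper above on a small, made-up vector of
# p-values, assuming the older pandas this module targets (DataFrame.sort
# still present) and numpy imported as np at module level.
import numpy as np

p = np.array([0.01, 0.04, 0.03, 0.20])
print(p_adjust(p, 'bonferroni'))  # [0.04 0.16 0.12 0.8 ], i.e. min(p * 4, 1)
print(p_adjust(p, 'fdr'))         # Benjamini-Hochberg-style values, indexed like the input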
def split_big_dframe(finalhhframe, hhcat):
    a = finalhhframe[['weight', 'reg02']].groupby('reg02').apply(lambda x: x['weight'].count())
    b = DataFrame(a, columns=['count'])
    c = b.sort(columns=['count'], ascending=False)
    bool1 = c.cumsum() < c.cumsum()['count'].iloc[-1] / 2
    list_reg = list(c[bool1].dropna().index)
    finalhhframe1 = finalhhframe.ix[finalhhframe['reg02'].isin(list_reg), :]
    finalhhframe2 = finalhhframe.ix[~finalhhframe['reg02'].isin(list_reg), :]
    int_columns = ['children', 'old', 'decile'] + \
        ['cat{}workers'.format(thecat) for thecat in hhcat['hhcat'].unique()]
    finalhhframe1 = merges_rows_bis(int_columns, finalhhframe1)
    finalhhframe2 = merges_rows_bis(int_columns, finalhhframe2)
    return finalhhframe1, finalhhframe2
def draw_feature_importance(train_x, clf):
    feature_names = train_x.columns
    feature_importance = clf.feature_importances_
    df = DataFrame({'feature_names': feature_names,
                    'feature_importances': feature_importance})
    df1 = df.sort(columns='feature_importances', ascending=False)
    df1.index = [i for i in range(len(df1))]
    fig = plt.figure(num=random.randint(1, 10000))
    ax = fig.add_subplot(111)
    ax.set_xticks([i for i in range(len(df.feature_names))])
    ax.set_xticklabels(df1.feature_names, rotation=-90)
    ax.grid()
    ax.plot(df1.feature_importances, label='feature_importance')
    plt.subplots_adjust(bottom=0.2)
    return df1