def tokenize_csv_file(source_df: pd.DataFrame, should_replace_company,
                      should_remove_NE, should_remove_numbers,
                      company_alias=None) -> pd.DataFrame:
    # source_df = pd.read_csv(source_file_path, encoding='utf-8')
    source_df = copy.deepcopy(source_df)
    target_column = 'text'
    for index, row in source_df.iterrows():
        target_text = row[target_column]
        try:
            target_text = replace_url(target_text)
        except TypeError:
            continue
        if should_replace_company:
            company_list = [row['company']]
            if company_alias is not None and row['company'] in company_alias:
                company_list.extend(company_alias[row['company']])
            target_text = replace_target_company(target_text, company_list)
        if should_remove_NE:
            ne_removed_content = remove_named_entities(target_text)
        else:
            ne_removed_content = target_text
        tokens = tokenizer.tokenize(ne_removed_content)
        if should_remove_numbers:
            tokens = replace_numbers(tokens)
        source_df.set_value(index, target_column, ' '.join(tokens))
    # source_df.to_csv(dest_file_path, index=False)
    return source_df
def get_bg_freq(string, step, show=False):
    """
    Get dictionary of frequencies of bigrams
    :param string: text
    :param step: cycle step (1 or 2)
    :param show: show frequencies in matrix with symbols indices (bool)
    :return: dictionary
    """
    f_dict = {}
    for i in range(0, len(string) - 1, step):
        if string[i] + string[i + 1] in f_dict:
            f_dict[string[i] + string[i + 1]] += 1
        else:
            f_dict[string[i] + string[i + 1]] = 1
    if show:
        df = DataFrame(data=0, index=list(ALPHABET), columns=list(ALPHABET))
        for i in f_dict:
            df.set_value(i[0], i[1], f_dict[i])
        set_option('display.max_columns', 10)
        print(df)
    return f_dict
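# Minimal usage sketch for get_bg_freq (the sample text is invented; for
# show=True, the module-level ALPHABET must cover its characters):
freqs = get_bg_freq('abracadabra', step=1)
assert freqs['ab'] == 2  # 'ab' occurs twice when sliding one char at a time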
def features6to8func():
    # print "start"
    start = timeit.default_timer()
    data1 = pd.read_csv('Phase1_data/testMod.csv', sep=',',
                        low_memory=False, usecols=[1, 2, 3, 4, 5, 6])
    # print "loaded"

    # F6: Maximum No of Clicks on any item
    # Number of clicks per item in a session
    datagrp = (data1.groupby(['Session_ID', 'Item_ID']).size())
    datagrpframe = datagrp.reset_index()
    # print("No of clicks per item in a session")  # datagrpframe
    # maximum no of clicks on one item (the size column is named 0;
    # the original Python 2 literal 0L is a syntax error in Python 3)
    data2 = DataFrame({'MaxClicksItems': datagrpframe.groupby(
        ['Session_ID'], sort=False)[0].max()}).reset_index()
    # print "done6"

    # F7: No of Distinct Items in a Session
    data5 = DataFrame({'DistinctItems': datagrpframe.groupby(
        ['Session_ID'], sort=False).size()}).reset_index()
    data2['Distinct_Items'] = data5['DistinctItems']
    # print "done7"

    # F8: Session Time
    data3 = DataFrame(data1.groupby(['Session_ID']).max()).reset_index()
    # print "done 8.1"
    data4 = DataFrame(data1.groupby(['Session_ID']).min()).reset_index()
    # print "done 8.2"
    data3['Session_Time_temp'] = (pd.to_datetime(data3.Timestamp)
                                  - pd.to_datetime(data4.Timestamp))
    for i, row in data3.iterrows():
        data3.set_value(i, 'Session_Time',
                        data3.iloc[i, 6].total_seconds() / 60)
    data2['Session_Time'] = data3.Session_Time

    stop = timeit.default_timer()
    # print stop - start
    data2.to_csv("Phase1_data/features6-8.csv", sep=',')
def update_predecessoras(df_crono: pd.DataFrame):
    ### Updates the precedence relation of the schedules
    df_iter = df_crono
    for index, item in df_iter.iterrows():
        ids_predec = ''
        str_predecs = ''
        if str(item['Predecessoras']).find("segue") >= 0:
            # build the list of predecessors
            ls_predec = get_num_precedencia(item['Predecessoras'])
            # TODO: fix failure with multiple predecessors
            #for el in ls_predec:
            #    if not df_crono[df_crono['número da demanda'] == el].empty:
            #        ids_predec = str(ids_predec + get_index_elem_crono(el, df_crono) + ',')
            #        ids_predec = str(ids_predec + str(df_crono[df_crono["número da demanda"] == el].index[0]) + ',')
            ids_predec = df_crono[df_crono["número da demanda"].isin(
                ls_predec)].index.values.tolist()
            str_predecs = str(ids_predec).strip('[]')
            # strip the trailing comma from the list
            if str_predecs[-1:] == ',':
                str_predecs = str_predecs[:-1]
            df_crono.set_value(index, 'Predecessoras', str_predecs)
    return df_crono
def exercise_2_b(degree, SUPG=False):
    mu_values = [1, 0.1, 0.01]
    N_values = [8, 16, 32, 64]
    errors_L2 = DataFrame(index=N_values, columns=mu_values)
    errors_H1 = DataFrame(index=N_values, columns=mu_values)
    for mu in mu_values:
        for N in N_values:
            u_numerical, V, omega = solve_system_two(N=N, mu=mu,
                                                     degree=degree, SUPG=SUPG)
            u_exact = Expression(
                '(exp(1 / mu * x[0]) - 1) / (exp(1 / mu) - 1)',
                mu=mu, degree=degree)
            L2 = errornorm(u_exact, u_numerical, 'L2', degree_rise=3)
            H1 = errornorm(u_exact, u_numerical, 'H1', degree_rise=3)
            errors_L2.set_value(N, mu, L2)
            errors_H1.set_value(N, mu, H1)
    return errors_L2, errors_H1
def Impute(data_as_DataFrame, kNNGraph, Method=IgnoringNan.mean, target=None):
    """Impute(data_as_DataFrame, Graph) -> pandas DataFrame with nan's imputed

    Imputation is via Graph Neighborhoods of kNNGraph
    Method is applied to each neighborhood array of values
    for a vertex with an nan
    Note: data_as_DataFrame can also be a numpy array
    """
    try:
        data_as_DataFrame.columns
        data_as_DataFrame.index
        DFrame = data_as_DataFrame.copy()
    except AttributeError:
        DFrame = DataFrame(data_as_DataFrame)
    cols = DFrame.columns
    inds = DFrame.index
    Data = DFrame.as_matrix()
    m, n = DFrame.shape
    for i in range(m):
        nbrs = kNNGraph.neighbors(i)
        for j in range(n):
            if isnan(Data[i, j]):
                DFrame.set_value(
                    inds[i], cols[j],
                    int(Method(array([Data[nbr, j] for nbr in nbrs]))))
    return DFrame
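# Hedged usage sketch for Impute: any object with a .neighbors(i) method over
# the row positions works; a small networkx graph stands in for a real k-NN
# graph here, the data is invented, and the default Method is assumed to be
# the module's own IgnoringNan.mean:
import networkx as nx
import numpy as np

g = nx.Graph([(0, 1), (1, 2), (0, 2)])
raw = np.array([[1.0, 2.0], [np.nan, 4.0], [3.0, 6.0]])
filled = Impute(raw, g)  # the NaN in row 1 is filled from rows 0 and 2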
class ClusterRecorder:
    """Recorder class for clustering runs."""

    def __init__(self, dataset):
        self.dataset = dataset
        try:
            self.recorder_csv = pandas.read_csv(
                Properties.getDefaultDataFold() + "/csv/recorder_csv_" +
                self.dataset + ".csv")
        except OSError:
            self.recorder_csv = DataFrame([], columns=[
                'id', 'start', 'end', 'd_c', 'max_distance_c', 'dataset',
                'pile_size', 'H', 'note'
            ])

    def setValue(self, row, columns, value):
        self.recorder_csv.set_value(row, columns, value)
        self.recorder_csv.set_value(row, 'end', Properties.name_str_FULL())

    def save(self):
        self.recorder_csv.to_csv(Properties.getDefaultDataFold() +
                                 "/csv/recorder_csv_" + self.dataset + ".csv")
def calculate(self, metric, out_filepath):
    metric_fun = self.metric_dict[metric]
    cid_and_embedding = pd.read_csv(self.drug_list_filepath)
    cid_list = cid_and_embedding['cid'].values
    embedding_list = cid_and_embedding['entity'].values
    embedding_list = [
        str_2_float_list(embedding) for embedding in embedding_list
    ]
    cid2embedding = dict(zip(cid_list, embedding_list))
    assert len(cid_list) == NUM_DRUGS
    data = np.zeros(shape=(NUM_DRUGS, NUM_DRUGS), dtype=np.float32)
    frame = DataFrame(data, columns=cid_list, index=cid_list)
    columns = frame.columns
    for row_cid, row in frame.iterrows():
        row_vector = cid2embedding[row_cid]
        for col_cid in columns:
            if row_cid == col_cid:
                continue
            col_vector = cid2embedding[col_cid]
            try:
                sim = metric_fun(row_vector, col_vector)
            except ValueError:
                print(row_cid, col_cid)
                return
            try:
                frame.set_value(row_cid, col_cid, sim)
            except KeyError:
                print(row_cid, col_cid)
                break
    frame.to_csv(out_filepath)
def gonzales(data, k):
    # transform the data numpy array to a data frame using the id as index
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    # add two columns to the points data frame for saving center and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    # choosing a random point as the first center
    #center0 = points_list.sample(n=1, random_state=randint(0, 100), axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    # loop to pick the remaining centers (the extra center appended in the
    # final cycle is dropped below, leaving k centers)
    for k_cycle in range(1, k + 1):
        # variables to save the next center to be chosen, based on the
        # maximum distance a point has within its cluster
        max_distance = 0
        next_cluster = np.nan
        # loop on all the points to assign them to their closest center
        for indexp, p in points_list.iterrows():
            # variables to save the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(
                    center.as_matrix(columns=[0, 1]),
                    p.as_matrix(columns=[0, 1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp
        centers_list = centers_list.append(
            points_list.ix[[next_cluster], :distance_column_index])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1, inplace=True)
    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1, c='r')
    # points_list.plot(kind='scatter', x=0, y=1, c='center', s=points_list['center'] * 2)
    # plt.show()
    #===========================================================================
    #print(points_list)
    return centers_list.as_matrix(columns=[0, 1])
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''

    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(), index.column()]))

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        if hasattr(value, 'toPyObject'):
            # PyQt4 gets a QVariant
            value = value.toPyObject()
        else:
            # PySide gets an unicode
            dtype = self.df[col].dtype
            if dtype != object:
                value = None if value == '' else dtype.type(value)
        self.df.set_value(row, col, value)
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
def __main__():
    print("Libraries initialized")
    print("Loading data")
    prompts, responses = processCAHData()
    print("Data loaded")
    print("Defining model")
    model = defineModel()
    print("Compiling model")
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
    model.compile(optimizer=sgd,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    try:
        model.load_weights("weights.hdf5")
    except OSError:
        pass
    except ValueError:
        pass
    print("Cards Against Humanity Generator")
    print("Scale of Hilarity")
    print("1 - Not Funny")
    print("2 - Kind of Funny/Eh")
    print("3 - Funny")
    print("")
    x = DataFrame(columns=[0, 1, 2])
    y = DataFrame(columns=[0])
    prompt_id, resp_id, sent = generateSentence(prompts, responses)
    rating = rate(prompts, responses)
    x.loc[0] = [prompt_id, resp_id, sent]
    y.loc[0] = rating
    prompt_id, resp_id, sent = generateSentence(prompts, responses)
    rating = rate(prompts, responses)
    x.loc[1] = [prompt_id, resp_id, sent]
    y.loc[1] = rating
    # reset_index returns a new frame; the bare attribute access in the
    # original was a no-op
    x = x.reset_index(drop=True)
    y = y.reset_index(drop=True)
    print("")
    print("X :")
    print(x)
    print("Y :")
    print(y)
    model.fit(x, y, batch_size=1, epochs=1)
    model.save_weights("weights.hdf5")
    while True:
        x = DataFrame()
        y = DataFrame(columns=['y'])
        print("")
        prompt_id, resp_id, sent = generateSentence(prompts, responses)
        # appending a bare Series requires ignore_index=True
        x = x.append(Series([prompt_id, resp_id, sent]), ignore_index=True)
        prediction = model.predict(x, batch_size=1)
        print("Model Prediction : ")
        prediction = prediction[0]
        print("1 : " + str(prediction[0]))
        print("2 : " + str(prediction[1]))
        print("3 : " + str(prediction[2]))
        rating = rate(prompts, responses)
        y.set_value(0, 'y', rating)
        model.fit(x, y, batch_size=1, epochs=1)
        model.save_weights("weights.hdf5")
def compute_convergence(error_table, N_vals, l_vals):
    convergence_table = DataFrame(index=l_vals, columns=['alpha'])
    for column, lam in zip(error_table.transpose().get_values(), l_vals):
        h_log = [np.log(1.0 / n) for n in N_vals]
        error_log = error_table.applymap(np.log)
        convergence_rate = np.polyfit(h_log, error_log[lam], deg=1)[0]
        convergence_table.set_value(lam, 'alpha', convergence_rate)
    return convergence_table
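# Why the polyfit slope is the convergence rate: if error ~ C * h**alpha then
# log(error) = alpha * log(h) + log(C), so the leading coefficient of a
# degree-1 fit of log(error) against log(h) is alpha. Synthetic check with
# invented data where alpha = 2:
import numpy as np

h = np.array([1 / 8, 1 / 16, 1 / 32, 1 / 64])
err = 3.0 * h ** 2
alpha = np.polyfit(np.log(h), np.log(err), deg=1)[0]  # ~2.0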
def volunteerListToSchedule(volunteerList, numDays):
    schedule = DataFrame()
    for day in range(numDays):
        row = 0
        for volunteer in volunteerList:
            schedule.set_value(row, day, volunteer.schedule[day])
            row = row + 1
    return schedule
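# Hedged usage sketch: the Volunteer type below is invented for illustration;
# set_value grows the empty frame one cell at a time, with one row per
# volunteer and one column per day:
from collections import namedtuple

Volunteer = namedtuple('Volunteer', ['schedule'])
vols = [Volunteer(schedule=[1, 0, 1]), Volunteer(schedule=[0, 1, 1])]
schedule = volunteerListToSchedule(vols, numDays=3)  # 2x3 DataFrame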
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''

    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return [str(i) for i in self.df.index.tolist()][section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(), index.column()]))

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        self.df.set_value(self.df.index[index.row()],
                          self.df.columns[index.column()],
                          value.toPyObject())
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''

    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                #return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(), index.column()]))

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        self.df.set_value(str(self.df.index[index.row()]),
                          str(self.df.columns[index.column()]),
                          value.toPyObject())
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
def updatePrice(self, stockDataFrame: pd.DataFrame):
    priceList = {}
    for index, row in stockDataFrame.iterrows():
        stock = stocktw(index)
        priceList[index] = stock.getPrice()
        print(index, priceList[index])
    ds = pd.Series(priceList)
    for i, row in ds.iteritems():
        stockDataFrame.set_value(i, "股價", ds[i])
    return stockDataFrame
def common_gene_pickle_assembler(gene_id_list, filename):
    score_list_folder = "TFbinding"
    folder_location = path.join(current_directory, score_list_folder)
    folder_contents = os.listdir(folder_location)
    tf_csv_folder = "Gene_CSV"
    tf_csv_folder_location = path.join(current_directory, tf_csv_folder)
    ColumnList = []
    tf_id_dict = {}
    total_genes_score_dict = {}
    for gene_id in gene_id_list:
        gene = idconverter.getgene(gene_id).SGDID
        ColumnList += [
            '{} Number of Hits'.format(gene_id),
            '{} Total Sum of Scores'.format(gene_id)
        ]
        if 'gene_{}_score_list.pickle'.format(gene) in folder_contents:
            gene_pickle_file = open(
                path.join(folder_location,
                          'gene_{}_score_list.pickle'.format(gene)), 'rb')
            gene_dict = pickle.load(gene_pickle_file)
            gene_pickle_file.close()
            tf_ids = []
            tf_score_dict = {}
            for tf_id in gene_dict.keys():
                total_sum_scores = sum(gene_dict[tf_id])
                number_binding_sites = len(np.atleast_1d(gene_dict[tf_id]))
                tf_score_dict[tf_id] = [number_binding_sites, total_sum_scores]
                tf_ids += [tf_id]
            tf_id_dict[gene_id] = set(tf_ids)
            total_genes_score_dict[gene_id] = tf_score_dict
    tf_sets = tf_id_dict.values()
    common_tfs = set.intersection(*tf_sets)
    common_tfs = list(common_tfs)
    df = DataFrame(columns=['TF Feature Name', 'TF Common Name'] + ColumnList
                   + ['Medline'] + ['TF Description'])
    for tf_id in common_tfs:
        tf_description = idconverter.getgene(tf_id).description
        tf_common_name = idconverter.getgene(tf_id).common_name
        tf_feature_name = idconverter.getgene(tf_id).feature_name
        tf_medline = motif[tf_id].medline
        tf_medline_url = 'www.pubmed.com/{}'.format(tf_medline)
        df.set_value(tf_id, 'TF Feature Name', tf_feature_name)
        df.set_value(tf_id, 'TF Common Name', tf_common_name)
        df.set_value(tf_id, 'TF Description', tf_description)
        df.set_value(tf_id, 'Medline', tf_medline_url)
        for gene_id in total_genes_score_dict.keys():
            if tf_id in total_genes_score_dict[gene_id].keys():
                df.set_value(tf_id, '{} Number of Hits'.format(gene_id),
                             total_genes_score_dict[gene_id][tf_id][0])
                df.set_value(tf_id, '{} Total Sum of Scores'.format(gene_id),
                             total_genes_score_dict[gene_id][tf_id][1])
    genes_csv_filename = filename
    genes_csv = path.join(tf_csv_folder_location,
                          "{}.csv".format(genes_csv_filename))
    df.to_csv(genes_csv)
def median_over_months(raw):
    n_years = len(raw.index)
    patterns = unique([date.split(' ')[1] for date in raw.columns[1:]])
    n_patterns = len(patterns)
    medians = DataFrame(np.zeros((n_years, 1 + n_patterns)),
                        columns=['year'] + patterns)
    medians['year'] = raw['year']
    for i_year in range(0, n_years):
        for i_pattern in range(0, n_patterns):
            columns_for_this_day = [
                col for col in raw.columns[1:]
                if col.split(' ')[1] == patterns[i_pattern]
            ]
            medians.set_value(i_year, patterns[i_pattern],
                              median(raw.iloc[i_year][columns_for_this_day]))
    return medians
def linear_elasticity(polynomial_order=1):
    l_values = [1, 10, 100, 1000]
    N_values = [8, 16, 32, 64]
    mu = 1
    error_table = DataFrame(index=N_values, columns=l_values)
    for lam in l_values:
        for N in N_values:
            error_table.set_value(
                N, lam, solver(N=N, lam=lam, mu=mu, degree=polynomial_order))
    convergence = compute_convergence(error_table, N_values, l_values)
    return error_table, convergence
def sum_over_patterns(raw, new_name=None):
    n_years = len(raw.index)
    sums = DataFrame(np.zeros((n_years, 1 + 12)), columns=['year'] + months)
    sums['year'] = raw['year']
    for i_year in range(0, n_years):
        for i_month in range(0, 12):
            columns_for_this_month = [
                col for col in raw.columns[1:]
                if col.split(' ')[0] == months[i_month]
            ]
            sums.set_value(i_year, months[i_month],
                           sum(raw.iloc[i_year][columns_for_this_month]))
    if new_name is not None:
        sums.columns = ['year'] + [
            col + ' ' + new_name for col in sums.columns if col != 'year'
        ]
    return sums
def fixup_initdb_data(df: pd.DataFrame):
    # Initdb spawns 5 postgres processes; rename those to have a different
    # progname so we can measure all of them
    for i, row in df.iterrows():
        progname = str(row['progname'])
        count = i % 6
        if progname == 'postgres':
            assert count != 5, "every sixth row should have name initdb"
            new_progname = progname + "-child-" + str(count + 1)
            df.set_value(i, 'progname', new_progname)
        else:
            # the final row of each run should be initdb
            assert count == 5, count
            assert progname == "initdb", progname
    return df
def compute_tf_idf_queries(self):
    # Find total number of documents
    results = self.cursor.execute(
        'SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format(
            'documents'))
    tmp = results.fetchone()
    total_doc = tmp[0]

    results = self.cursor.execute(
        'SELECT did, total_word, path FROM documents')
    tmp = results.fetchall()
    documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
    documents_df['tf_idf'] = 0.0

    no_docterm = {}
    for query in self.queries:
        no_docterm[query] = 0
    for index, row in documents_df.iterrows():
        path = row['path']
        with codecs.open(path, 'rt') as f:
            text = f.read()
        for query in self.queries:
            if query in text.decode('utf-8').lower():
                no_docterm[query] += 1

    for query in self.queries:
        for index, row in documents_df.iterrows():
            total_word = row['total_word']
            path = row['path']
            with codecs.open(path, 'rt') as f:
                text = f.read()
            tf_idf = self._compute_tf_idf_queries(
                text, total_word, total_doc, no_docterm[query])
            cur_tf_idf = documents_df.get_value(index, 'tf_idf')
            documents_df.set_value(index, 'tf_idf', cur_tf_idf + tf_idf)

    results = self.cursor.execute('SELECT did, type, entity FROM entities')
    tmp = results.fetchall()
    df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
    df['tf_idf'] = 0.0
    for index, row in df.iterrows():
        did = row['did']
        tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
        df.set_value(index, 'tf_idf', tf_idf)
    del df['did']
    df = df.groupby(['e_type', 'entity']).sum().reset_index()
    return df
def fill_encoded(self, df: pd.DataFrame, fld):
    field_name = fld['name']
    field_encoding = fld['encoding']
    field_id = fld['id']
    for row in df.itertuples():
        plaintext = getattr(row, field_name)
        field = FieldExtra.select(FieldExtra).where(
            (FieldExtra.FieldId == field_id)
            & (FieldExtra.Value == plaintext)).first()
        if field is not None:
            df.set_value(row.Index, field_name, field.Key)
        else:
            encoder = self.val_encoders[field_encoding]
            df = encoder(plaintext, fld)
    return df
def get_items_history(self, **kwargs):
    zhost = self._do_request(
        'host.get', {
            'output': ['hostid', 'snmp_available', 'snmp_error'],
            'filter': {
                'host': [kwargs.get('host', 0)]
            }
        })
    if not zhost:
        return kwargs
    kwargs.update(zhost[0])
    items = DataFrame(
        self._do_request(
            'item.get', {
                'output': [
                    'hostid', 'itemid', 'name', 'key_', 'value_type',
                    'lastclock', 'lastvalue'
                ],
                'hostids': [h['hostid'] for h in zhost]
            }))
    items = items[items.key_.str.contains(kwargs.get('key', 'icmpping'),
                                          regex=True, na=False)]
    if items.empty:
        return kwargs
    if kwargs.get('time_from') and kwargs.get('time_till'):
        kwargs['period'] = kwargs['time_till'] - kwargs['time_from']
    for index, item in items.iterrows():
        history = DataFrame(
            self._do_request(
                'history.get', {
                    'history': item['value_type'],
                    'itemids': [item['itemid']],
                    'time_from': kwargs['time_from'],
                    'time_till': kwargs['time_till'],
                    'sortfield': "clock"
                }))
        if history.empty:
            continue
        history.clock = to_numeric(history.clock, errors='coerce')
        history.value = to_numeric(history.value, errors='coerce')
        if kwargs.get('workonly', []):
            history = history.loc[history['clock'].isin(
                Zapi.filterWorkTimestamp(history.clock.tolist(), **kwargs))]
        items.set_value(index, 'max', history.value.max())
        items.set_value(index, 'min', history.value.min())
        items.set_value(index, 'avg', history.value.mean())
    kwargs['items'] = items.fillna(0).to_dict(orient='records')
    return kwargs
def update(test_items_dataframe: pd.DataFrame,
           test_item: TesTItem) -> pd.DataFrame:
    """Updates the test_items_dataframe with the result and exception of the
    test_item

    Arguments:
        test_items_dataframe {pd.DataFrame} -- A DataFrame of TesTItems to be
            updated
        test_item {TesTItem} -- The TesTItem to use for the update

    Returns:
        pd.DataFrame -- A new, updated dataframe
    """
    test_items_dataframe = test_items_dataframe.set_index("test")
    test_items_dataframe = test_items_dataframe.set_value(
        test_item.name, "result", test_item.result_str)
    test_items_dataframe = test_items_dataframe.set_value(
        test_item.name, "exception", str(test_item.exception))
    test_items_dataframe = test_items_dataframe.reset_index("test")
    return test_items_dataframe
def _get_markup(self, para, markup_dict=None):
    """get markup """
    if not markup_dict:
        markup_dict = self._MARKUPS
    df = DataFrame(markup_dict, index=['Enter', 'Exit']).T
    df['In'] = False
    sects = []
    place = 0
    while place > -1:
        place = -1
        markup = None
        estr = None
        for mark, enter in df[df.In == False].Enter.iterkv():
            find = para.find(enter)
            if find > -1 and (find <= place or place == -1):
                if find == place and len(enter) < len(estr):
                    continue
                place = find
                markup = mark
                estr = enter
        for mark, exit in df[df.In == True].Exit.iterkv():
            find = para.find(exit)
            if find > -1 and (find <= place or place == -1):
                if find == place and len(exit) < len(estr):
                    continue
                place = find
                markup = mark
                estr = exit
        if place > -1:
            sects.append([para[:place], df[df.In == True].index.tolist()])
            df.set_value(markup, 'In', not df.get_value(markup, 'In'))
            para = para[place + len(estr):]
    if df.In.any():
        raise ValueError(
            'the markup does not exit from;\n{}'.format(df[df.In == True]))
    sects.append([para, []])
    return sects
def userVectors(self, client):
    # return a one-hot dataframe with subreddit as column and user as row
    redditors = [
        redditor['username'] for redditor in self.client.subRec.users.find()
    ]
    subs = [sub['name'] for sub in self.client.subRec.subs.find()]
    df = DataFrame(0, index=redditors, columns=subs)
    for user in self.client.subRec.users.find():
        df = df.set_value(user['username'], user['subreddit'], 1)
    return df
def fix_event_type(df: DataFrame):
    '''
    Not sure yet.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''
    a = time.time()
    colsf = df['id'].ravel()            # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []                       # list of unique counts (UNUSED)
    counts_bucket = []                  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)             # create dummy variables
    todrop = df.sum() < 50              # columns where sum of dummy column < 50
    dropcols = df.columns[todrop]       # get those column names
    df = df.drop(dropcols, axis=1)      # drop those columns
    df['num_events'] = 0                # create number-of-events column, set to 0
    # print(df.columns)
    print(str(len(unique)))
    for ii in range(0, len(unique)):    # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]  # subset by that ID
        the_dummies = subset.columns != 'id'     # all columns except 'id'
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist()  # column sums
        event_sum = np.sum(aa)                   # total over all dummy columns
        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:, subset.columns != 'id'] = aa
        df = df.set_value(subset.index, the_dummies, aa)
        df = df.set_value(subset.index, 'num_events', event_sum)
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b - a)
    return df
def exercise_1_b(degree):
    """
    returns the L2 and H1 errors when using lagrange elements of given degree.
    """
    frequencies = [1, 10]
    N_values = [8, 16, 32, 64]
    errors_L2 = DataFrame(index=N_values, columns=frequencies)
    errors_H1 = DataFrame(index=N_values, columns=frequencies)
    for k in frequencies:
        for N in N_values:
            u_numerical, V, omega = solve_system_one(N=N, k=k, degree=degree)
            u_exact = Expression('sin(k*pi*x[0])*cos(k*pi*x[1])',
                                 k=k, degree=degree)
            L2 = errornorm(u_exact, u_numerical, 'l2', degree_rise=3)
            H1 = errornorm(u_exact, u_numerical, 'h1', degree_rise=3)
            errors_L2.set_value(N, k, L2)
            errors_H1.set_value(N, k, H1)
    return errors_L2, errors_H1
def fsev_count(df: DataFrame, fsev: int, feature: str, train: bool,
               blist: list, bidx: int):
    colname = 'fsev_' + str(fsev) + '_' + str(feature)
    if train:
        a = df[df['fault_severity'] == fsev]
        b = a[feature].value_counts()[0:60]
        blist = b.tolist()
        bidx = b.index
        bdf = pd.DataFrame(b)
    df[colname] = 0
    # subset = df.loc[df.location.isin(a.index)]
    for i in range(0, len(blist)):
        percentile = blist[i] / np.sum(blist)
        locstr = str(bidx[i])
        subset = df.location == locstr
        df = df.set_value(df.location == locstr, colname, percentile)
    rval = df
    if train:
        rval = [df, blist, bidx]
    return rval
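# Note: set_value expects scalar row/column labels, so passing the boolean
# mask above is fragile; the equivalent (and likely intended) assignment uses
# .loc. Tiny self-contained check with invented data:
import pandas as pd

demo = pd.DataFrame({'location': ['a', 'b', 'a'], 'score': [0.0, 0.0, 0.0]})
demo.loc[demo.location == 'a', 'score'] = 0.5  # sets both 'a' rows at once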
cf_dict = pd.read_csv('SourceData/county_facts_dictionary.csv')
cf_dict = cf_dict.set_index('column_name')

# pivoting and dropping null values for clean and easy analysis
pr_piv = pr[['fips', 'candidate', 'fraction_votes']].pivot(
    index='fips', columns='candidate', values='fraction_votes')
pr_piv.drop(' No Preference', axis=1, inplace=True)
pr_piv.drop(' Uncommitted', axis=1, inplace=True)

pr_facts = pd.merge(pr_piv, facts, right_index=True, left_index=True)
pr_facts = pr_facts.dropna()

c = pr[['candidate', 'party']].drop_duplicates().sort_values(
    by=['candidate', 'party'])
t = c[['candidate', 'party']].apply(tuple, axis=1).tolist()
d = dict(t)

# scipy linregress
l = len(pr_facts.columns)
linregress_unpiv = DataFrame('', index=range(l), columns=[
    'party', 'candidate', 'fact', 'Rvalue', 'Pvalue', 'StdError', 'Slope',
    'Intercept'])
i = 0
for c_X in pr_piv.columns:
    for c_Y in cf_dict.index:
        R = linregress(pr_facts[[c_X, c_Y]])
        #
        linregress_unpiv.set_value(i, 'party', d[c_X])
        linregress_unpiv.set_value(i, 'candidate', c_X)
        linregress_unpiv.set_value(i, 'fact', c_Y)
        linregress_unpiv.set_value(i, 'Rvalue', R.rvalue)
        linregress_unpiv.set_value(i, 'Pvalue', R.pvalue)
        linregress_unpiv.set_value(i, 'StdError', R.stderr)
        linregress_unpiv.set_value(i, 'Slope', R.slope)
        linregress_unpiv.set_value(i, 'Intercept', R.intercept)
        i += 1

linregress_unpiv.to_csv(
    'DataForTableau/primary_results_county_facts_linregress.csv')
def fit(self, annotations):
    """
    Parameters
    ----------
    annotations : (Annotation, Annotation) iterator

    Returns
    -------

    """
    # possible_match[n, m] is the total possible match duration
    # when there are n A-tracks & m B-tracks
    possible_match = DataFrame()

    # actual_match[n, m] is the total actual match duration
    # when there are n A-tracks & m B-tracks
    actual_match = DataFrame()

    # overlap[n, m] is the total duration
    # when there are n A-tracks & m B-tracks
    overlap = DataFrame()

    for n, (A, B) in enumerate(annotations):

        assert isinstance(A, Annotation), "%r is not an Annotation" % A
        assert isinstance(B, Annotation), "%r is not an Annotation" % B

        if n == 0:
            self.modalityA = A.modality
            self.modalityB = B.modality
        else:
            assert A.modality == self.modalityA, \
                "bad modality (%r, %r)" % (self.modalityA, A.modality)
            assert B.modality == self.modalityB, \
                "bad modality (%r, %r)" % (self.modalityB, B.modality)
        assert A.uri == B.uri, \
            "resource mismatch (%r, %r)" % (A.uri, B.uri)

        timeline, a, b = self._AB2ab(A, B)

        for segment in timeline:
            duration = segment.duration

            # number of tracks
            atracks = a.tracks(segment)
            Na = len(atracks)
            btracks = b.tracks(segment)
            Nb = len(btracks)

            if Na == 0 or Nb == 0:
                continue

            # number of matching tracks
            N = len(a.get_labels(segment) & b.get_labels(segment))

            # increment possible_match & actual_match
            try:
                p_m = possible_match.get_value(Na, Nb)
                a_m = actual_match.get_value(Na, Nb)
                ovl = overlap.get_value(Na, Nb)
            except Exception as e:
                p_m = 0.
                a_m = 0.
                ovl = 0.

            possible_match = possible_match.set_value(
                Na, Nb, p_m + min(Na, Nb) * duration)
            actual_match = actual_match.set_value(
                Na, Nb, a_m + N * duration)
            overlap = overlap.set_value(Na, Nb, ovl + duration)
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# source data
pr = pd.read_csv('primary_results.csv')

# pivoting
pr_piv = pr[['fips', 'candidate', 'fraction_votes']].pivot(
    index='fips', columns='candidate', values='fraction_votes')
pr_piv.drop(' No Preference', axis=1, inplace=True)
pr_piv.drop(' Uncommitted', axis=1, inplace=True)
pr_piv = pr_piv.dropna()

l = len(pr_piv.index)
pr_unpiv = DataFrame('', index=range(l * 14),
                     columns=['fips', 'fraction_votes', 'candidate'])
j = 0
while j < len(pr_unpiv):
    for i in range(0, l - 1):
        for c in pr_piv.columns:
            pr_unpiv.set_value(j, 'fips', pr_piv.index[i])
            pr_unpiv.set_value(j, 'fraction_votes',
                               pr_piv.get_value(pr_piv.index[i], c))
            pr_unpiv.set_value(j, 'candidate', c)
            j += 1

pr_unpiv.to_csv('DataForTableau/primary_results_dropna.csv')
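# The triple loop above hand-rolls an unpivot one cell at a time; for
# reference, pandas' built-in melt produces the same long format in one call
# (column names taken from the script above; left commented so it does not
# clobber the loop's result):
# pr_unpiv = pr_piv.reset_index().melt(
#     id_vars='fips', var_name='candidate', value_name='fraction_votes')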
def compute_contour_data(contours_bins, contours_saliences,
                         contours_start_times, stepNotes, minF0, hopsize,
                         normalize=True, extra_features=None):
    from pandas import DataFrame, concat
    from numpy import mean, std, array, Inf, zeros
    """
    Create contour pandas dataframe using contour information previously
    extracted with Essentia. Initializes DataFrame to have all future columns.

    Parameters
    ----------
    contours_bins: set of bins of the extracted contours
    contours_saliences: set of saliences of the extracted contours
    contours_start_times: set of starting times of the extracted contours
    stepNotes: number of bins per semitone
    minF0: minimum F0 in the salience functions
    hopsize: Hop size
    normalize: [True, False] to normalise the features, as performed in
        Bittner2015
    extra_features: Ncontours * N_features set of extra features apart from
        the ones used by Bittner2015 (pitch, duration, vibrato, salience)

    Returns
    -------
    contour_data : DataFrame
        Pandas data frame with all contour data, to be used for contour
        classification
    """
    contours_bins = array(contours_bins)
    contours_saliences = array(contours_saliences)
    contours_start_times = array(contours_start_times)

    contour_data = DataFrame
    headers = []
    # Set of headers, containing the first 12 features [0:11] and the first
    # time for each of the contours
    headers[0:12] = [
        'onset', 'offset', 'duration', 'pitch mean', 'pitch std',
        'salience mean', 'salience std', 'salience tot', 'vibrato',
        'vib rate', 'vib extent', 'vib coverage', 'first_time'
    ]

    # Number of contours
    Ncont = len(contours_bins)

    # Find length of longest contour
    maxLen = 0
    for i in range(Ncont):
        maxLen = max(maxLen, len(contours_bins[i]))

    # Header "first_time" can be used to find where the contour features end,
    # and where the contour info starts (time, bin, salience).
    # Just giving the extra headers some name
    headers[13:] = (array(range(maxLen * 3))).tolist()
    contour_data.num_end_cols = 4

    # Initialising dataset, following the format from the hacked VAMP MELODIA
    # plugin from J. Salamon
    contour_data = DataFrame(Inf * zeros([Ncont, len(headers)]),
                             columns=headers)

    for i in range(Ncont):
        # print i
        # Giving values for each row of the dataframe
        L = len(contours_saliences[i])
        # minF0 instead of 55
        pitches = 55 * 2 ** ((array(contours_bins[i]) / (12. * stepNotes)))
        contour_data.set_value(i, 'onset', contours_start_times[i])
        contour_data.set_value(
            i, 'offset',
            array(contours_start_times[i]) + len(pitches) * hopsize)
        contour_data.set_value(i, 'duration', len(pitches) * hopsize)
        contour_data.set_value(i, 'pitch mean', mean(pitches))
        contour_data.set_value(i, 'pitch std', std(pitches))
        contour_data.set_value(i, 'salience mean',
                               mean(array(contours_saliences[i])))
        contour_data.set_value(i, 'salience std',
                               std(array(contours_saliences[i])))
        contour_data.set_value(i, 'salience tot',
                               sum(array(contours_saliences[i])))
        # In this case, we do not compute vibrato features, so we set them
        # to 0. This could be updated in order to use also vibrato features
        # from contours extracted with Essentia
        contour_data.set_value(i, 'vibrato', 0)
        contour_data.set_value(i, 'vib rate', 0)
        contour_data.set_value(i, 'vib extent', 0)
        contour_data.set_value(i, 'vib coverage', 0)

        # After setting the features, we now give each contour the frame-by-
        # frame information, e.g. for frame 0 (fr0), frame 1 (fr1)...
        # time_fr0, pitch_fr0, salience_fr0, time_fr1, pitch_fr1,
        # salience_fr1, time_fr2, pitch_fr2, salience_fr2, ...
        contour_data.iloc[i, 12:12 + L * 3:3] = (
            contours_start_times[i] + hopsize * array(range(L)))
        contour_data.iloc[i, 13:13 + L * 3:3] = pitches
        contour_data.iloc[i, 14:14 + L * 3:3] = array(contours_saliences[i])

    # If extra features are used, they are set before the first_time
    # TODO: replace here with this instead of following line. Maybe pandas
    # does not work here for me
    # contour_data = extend_contour_features(contour_data, extra_features)
    if extra_features is not None:
        sal_features_data = contour_data[headers[0:12]]
        # frame-by-frame features
        frame_by_frame_features_data = contour_data[headers[12:]]
        dfFeatures = concat([sal_features_data, extra_features], axis=1)
        contour_data = concat([dfFeatures, frame_by_frame_features_data],
                              axis=1)

    # All classification labels are initialised (will be updated while
    # performing contour classification). If they already exist, do not
    # create them.
    if 'overlap' not in contour_data.columns:
        contour_data['overlap'] = -1
    if 'labels' not in contour_data.columns:
        contour_data['labels'] = -1
    if 'melodiness' not in contour_data.columns:
        contour_data['melodiness'] = -1
    if 'mel prob' not in contour_data.columns:
        contour_data['mel prob'] = -1

    # Normalising features
    if normalize:
        contour_data = cu.normalize_features(contour_data)

    print('Contour dataframe created')
    return contour_data
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''

    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        # gde 2014.02.19 - original implementation only worked
        # if there were no missing indices. Instead use get_value
        # return QVariant(str(self.df.ix[index.row(), index.column()]))
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        str_value = str(self.df.get_value(row, col))
        return QVariant(str_value)

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        if hasattr(value, 'toPyObject'):
            # PyQt4 gets a QVariant
            value = value.toPyObject()
        else:
            # PySide gets an unicode
            dtype = self.df[col].dtype
            if dtype != object:
                value = None if value == '' else dtype.type(value)
        self.df.set_value(row, col, value)
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
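# The snippets in this collection rely on DataFrame.set_value / get_value,
# which pandas deprecated in 0.21 and removed in 1.0. A minimal sketch of the
# drop-in replacement, .at, which provides the same fast label-based scalar
# access and, like set_value, enlarges the frame on a missing label:
import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
df.at[0, 'a'] = 10     # was: df.set_value(0, 'a', 10)
value = df.at[0, 'a']  # was: df.get_value(0, 'a')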
answers = filtered_data[filtered_data.question_id == qid][['user_id', 'correct']]
answers.columns = ['user_id', 'answer']
users_subset = users.merge(answers, how='inner', on='user_id')

# small adjustment to the mean to remove the effect of the question being
# analyzed
users_subset['mean'] = (users_subset['mean'] * users_subset['count']
                        - users_subset['answer']) / (users_subset['count'] - 1)

for quant in STUDENT_QUANTILES:
    quant2 = score_percentiles[quant]
    means = users_subset.groupby(
        by=[users_subset['percentile'] > quant2]).agg({'answer': 'mean'})
    t = str(int(quant * 100))
    try:
        prob_good = means.get_value(True, 'answer')
        results.set_value(qid, 'good_' + t, prob_good)
    except KeyError:
        pass
    try:
        prob_bad = means.get_value(False, 'answer')
        results.set_value(qid, 'bad_' + t, prob_bad)
    except KeyError:
        pass

### Plot the resulting ratios
for quant in STUDENT_QUANTILES:
    t = str(int(quant * 100))
    plt.plot(results['bad_' + t], results['good_' + t], 'b.')
    plt.plot(np.arange(0, 1.1, .1), np.arange(0, 1.1, .1), 'g-', alpha=.5)
    plt.title("Discrimination: " + t + "th Percentile")
    plt.ylabel("Proportion Right, Good Students")
pvalue = DataFrame(np.nan, index=index, columns=index)
pvalue.index.names = ['Party', 'Candidate']
pvalue.index.lexsort_depth
pvalue.columns.lexsort_depth

# StdErr
stderr = DataFrame(np.nan, index=index, columns=index)
stderr.index.names = ['Party', 'Candidate']
stderr.index.lexsort_depth
stderr.columns.lexsort_depth

#
for c_X in pr_piv.columns:
    for c_Y in pr_piv.columns:
        R = linregress(pr_piv[[c_X, c_Y]])
        p_X = index.get_loc_level(c_X, 1)[1][0]
        p_Y = index.get_loc_level(c_Y, 1)[1][0]
        rvalue.set_value((p_Y, c_Y), (p_X, c_X), R.rvalue)
        pvalue.set_value((p_Y, c_Y), (p_X, c_X), R.pvalue)
        stderr.set_value((p_Y, c_Y), (p_X, c_X), R.stderr)

# democrats only
heatmap(rvalue.loc['Democrat']['Democrat'], 'dem_rvalue.png')
heatmap(pvalue.loc['Democrat']['Democrat'], 'dem_pvalue.png')
heatmap(stderr.loc['Democrat']['Democrat'], 'dem_stderr.png')

# republicans only
heatmap(rvalue.loc['Republican']['Republican'], 'rep_rvalue.png')
heatmap(pvalue.loc['Republican']['Republican'], 'rep_pvalue.png')
heatmap(stderr.loc['Republican']['Republican'], 'rep_stderr.png')

# most anticorrelated republicans
RepRvalue_idxmin = rvalue.loc['Republican']['Republican'].idxmin(axis=0)
def diag(self):
    df = self.get_results_dataframe(index_by_code=True)
    df_nivvie = df.xs('nivvie')
    df_revdisp = df.xs('revdisp')
    df_rev = df.xs('rev_trav') + df.xs('pen') + df.xs('rev_cap_net')
    df_af = df.xs('af')
    df_pfam = df.xs('pfam')
    df_mini = df.xs('mini')
    df_logt = df.xs('logt')
    df_impo = df.xs('ppe') + df.xs('impo')
    df_impo.name = "impo+ppe"
    df_public = df.xs('psoc') + df.xs('ppe') + df.xs('impo')

    loyer_chef = self.scenario_chef_seul.menage[0]['loyer']
    pension_alim_tot = sum([var['pension_alim']
                            for var in self.children.values()])
    noi = self.children.keys()[0]
    if self.children[noi]["temps_garde"] == 'alternee_pension_non_decl':
        df_revdisp['chef'] = (df_rev['chef'] + df_mini['chef_seul']
                              + df_af['part'] / 2 + df_logt['chef_seul']
                              - pension_alim_tot + df_impo['chef'])
        df_pfam['chef'] = df_af['part'] / 2
        df_logt['chef'] = df_logt['chef_seul']
        df_mini['chef'] = df_mini['chef_seul']
        df_public['chef'] = (df_logt['chef_seul'] + df_mini['chef_seul']
                             + df_pfam['chef'] + df_impo['chef'])
        df_nivvie['chef'] = df_revdisp['chef'] / self.uc['chef']
        df_revdisp['part'] = (df_revdisp['part'] - df_af['part'] / 2
                              + pension_alim_tot)
        df_pfam['part'] -= df_af['part'] / 2
        df_public['part'] = (df_logt['part'] + df_mini['part']
                             + df_pfam['part'] + df_impo['part'])
        df_nivvie['part'] = df_revdisp['part'] / self.uc['part']

    uc_couple = self.uc['couple']
    total_cost_before = ((uc_couple - 1.5) / uc_couple) * (df_revdisp['couple'])
    public_cost_before = (df_public['couple'] - df_public['couple_seul'])
    private_cost_before = total_cost_before - public_cost_before

    uc_chef = self.uc['chef']
    uc_part = self.uc['part']
    total_cost_after_chef = (uc_chef - 1) / (uc_chef) * df_revdisp['chef']
    total_cost_after_part = (uc_part - 1) / (uc_part) * df_revdisp['part']
    # total_cost_after = total_cost_after_chef + total_cost_after_part
    public_cost_after_chef = df_public['chef'] - df_public['chef_seul']
    public_cost_after_part = df_public['part'] - df_public['part_seul']
    #public_cost_after = (public_cost_after_chef + public_cost_after_part)
    #private_cost_after = total_cost_after - public_cost_after
    # private_cost_after_chef = total_cost_after_chef + pension_alim_tot - public_cost_after_chef
    # private_cost_after_part = total_cost_after_part - pension_alim_tot - public_cost_after_part
    private_cost_after_chef = total_cost_after_chef - public_cost_after_chef
    private_cost_after_part = total_cost_after_part - public_cost_after_part

    desunion_public_cost = (df_public['part'] + df_public['chef']
                            - df_public['couple'])
    nivvie_loss_couple = df_nivvie[u"couple"] / df_nivvie["couple_seul"]
    nivvie_loss_chef = df_nivvie[u"chef"] / df_nivvie["chef_seul"]
    nivvie_loss_part = df_nivvie[u"part"] / df_nivvie["part_seul"]

    df2 = DataFrame([df_revdisp, df_pfam, df_mini, df_logt, df_impo,
                     df_nivvie])
    df2 = df2[['couple', 'part', 'chef']]
    df2 = df2.set_value(u"dépense totale pour enfants", 'couple',
                        total_cost_before)
    df2 = df2.set_value(u"dépense totale pour enfants", 'chef',
                        total_cost_after_chef)
    df2 = df2.set_value(u"dépense totale pour enfants", 'part',
                        total_cost_after_part)
    df2 = df2.set_value(u"prise en charge publique de l'enfant", 'couple',
                        public_cost_before)
    df2 = df2.set_value(u"prise en charge publique de l'enfant", 'chef',
                        public_cost_after_chef)
    df2 = df2.set_value(u"prise en charge publique de l'enfant", 'part',
                        public_cost_after_part)
    df2 = df2.set_value(u"prise en charge privée de l'enfant", 'couple',
                        private_cost_before)
    df2 = df2.set_value(u"prise en charge privée de l'enfant", 'chef',
                        private_cost_after_chef)
    df2 = df2.set_value(u"prise en charge privée de l'enfant", 'part',
                        private_cost_after_part)
    df2 = df2.set_value(u"loyer", 'couple',
                        12 * self.scenario.menage[0]['loyer'])
    df2 = df2.set_value(u"loyer", 'chef', 12 * loyer_chef)
    df2 = df2.set_value(u"loyer", 'part',
                        12 * self.scenario_part.menage[0]['loyer'])
    df2 = df2.set_value(u"pension", 'couple', 0)
    df2 = df2.set_value(u"pension", 'chef', -pension_alim_tot)
    df2 = df2.set_value(u"pension", 'part', pension_alim_tot)
    df2 = df2.set_value(u"nivvie_loss", 'couple', nivvie_loss_couple)
    df2 = df2.set_value(u"nivvie_loss", 'chef', nivvie_loss_chef)
    df2 = df2.set_value(u"nivvie_loss", 'part', nivvie_loss_part)
    df2 = df2.set_value(u"coût public de la désunion", "couple",
                        desunion_public_cost)
    df2 = df2.T
    df2.index.name = u"ménage"
    df2 = df2.reset_index()
    return df2
class Scores(AnnotationMixin, object): """ Parameters ---------- uri : str, optional modality : str, optional Returns ------- scores : `Scores` Examples -------- >>> s = Scores(uri='video', modality='speaker') >>> s[Segment(0,1), 's1', 'A'] = 0.1 >>> s[Segment(0,1), 's1', 'B'] = 0.2 >>> s[Segment(0,1), 's1', 'C'] = 0.3 >>> s[Segment(0,1), 's2', 'A'] = 0.4 >>> s[Segment(0,1), 's2', 'B'] = 0.3 >>> s[Segment(0,1), 's2', 'C'] = 0.2 >>> s[Segment(2,3), 's1', 'A'] = 0.2 >>> s[Segment(2,3), 's1', 'B'] = 0.1 >>> s[Segment(2,3), 's1', 'C'] = 0.3 """ @classmethod def from_df( cls, df, uri=None, modality=None, aggfunc=np.mean ): """ Parameters ---------- df : DataFrame Must contain the following columns: 'segment', 'track', 'label' and 'value' uri : str, optional Resource identifier modality : str, optional Modality aggfunc : func Value aggregation function in case of duplicate (segment, track, label) tuples Returns ------- """ A = cls(uri=uri, modality=modality) A._df = pivot_table( df, values=PYANNOTE_SCORE, rows=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], cols=PYANNOTE_LABEL, aggfunc=aggfunc ) return A def __init__(self, uri=None, modality=None): super(Scores, self).__init__() index = MultiIndex( levels=[[], []], labels=[[], []], names=[PYANNOTE_SEGMENT, PYANNOTE_TRACK] ) self._df = DataFrame(index=index, dtype=np.float64) self.modality = modality self.uri = uri self._timelineHasChanged = True # del scores[segment] # del scores[segment, :] # del scores[segment, track] def __delitem__(self, key): if isinstance(key, Segment): segment = key self._df = self._df.drop(segment, axis=0) self._timelineHasChanged = True elif isinstance(key, tuple) and len(key) == 2: segment, track = key self._df = self._df.drop((segment, track), axis=0) self._timelineHasChanged = True else: raise KeyError('') # value = scores[segment, track, label] def __getitem__(self, key): segment, track, label = key return self._df.get_value((segment, track), label) def get_track_scores(self, segment, track): """Get all scores for a given track. Parameters ---------- segment : Segment track : hashable segment, track must be a valid track Returns ------- scores : dict {label: score} dictionary """ return {l: self._df.get_value((segment, track), l) for l in self._df} # scores[segment, track, label] = value def __setitem__(self, key, value): segment, track, label = key self._df = self._df.set_value((segment, track), label, value) self._timelineHasChanged = True def labels(self, unknown=True): """List of labels Parameters ---------- unknown : bool, optional When False, do not return Unknown instances When True, return any label (even Unknown instances) Returns ------- labels : list Sorted list of existing labels Remarks ------- Labels are sorted based on their string representation. """ labels = sorted(self._df.columns, key=str) if unknown: return labels else: return [l for l in labels if not isinstance(l, Unknown)] def itervalues(self): """Iterate over annotation as (segment, track, label, value) tuple""" # make sure segment/track pairs are sorted self._df = self._df.sort_index() # yield one (segment, track, label) tuple per loop labels = self._df.columns for (segment, track), columns in self._df.iterrows(): for label in labels: value = columns[label] if np.isnan(value): continue else: yield segment, track, label, value def _rank(self, invert): if invert: direction = 1. else: direction = -1. 
def nan_rank(data): # replace NaN by -inf or +inf depending on the requested direction finite = np.isfinite(data) fixed = np.where(finite, direction*data, -direction*np.inf) # do the actual argsort indices = np.argsort(fixed) # get rank from argsort rank = np.argsort(indices) # special treatment for inverted NaN scores # (we want ranks to start at 0 even in case of NaN) if invert: rank = np.where(finite, rank-(len(data)-np.sum(finite)), np.nan) else: rank = np.where(finite, rank, np.nan) return rank return self._df.apply(nan_rank, axis=1) def rank(self, invert=False): """ Parameters ---------- invert : bool, optional By default, larger scores are better. Set `invert` to True to indicate smaller scores are better. Returns ------- rank : `Scores` """ A = self.__class__(uri=self.uri, modality=self.modality) A._df = self._rank(invert) return A def nbest(self, n, invert=False): """ Parameters ---------- n : int Size of n-best list invert : bool, optional By default, larger scores are better. Set `invert` to True to indicate smaller scores are better. Returns ------- nbest : `Scores` New scores where only n-best are kept. """ df = self._df.copy() nbest = self._rank(invert) < n df[~nbest] = np.nan A = self.__class__(uri=self.uri, modality=self.modality) A._df = df return A def subset(self, labels, invert=False): """Scores subset Extract scores subset based on labels Parameters ---------- labels : set Set of labels invert : bool, optional If invert is True, extract all but requested `labels` Returns ------- subset : `Scores` Scores subset. """ if not isinstance(labels, set): raise TypeError('labels must be provided as a set of labels.') if invert: labels = set(self.labels()) - labels else: labels = labels & set(self.labels()) A = self.__class__(uri=self.uri, modality=self.modality) A._df = self._df[list(labels)] return A def to_annotation(self, threshold=-np.inf, posterior=False): """ Parameters ---------- threshold : float, optional Each track is annotated with the label with the highest score. Yet, if the latter is smaller than `threshold`, label is replaced with an `Unknown` instance. posterior : bool, optional If True, scores are posterior probabilities in open-set identification. If top model posterior is higher than unknown posterior, it is selected. Otherwise, label is replaced with an `Unknown` instance. """ annotation = Annotation(uri=self.uri, modality=self.modality) if not self: return annotation best = self.nbest(1, invert=False) if posterior: # compute unknown posterior func = lambda p: 1. - np.nansum(p, axis=1) Pu = self.apply(func, new_columns=['_']) # threshold best target posterior # with unknown posterior and threshold for segment, track, label, value in best.itervalues(): if value < Pu[segment, track, '_'] or value < threshold: label = Unknown() annotation[segment, track] = label else: # threshold best target score with threshold for segment, track, label, value in best.itervalues(): if value < threshold: label = Unknown() annotation[segment, track] = label return annotation def map(self, func): """Apply function to all values""" A = self.__class__(uri=self.uri, modality=self.modality) A._df = func(self._df) return A def apply(self, data_func, new_index=None, new_columns=None): """Apply `data_func` on internal numpy array Parameters ---------- data_func : func Function expecting (index x columns) numpy array as input new_index : iterable, optional When provided, these will be the index of returned array. 
        new_columns : iterable, optional
            When provided, these will be the columns of returned array.
        """
        new_data = data_func(self._df.values)

        if new_index is None:
            new_index = self._df.index

        if new_columns is None:
            new_columns = self._df.columns

        df = DataFrame(
            data=new_data,
            index=new_index,
            columns=new_columns)

        new_scores = self.__class__(uri=self.uri, modality=self.modality)
        new_scores._df = df

        return new_scores

    def _repr_png_(self):
        from pyannote.core.notebook import repr_scores
        return repr_scores(self)
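
# Minimal usage sketch for the Scores class above, following its own
# doctest example. It assumes Segment is importable from pyannote.core;
# nothing here beyond the methods defined in the class.
s = Scores(uri='video', modality='speaker')
s[Segment(0, 1), 's1', 'A'] = 0.1
s[Segment(0, 1), 's1', 'B'] = 0.2
s[Segment(0, 1), 's1', 'C'] = 0.3

# rank() maps raw scores to 0-based ranks (0 = best, by default)
ranked = s.rank()

# keep only the best label per track, then threshold into an Annotation
# where low-scoring tracks are replaced by Unknown()
annotation = s.nbest(1).to_annotation(threshold=0.15)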
decil, values = mark_weighted_percentiles(
    nivvie, labels, wprm, method, return_quantiles=True)
df2 = DataFrame({"decile": decil})
df["decile"] = df2["decile"]

indexes = {"zrstm": .01, "zchom": .01, "pfamm": .01}  # TODO change 1%

results = DataFrame(
    index=indexes.keys(),
    columns=["total", "pauvre50", "pauvre60"]
    + ["decile>" + str(decile) for decile in range(0, 10)])

# amounts in millions of euros, weighted by wprm and restricted to champm
for var, index in indexes.iteritems():
    total = df[var] * index * df["wprm"] * df["champm"]
    pauvre50 = df[var] * index * df["wprm"] * (df["pauvre50m"] <= 0) * df["champm"]
    pauvre60 = df[var] * index * df["wprm"] * (df["pauvre60m"] <= 0) * df["champm"]
    results.set_value(var, "total", total.sum() / 1e6)
    results.set_value(var, "pauvre50", pauvre50.sum() / 1e6)
    results.set_value(var, "pauvre60", pauvre60.sum() / 1e6)
    for decile in range(0, 10):
        temp = df[var] * index * df["wprm"] * (df["decile"] > decile) * df["champm"]
        results.set_value(var, "decile>" + str(decile), temp.sum() / 1e6)
        del temp

print results

import os
filename = os.path.join(destination_dir, "desindexation.xls")
print filename
writer = ExcelWriter(str(filename))
results.to_excel(writer)
writer.save()
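
# DataFrame.set_value / get_value, used throughout these snippets, were
# deprecated in pandas 0.21 and removed in 1.0. On modern pandas the
# scalar accessor .at is the replacement; a sketch of the loop body above
# rewritten with it (same frames and column names assumed):
results.at[var, "total"] = total.sum() / 1e6
results.at[var, "pauvre50"] = pauvre50.sum() / 1e6
results.at[var, "pauvre60"] = pauvre60.sum() / 1e6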
pr_facts = pr_facts.dropna()

c = pr[['candidate', 'party']].drop_duplicates().sort_values(
    by=['candidate', 'party'])
t = c[['candidate', 'party']].apply(tuple, axis=1).tolist()
d = dict(t)  # candidate -> party lookup

# scipy linregress
# preallocate rows; set_value enlarges the frame if i runs past them
l = len(pr_facts.columns)
linregress_unpiv = DataFrame('', index=range(l), columns=[
    'party', 'candidate', 'fact', 'Rvalue', 'Pvalue', 'StdError',
    'Slope', 'Intercept'
])

i = 0
for c_X in pr_piv.columns:
    for c_Y in cf_dict.index:
        R = linregress(pr_facts[[c_X, c_Y]])
        # linregress_unpiv.set_value(i, 'party', d[c_X])
        linregress_unpiv.set_value(i, 'candidate', c_X)
        linregress_unpiv.set_value(i, 'fact', c_Y)
        linregress_unpiv.set_value(i, 'Rvalue', R.rvalue)
        linregress_unpiv.set_value(i, 'Pvalue', R.pvalue)
        linregress_unpiv.set_value(i, 'StdError', R.stderr)
        linregress_unpiv.set_value(i, 'Slope', R.slope)
        linregress_unpiv.set_value(i, 'Intercept', R.intercept)
        i += 1

linregress_unpiv.to_csv(
    'DataForTableau/primary_results_county_facts_linregress.csv')
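
# The per-cell set_value pattern above predates pandas 1.0; an equivalent
# sketch that collects plain dicts and builds the frame once at the end
# (same pr_piv, cf_dict and pr_facts objects assumed):
rows = []
for c_X in pr_piv.columns:
    for c_Y in cf_dict.index:
        R = linregress(pr_facts[[c_X, c_Y]])
        rows.append({'candidate': c_X, 'fact': c_Y,
                     'Rvalue': R.rvalue, 'Pvalue': R.pvalue,
                     'StdError': R.stderr,
                     'Slope': R.slope, 'Intercept': R.intercept})
linregress_unpiv = DataFrame(rows)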
# (fragment: rvalue and pvalue are built just above, analogous to stderr below)
pvalue.columns.names = ['Party', 'Candidate']
pvalue.columns.lexsort_depth  # interactive inspection; no effect as a statement
pvalue.index.names = ['Fact']

# StdErr
stderr = DataFrame(np.nan, index=cf_dict.index, columns=index)
stderr.columns.names = ['Party', 'Candidate']
stderr.columns.lexsort_depth  # interactive inspection; no effect as a statement
stderr.index.names = ['Fact']

for c_X in pr_piv.columns:
    for c_Y in cf_dict.index:
        R = linregress(pr_facts[[c_X, c_Y]])
        p_X = index.get_loc_level(c_X, 1)[1][0]  # party of candidate c_X
        rvalue.set_value(c_Y, (p_X, c_X), R.rvalue)
        pvalue.set_value(c_Y, (p_X, c_X), R.pvalue)
        stderr.set_value(c_Y, (p_X, c_X), R.stderr)

# The resulting heatmaps are huge and hard to review
heatmap(rvalue, 'rvalue_facts.png')
heatmap(pvalue, 'pvalue_facts.png')
heatmap(stderr, 'stderr_facts.png')

# Find the facts most correlated with the Democrat candidates' results
# Democrats only
DemRvalue = rvalue['Democrat']
DemPvalue = pvalue['Democrat']
DemStdErr = stderr['Democrat']
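
# Selecting one level of the MultiIndex columns, as rvalue['Democrat']
# does above, can also be written explicitly with .xs; a sketch on the
# same frames:
DemRvalue = rvalue.xs('Democrat', axis=1, level='Party')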
par_own_merge.to_csv('nt_final_home_owner.csv', header=True)
llc_props.to_csv('llcs.csv', header=True)
amounts_total_grouped.agg(['count', 'sum']).to_csv(
    'Ownership_Balance_Totals.csv', header=True)

################################################ output ########################################

# evaluate threshold for most accurate name match
thresholds = range(1, 101)
o = []
a = []
est = DataFrame(columns=('threshold', 'own_count', 'addr_count'))
for t in thresholds:
    acc = accuracy(t)
    est.set_value(t, 'threshold', t)
    est.set_value(t, 'own_count', acc[0])
    est.set_value(t, 'addr_count', acc[1])

# regress threshold on the two match counts
x = sm.add_constant(est[['own_count', 'addr_count']])
reg = sm.OLS(est['threshold'], x).fit()

# %matplotlib inline (when run in a notebook)
import pylab
pylab.scatter(est.own_count, est.threshold, est.addr_count)  # addr_count as marker size
# NOTE: r and p (apparently precision/recall series, per the legend) are
# computed elsewhere in the original script; they are not defined in this fragment
pylab.plot(thresholds, r)
pylab.plot(r, p)
pylab.legend(['precision', 'recall'], loc=2)

################################################ end output ###################################

if __name__ == '__main__':
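
# A short sketch of how the fitted statsmodels results (`reg` above) can
# be read back; these are standard RegressionResults attributes, nothing
# project-specific:
print(reg.params)     # intercept and per-count coefficients
print(reg.rsquared)   # goodness of fit of threshold ~ counts
print(reg.summary())  # full regression report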
class Aggregates(object):
    filter_by = None
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
    ))  # TODO: localize
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    totals_df = None
    varlist = None

    def __init__(self, survey_scenario=None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by=None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}

        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if it exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural
            U.append(entity)

            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except KeyError:
                # skip labels whose column was not computed
                pass

    def compute_diff(self):
        '''
        Computes and adds relative differences
        '''
        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]

        if self.show_default:
            ref_dep_label = self.labels['dep_default']
            ref_benef_label = self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label = self.labels['dep_real']
            ref_benef_label = self.labels['benef_real']
        else:
            return

        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]

        self.aggr_frame[self.labels['dep_diff_rel']] = (dep - ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' % (now.strftime('%d-%m-%Y'), now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" % str(self.simulation.input_table.survey_year),
        ])

    def get_aggregate(self, variable, filter_by=None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
            name of the variable aggregated according to its entity
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
        # amounts and beneficiaries from current data and default data if it exists
        # Build weights for each entity
        data = DataFrame(
            {
                variable: simulation.calculate_add(variable),
                weight_name: simulation.calculate(weight_name),
            }
        )
        data_default = None

        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default

        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame(
                {
                    variable: simulation.calculate(variable),
                    weight_name: simulation.calculate(weight_name),
                    filter_by_name: simulation.calculate(filter_by_name),
                }
            )
            data_default = None
            filter_indicator = filtered_data[filter_by_name]

        m_b = {}
        weight = data[weight_name] * filter_indicator

        for name, data in datasets.iteritems():
            amount = data[variable]
            benef = data[variable].values != 0
            try:
                total_amount = int(round(sum(amount * weight) / 10 ** 6))
            except:
                total_amount = nan
            try:
                total_benef = int(round(sum(benef * weight) / 10 ** 3))
            except:
                total_benef = nan

            m_b[name] = [total_amount, total_benef]

        return m_b

    def load_amounts_from_file(self, filename=None, year=None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            data_dir = DATA_DIR

        try:
            filename = os.path.join(data_dir, "amounts.h5")
            store = HDFStore(filename)

            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            self.totals_df = DataFrame(data={
                "amount": df_a[year] / 10 ** 6,
                "benef": df_b[year] / 1000,
            })
            row = DataFrame({'amount': nan, 'benef': nan}, index=['logt'])
            self.totals_df = self.totals_df.append(row)

            # Add some additional totals
            for col in ['amount', 'benef']:
                # Deals with logt
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)

                # Deals with rsa/rmi
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)

                # Deals with irpp, csg, crds
                for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                    if col in ['amount']:
                        val = - self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except:
            # raise Exception(" No administrative data available for year " + str(year))
            import warnings
            warnings.warn("No administrative data available for year %s in file %s"
                          % (str(year), filename))
            self.totals_df = None
            return

    def save_table(self, directory=None, filename=None, table_format=None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'

        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)

        fname = os.path.join(directory, filename)

        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index=False, header=True)
                descr = self.create_description()
                descr.to_excel(writer, "description", index=False, header=False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, index=False, header=True)
        except Exception, e:
            raise Exception("Aggregates: Error saving file", str(e))
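
# A minimal usage sketch for the Aggregates class above. The
# survey_scenario object is hypothetical here: its construction (and the
# set_survey_scenario method called by __init__) depends on OpenFisca
# survey-data machinery that this fragment does not show.
aggregates = Aggregates(survey_scenario=survey_scenario)  # hypothetical scenario
aggregates.compute()         # aggregates, administrative totals, diffs
print aggregates.aggr_frame  # Python 2 print, matching the module
aggregates.save_table(directory='.', table_format='csv')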