def tokenize_csv_file(source_df: pd.DataFrame,
                      should_replace_company,
                      should_remove_NE,
                      should_remove_numbers,
                      company_alias=None) -> pd.DataFrame:
    # source_df = pd.read_csv(source_file_path, encoding='utf-8')
    source_df = copy.deepcopy(source_df)
    target_column = 'text'
    for index, row in source_df.iterrows():
        target_text = row[target_column]
        try:
            target_text = replace_url(target_text)
        except TypeError:
            continue
        if should_replace_company:
            company_list = [row['company']]
            if company_alias is not None and row['company'] in company_alias:
                company_list.extend(company_alias[row['company']])
            target_text = replace_target_company(target_text, company_list)
        if should_remove_NE:
            ne_removed_content = remove_named_entities(target_text)
        else:
            ne_removed_content = target_text

        tokens = tokenizer.tokenize(ne_removed_content)
        if should_remove_numbers:
            tokens = replace_numbers(tokens)
        source_df.set_value(index, target_column, ' '.join(tokens))
    # source_df.to_csv(dest_file_path, index=False)
    return source_df
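
Note: DataFrame.set_value, used throughout these examples, was deprecated in pandas 0.21 and removed in pandas 1.0; .at[row, col] is the label-based scalar replacement. A minimal, hedged sketch of the same per-row write follows (str.split stands in for the external tokenizer and URL/entity helpers used above):

# Hedged sketch, not part of the original example: the same scalar write on
# pandas >= 1.0, where set_value has been removed.
import pandas as pd

df = pd.DataFrame({'text': ['hello  world', 'foo bar baz']})
for index, row in df.iterrows():
    tokens = str(row['text']).split()          # stand-in for tokenizer.tokenize(...)
    df.at[index, 'text'] = ' '.join(tokens)    # was: df.set_value(index, 'text', ...)
print(df)
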
Example #2
def get_bg_freq(string, step, show=False):
    """
    Get dictionary of frequencies of bigrams

    :param string: text
    :param step: cycle step (1 or 2)
    :param show: show frequencies in matrix with symbols indices (bool)
    :return: dictionary
    """

    f_dict = {}

    for i in range(0, len(string) - 1, step):
        if string[i] + string[i + 1] in f_dict:
            f_dict[string[i] + string[i + 1]] += 1
        else:
            f_dict[string[i] + string[i + 1]] = 1

    if show:
        df = DataFrame(data=0, index=list(ALPHABET), columns=list(ALPHABET))

        for i in f_dict:
            df.set_value(i[0], i[1], f_dict[i])

        set_option('display.max_columns', 10)
        print(df)

    return f_dict
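
For comparison, the same bigram counting can be written with collections.Counter; a small hedged sketch (the ALPHABET matrix display from the show branch is omitted):

from collections import Counter

def bigram_freq(string, step=1):
    # Counts character bigrams; step=1 counts overlapping pairs, step=2 disjoint pairs.
    return Counter(string[i:i + 2] for i in range(0, len(string) - 1, step))

print(bigram_freq("abracadabra"))  # Counter({'ab': 2, 'br': 2, 'ra': 2, 'ac': 1, ...})
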
Example #3
def features6to8func():

	# print "start"
	start = timeit.default_timer()
	data1 = pd.read_csv('Phase1_data/testMod.csv' , sep=',', low_memory=False, usecols=[1,2,3,4,5,6])
	# print "loaded"
	# F6: Maximum No of Clicks on any item
	#Number of clicks per item in a session
	datagrp = (data1.groupby(['Session_ID','Item_ID']).size())
	datagrpframe=datagrp.reset_index()
	#print ("No of clicks per item in a session")#datagrpgrame
	#maximum no of clicks on one item
	data2= DataFrame({'MaxClicksItems': datagrpframe.groupby(['Session_ID'],sort=False)[0].max()}).reset_index()
	# print "done6"

	#F7: No of Distinct Items in a Session
	data5=DataFrame({'DistinctItems': datagrpframe.groupby(['Session_ID'],sort=False).size()}).reset_index()
	data2['Distinct_Items']=data5['DistinctItems']
	# print "done7"

	#F8: Session Time
	data3=DataFrame(data1.groupby(['Session_ID']).max()).reset_index()
	# print "done 8.1"  
	data4=DataFrame(data1.groupby(['Session_ID']).min()).reset_index()
	# print "done 8.2"    
	data3['Session_Time_temp']=(pd.to_datetime(data3.Timestamp)-pd.to_datetime(data4.Timestamp))
	for i, row in data3.iterrows(): 
	    data3.set_value(i,'Session_Time',data3.iloc[i,6].total_seconds()/60)
	data2['Session_Time']=data3.Session_Time

	stop = timeit.default_timer()
	# print stop - start 

	data2.to_csv("Phase1_data/features6-8.csv",sep=',')
def update_predecessoras(df_crono: pd.DataFrame):
    ###   Updates the predecessor relationships in the schedules
    df_iter = df_crono
    for index, item in df_iter.iterrows():
        ids_predec = ''
        str_predecs = ''
        if str(item['Predecessoras']).find("segue") >= 0:
            ls_predec = get_num_precedencia(item['Predecessoras'])

            # build the list of predecessors
            # TODO: fix the failure with multiple predecessors
            #for el in ls_predec:
            #if not df_crono[df_crono['número da demanda'] == el].empty:
            #ids_predec = str(ids_predec + get_index_elem_crono(el, df_crono) + ',')
            #ids_predec = str(ids_predec + str(df_crono[df_crono["número da demanda"] == el].index[0]) + ',')

            ids_predec = df_crono[df_crono["número da demanda"].isin(
                ls_predec)].index.values.tolist()

            str_predecs = str(ids_predec).strip('[]')

            # remove the trailing comma from the end of the list
            if str_predecs[-1:] == ',':
                str_predecs = str_predecs[:-1]

        df_crono.set_value(index, 'Predecessoras', str_predecs)

    return df_crono
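
A hedged sketch of the same index lookup with synthetic data (ls_predec here is a made-up list standing in for get_num_precedencia's output); joining with ', '.join avoids the trailing-comma cleanup above:

import pandas as pd

# Hypothetical stand-in for df_crono; only the isin lookup and join are illustrated.
df_crono = pd.DataFrame({"número da demanda": [101, 102, 103]})
ls_predec = [101, 103]

ids_predec = df_crono[df_crono["número da demanda"].isin(ls_predec)].index.tolist()
str_predecs = ', '.join(str(i) for i in ids_predec)  # '0, 2' -- no trailing comma to strip
print(str_predecs)
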
Example #5
def exercise_2_b(degree, SUPG=False):
    mu_values = [1, 0.1, 0.01]
    N_values = [8, 16, 32, 64]

    errors_L2 = DataFrame(index=N_values, columns=mu_values)
    errors_H1 = DataFrame(index=N_values, columns=mu_values)

    for mu in mu_values:
        for N in N_values:
            u_numerical, V, omega = solve_system_two(N=N,
                                                     mu=mu,
                                                     degree=degree,
                                                     SUPG=SUPG)
            u_exact = Expression(
                '(exp(1 / mu * x[0]) - 1) / (exp(1 / mu) - 1)',
                mu=mu,
                degree=degree)

            L2 = errornorm(u_exact, u_numerical, 'L2', degree_rise=3)
            H1 = errornorm(u_exact, u_numerical, 'H1', degree_rise=3)

            errors_L2.set_value(N, mu, L2)
            errors_H1.set_value(N, mu, H1)

    return errors_L2, errors_H1
Example #6
def Impute(data_as_DataFrame, kNNGraph, Method = IgnoringNan.mean, target = None ):
    """Impute(data_as_DataFrame,Graph) -> pandas DataFrame with nan's imputed
    
    Imputation is via Graph Neighborhoods of kNNGraph
    Method is applied to each neighborhood array of values for a 
    vertex with an nan
    
    Note: data_as_DataFrame can also be a numpy array 
    """
    
    try:
        data_as_DataFrame.columns
        data_as_DataFrame.index
    
        DFrame = data_as_DataFrame.copy()
    except AttributeError:
        DFrame = DataFrame( data_as_DataFrame )
        
    cols = DFrame.columns
    inds = DFrame.index
    Data = DFrame.as_matrix()
    
    m,n = DFrame.shape
    for i in range(m):
        nbrs = kNNGraph.neighbors(i)
        for j in range(n):
            if( isnan( Data[i,j] ) ):
                 DFrame.set_value( inds[i],cols[j], int( Method( array( [Data[nbr,j] for nbr in nbrs] ) ) ) )
    return DFrame
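
On pandas 1.0+, as_matrix and set_value are gone; a hedged sketch of the same neighborhood imputation with to_numpy() and .at (the neighbors callable is a stand-in for kNNGraph.neighbors):

import numpy as np
import pandas as pd

def impute_with_neighbors(df: pd.DataFrame, neighbors) -> pd.DataFrame:
    # Fill each NaN with the mean of the same column over the vertex's graph neighbors.
    out = df.copy()
    data = out.to_numpy(dtype=float)
    for i in range(out.shape[0]):
        nbrs = neighbors(i)
        for j in range(out.shape[1]):
            if np.isnan(data[i, j]):
                out.at[out.index[i], out.columns[j]] = np.nanmean(data[nbrs, j])
    return out

demo = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
print(impute_with_neighbors(demo, neighbors=lambda i: [k for k in range(3) if k != i]))
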
class ClusterRecorder:
    """
    设置记录类
    :return:
    """
    def __init__(self, dataset):
        self.dataset = dataset
        try:
            self.recorder_csv = pandas.read_csv(
                Properties.getDefaultDataFold() + "/csv/recorder_csv_" +
                self.dataset + ".csv")
        except OSError:
            self.recorder_csv = DataFrame([],
                                          columns=[
                                              'id', 'start', 'end', 'd_c',
                                              'max_distance_c', 'dataset',
                                              'pile_size', 'H', 'note'
                                          ])

    def setValue(self, row, columns, value):
        self.recorder_csv.set_value(row, columns, value)
        self.recorder_csv.set_value(row, 'end', Properties.name_str_FULL())

    def save(self):
        self.recorder_csv.to_csv(Properties.getDefaultDataFold() +
                                 "/csv/recorder_csv_" + self.dataset + ".csv")
Example #8
    def calculate(self, metric, out_filepath):

        metric_fun = self.metric_dict[metric]

        cid_and_embedding = pd.read_csv(self.drug_list_filepath)
        cid_list = cid_and_embedding['cid'].values
        embedding_list = cid_and_embedding['entity'].values
        embedding_list = [
            str_2_float_list(embedding) for embedding in embedding_list
        ]

        cid2embedding = dict(zip(cid_list, embedding_list))

        assert len(cid_list) == NUM_DRUGS
        data = np.zeros(shape=(NUM_DRUGS, NUM_DRUGS), dtype=np.float32)

        frame = DataFrame(data, columns=cid_list, index=cid_list)
        columns = frame.columns
        for row_cid, row in frame.iterrows():
            row_vector = cid2embedding[row_cid]
            for col_cid in columns:
                if row_cid == col_cid:
                    continue
                col_vector = cid2embedding[col_cid]
                try:
                    sim = metric_fun(row_vector, col_vector)
                except ValueError:
                    print(row_cid, col_cid)
                    return
                try:
                    frame.set_value(row_cid, col_cid, sim)
                except KeyError:
                    print(row_cid, col_cid)
                    break
        frame.to_csv(out_filepath)
Example #9
def gonzales(data, k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #looping k-1 time to have k centers
    for k_cycle in range(1, k + 1):
        # varibles to save the next center to be chosen based on the maximum distance a point makes within its cluster
        max_distance = 0
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center
        for indexp, p in points_list.iterrows():
            #variables to save the choose the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(
                    center.as_matrix(columns=[0, 1]),
                    p.as_matrix(columns=[0, 1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp

        centers_list = centers_list.append(
            points_list.ix[[next_cluster], :distance_column_index])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1, inplace=True)

    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0, 1])
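
This is the Gonzalez farthest-first traversal for k-center; a hedged numpy-only sketch of the same selection rule, without the pandas bookkeeping:

import numpy as np

def farthest_first_centers(points: np.ndarray, k: int) -> np.ndarray:
    # Start from point 0, then repeatedly add the point farthest from its nearest chosen center.
    centers = [0]
    dist = np.linalg.norm(points - points[0], axis=1)   # distance to the closest center so far
    for _ in range(1, k):
        nxt = int(np.argmax(dist))
        centers.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(points - points[nxt], axis=1))
    return points[centers]

pts = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 0.0], [10.0, 1.0]])
print(farthest_first_centers(pts, k=2))  # [[0, 0], [10, 1]]
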
Example #10
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''
    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(), index.column()]))

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        if hasattr(value, 'toPyObject'):
            # PyQt4 gets a QVariant
            value = value.toPyObject()
        else:
            # PySide gets a unicode string
            dtype = self.df[col].dtype
            if dtype != object:
                value = None if value == '' else dtype.type(value)
        self.df.set_value(row, col, value)
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
Example #11
def __main__():
    print("Libraries initialized")
    print("Loading data")
    prompts, responses = processCAHData()
    print("Data loaded")
    print("Defining model")
    model = defineModel()
    print("Compiling model")
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
    model.compile(optimizer=sgd,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    try:
        model.load_weights("weights.hdf5")
    except OSError:
        pass
    except ValueError:
        pass
    print("Cards Against Humanity Generator")
    print("Scale of Hilarity")
    print("1 - Not Funny")
    print("2 - Kind of Funny/Eh")
    print("3 - Funny")
    print("")
    x = DataFrame(columns=[0, 1, 2])
    y = DataFrame(columns=[0])
    prompt_id, resp_id, sent = generateSentence(prompts, responses)
    rating = rate(prompts, responses)
    x.loc[0] = [prompt_id, resp_id, sent]
    y.loc[0] = rating
    prompt_id, resp_id, sent = generateSentence(prompts, responses)
    rating = rate(prompts, responses)
    x.loc[1] = [prompt_id, resp_id, sent]
    y.loc[1] = rating
    y = y.reset_index(drop=True)
    x = x.reset_index(drop=True)
    print("")
    print("X :")
    print(x)
    print("Y :")
    print(y)
    model.fit(x, y, batch_size=1, epochs=1)
    model.save_weights("weights.hdf5")
    while True:
        x = DataFrame()
        y = DataFrame(columns=['y'])
        print("")
        prompt_id, resp_id, sent = generateSentence(prompts, responses)
        x = x.append(Series([prompt_id, resp_id, sent]))
        prediction = model.predict(x, batch_size=1)
        print("Model Prediction : ")
        prediction = prediction[0]
        print("1 : " + str(prediction[0]))
        print("2 : " + str(prediction[1]))
        print("3 : " + str(prediction[2]))
        rating = rate(prompts, responses)
        y.set_value(0, 'y', rating)
        model.fit(x, y, batch_size=1, epochs=1)
        model.save_weights("weights.hdf5")
Example #12
def compute_convergence(error_table, N_vals, l_vals):
    convergence_table = DataFrame(index=l_vals, columns=['alpha'])
    for column, lam in zip(error_table.transpose().get_values(), l_vals):
        h_log = [np.log(1.0 / n) for n in N_vals]
        error_log = error_table.applymap(np.log)
        convergence_rate = np.polyfit(h_log, error_log[lam], deg=1)[0]
        convergence_table.set_value(lam, 'alpha', convergence_rate)
    return convergence_table
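
The alpha returned above is the slope of log(error) versus log(h); a tiny self-contained check recovering alpha of about 2 from synthetic errors that scale like h squared:

import numpy as np

# Hedged sketch: estimate the convergence rate from synthetic O(h^2) errors.
N_vals = [8, 16, 32, 64]
h = [1.0 / n for n in N_vals]
errors = [step ** 2 for step in h]
alpha = np.polyfit(np.log(h), np.log(errors), deg=1)[0]
print(round(alpha, 3))  # ~2.0
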
def volunteerListToSchedule(volunteerList, numDays):
    schedule = DataFrame()
    for day in range(numDays):
        row = 0
        for volunteer in volunteerList:
            schedule.set_value(row, day, volunteer.schedule[day])
            row = row + 1
    return schedule
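
Growing an empty DataFrame cell by cell relies on set_value enlarging the frame; a hedged sketch that builds the same volunteers-by-days layout from a nested list (Volunteer here is a made-up stand-in):

import pandas as pd

class Volunteer:  # hypothetical stand-in for the original volunteer objects
    def __init__(self, schedule):
        self.schedule = schedule

def volunteer_list_to_schedule(volunteer_list, num_days) -> pd.DataFrame:
    # Rows are volunteers, columns are days, matching the set_value loop above.
    return pd.DataFrame([v.schedule[:num_days] for v in volunteer_list],
                        columns=range(num_days))

vols = [Volunteer([1, 0, 1]), Volunteer([0, 1, 1])]
print(volunteer_list_to_schedule(vols, num_days=3))
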
Example #14
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''
    def __init__(self):
        super(DataFrameModel,self).__init__()
        self.df = DataFrame()

    def setDataFrame(self,dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------


    def headerData(self,section,orientation,role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]                
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(),index.column()]))

    def flags(self, index):
            flags = super(DataFrameModel, self).flags(index)
            flags |= Qt.ItemIsEditable
            return flags

    def setData(self, index, value, role):
        self.df.set_value(self.df.index[index.row()],
                          self.df.columns[index.column()],
                          value.toPyObject())
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
Example #15
def gonzales(data , k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 =     points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #looping k-1 time to have k centers
    for k_cycle in range(1,k+1):
        # varibles to save the next center to be chosen based on the maximum distance a point makes within its cluster
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables to save the choose the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
Example #16
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''
    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                #return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(), index.column()]))

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):

        self.df.set_value(str(self.df.index[index.row()]),
                          str(self.df.columns[index.column()]),
                          value.toPyObject())
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
Example #17
 def updatePrice(self, stockDataFrame: pd.DataFrame):
     priceList = {}
     for index, row in stockDataFrame.iterrows():
         stock = stocktw(index)
         priceList[index] = stock.getPrice()
         print(index, priceList[index])
     ds = pd.Series(priceList)
     for i, row in ds.iteritems():
         stockDataFrame.set_value(i, "股價", ds[i])
     return stockDataFrame
def common_gene_pickle_assembler(gene_id_list, filename):
    score_list_folder = "TFbinding"
    folder_location = path.join(current_directory, score_list_folder)
    folder_contents = os.listdir(folder_location)
    tf_csv_folder = "Gene_CSV"
    tf_csv_folder_location = path.join(current_directory, tf_csv_folder)
    ColumnList = []
    tf_id_dict = {}
    total_genes_score_dict = {}
    for gene_id in gene_id_list:
        gene = idconverter.getgene(gene_id).SGDID
        ColumnList += [
            '{} Number of Hits'.format(gene_id),
            '{} Total Sum of Scores'.format(gene_id)
        ]
        if 'gene_{}_score_list.pickle'.format(gene) in folder_contents:
            gene_pickle_file = open(
                path.join(folder_location,
                          'gene_{}_score_list.pickle'.format(gene)), 'rb')
            gene_dict = pickle.load(gene_pickle_file)
            gene_pickle_file.close()
            tf_ids = []
            tf_score_dict = {}
            for tf_id in gene_dict.keys():
                total_sum_scores = sum(gene_dict[tf_id])
                number_binding_sites = len(np.atleast_1d(gene_dict[tf_id]))
                tf_score_dict[tf_id] = [number_binding_sites, total_sum_scores]
                tf_ids += [tf_id]
            tf_id_dict[gene_id] = set(tf_ids)
            total_genes_score_dict[gene_id] = tf_score_dict
    tf_sets = tf_id_dict.values()
    common_tfs = set.intersection(*tf_sets)
    common_tfs = list(common_tfs)
    df = DataFrame(columns=['TF Feature Name', 'TF Common Name'] + ColumnList +
                   ['Medline'] + ['TF Description'])
    for tf_id in common_tfs:
        tf_description = idconverter.getgene(tf_id).description
        tf_common_name = idconverter.getgene(tf_id).common_name
        tf_feature_name = idconverter.getgene(tf_id).feature_name
        tf_medline = motif[tf_id].medline
        tf_medline_url = 'www.pubmed.com/{}'.format(tf_medline)
        df.set_value(tf_id, 'TF Feature Name', tf_feature_name)
        df.set_value(tf_id, 'TF Common Name', tf_common_name)
        df.set_value(tf_id, 'TF Description', tf_description)
        df.set_value(tf_id, 'Medline', tf_medline_url)
        for gene_id in total_genes_score_dict.keys():
            if tf_id in total_genes_score_dict[gene_id].keys():
                df.set_value(tf_id, '{} Number of Hits'.format(gene_id),
                             total_genes_score_dict[gene_id][tf_id][0])
                df.set_value(tf_id, '{} Total Sum of Scores'.format(gene_id),
                             total_genes_score_dict[gene_id][tf_id][1])
    genes_csv_filename = filename
    genes_csv = path.join(tf_csv_folder_location,
                          "{}.csv".format(genes_csv_filename))
    df.to_csv(genes_csv)
Example #19
def median_over_months(raw):
    n_years = len(raw.index)
    patterns = unique([date.split(' ')[1] for date in raw.columns[1:]])
    n_patterns = len(patterns)
    medians = DataFrame(np.zeros((n_years,1+n_patterns)), columns=['year']+patterns)
    medians['year'] = raw['year']
    for i_year in range(0, n_years):
        for i_pattern in range(0, n_patterns):
            columns_for_this_day = [col for col in raw.columns[1:] if col.split(' ')[1] == patterns[i_pattern]]
            medians.set_value(i_year, patterns[i_pattern], median(raw.iloc[i_year][columns_for_this_day]))
    return medians
Example #20
def linear_elasticity(polynomial_order=1):

    l_values = [1, 10, 100, 1000]
    N_values = [8, 16, 32, 64]
    mu = 1
    error_table = DataFrame(index=N_values, columns=l_values)
    for lam in l_values:
        for N in N_values:
            error_table.set_value(
                N, lam, solver(N=N, lam=lam, mu=mu, degree=polynomial_order))
    convergence = compute_convergence(error_table, N_values, l_values)

    return error_table, convergence
Example #21
def sum_over_patterns(raw, new_name=None):
     n_years = len(raw.index)
     sums = DataFrame(np.zeros((n_years,1+12)), columns=['year']+months)
     sums['year'] = raw['year']
     for i_year in range(0, n_years):
         for i_month in range(0, 12):
             columns_for_this_month = [col for col in raw.columns[1:] if col.split(' ')[0] == months[i_month]]
             sums.set_value(i_year, months[i_month], sum(raw.iloc[i_year][columns_for_this_month]))
     
     if new_name != None:
         sums.columns = ['year'] +  [col+' '+new_name for col in sums.columns if col != 'year']
         
     return sums
Example #22
def fixup_initdb_data(df: pd.DataFrame):
    # Initdb spawns 5 postgres processes, rename those to have a different progname so we can measure all
    for i, row in df.iterrows():
        progname = str(row['progname'])
        count = i % 6
        if progname == 'postgres':
            assert count != 5, "every sixth row should have name initdb"
            new_progname = progname + "-child-" + str(count + 1)
            df.set_value(i, 'progname', new_progname)
        else:
            # the final row of each run should be initdb
            assert count == 5, count
            assert progname == "initdb", progname
    return df
Example #23
    def compute_tf_idf_queries(self):
        # Find total number of document
        results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, total_word, path FROM documents')
        tmp = results.fetchall()
        documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
        documents_df['tf_idf'] = 0.0

        no_docterm = {}

        for query in self.queries:
            no_docterm[query] = 0

        for index, row in documents_df.iterrows():
            path = row['path']
            with codecs.open(path, 'rt') as f:
                text = f.read()
                for query in self.queries:
                    if query in text.decode('utf-8').lower():
                        no_docterm[query] += 1

        for query in self.queries:
            for index, row in documents_df.iterrows():
                total_word = row['total_word']
                path = row['path']

                with codecs.open(path, 'rt') as f:
                    text = f.read()

                tf_idf = self._compute_tf_idf_queries(text, total_word, total_doc, no_docterm[query])
                cur_tf_idf = documents_df.get_value(index, 'tf_idf')
                documents_df.set_value(index, 'tf_idf', cur_tf_idf + tf_idf)

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df['tf_idf'] = 0.0

        for index, row in df.iterrows():
            did = row['did']
            tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
            df.set_value(index, 'tf_idf', tf_idf)

        del df['did']
        df = df.groupby(['e_type', 'entity']).sum().reset_index()
        return df
Example #24
 def fill_encoded(self, df: pd.DataFrame, fld):
     field_name = fld['name']
     field_encoding = fld['encoding']
     field_id = fld['id']
     for row in df.itertuples():
         plaintext = getattr(row, field_name)
         field = FieldExtra.select(
             FieldExtra).where((FieldExtra.FieldId == field_id)
                               & (FieldExtra.Value == plaintext)).first()
         if field is not None:
             df.set_value(row.Index, field_name, field.Key)
         else:
             encoder = self.val_encoders[field_encoding]
             df = encoder(plaintext, fld)
     return df
Example #25
def median_over_months(raw):
    n_years = len(raw.index)
    patterns = unique([date.split(' ')[1] for date in raw.columns[1:]])
    n_patterns = len(patterns)
    medians = DataFrame(np.zeros((n_years, 1 + n_patterns)),
                        columns=['year'] + patterns)
    medians['year'] = raw['year']
    for i_year in range(0, n_years):
        for i_pattern in range(0, n_patterns):
            columns_for_this_day = [
                col for col in raw.columns[1:]
                if col.split(' ')[1] == patterns[i_pattern]
            ]
            medians.set_value(i_year, patterns[i_pattern],
                              median(raw.iloc[i_year][columns_for_this_day]))
    return medians
Example #26
 def get_items_history(self, **kwargs):
     zhost = self._do_request(
         'host.get', {
             'output': ['hostid', 'snmp_available', 'snmp_error'],
             'filter': {
                 'host': [kwargs.get('host', 0)]
             }
         })
     if not zhost: return kwargs
     kwargs.update(zhost[0])
     items = DataFrame(
         self._do_request(
             'item.get', {
                 'output': [
                     'hostid', 'itemid', 'name', 'key_', 'value_type',
                     'lastclock', 'lastvalue'
                 ],
                 'hostids': [h['hostid'] for h in zhost]
             }))
     items = items[items.key_.str.contains(kwargs.get('key', 'icmpping'),
                                           regex=True,
                                           na=False)]
     if items.empty: return kwargs
     if kwargs.get('time_from') and kwargs.get('time_till'):
         kwargs['period'] = kwargs['time_till'] - kwargs['time_from']
         for index, item in items.iterrows():
             history = DataFrame(
                 self._do_request(
                     'history.get', {
                         'history': item['value_type'],
                         'itemids': [item['itemid']],
                         'time_from': kwargs['time_from'],
                         'time_till': kwargs['time_till'],
                         'sortfield': "clock"
                     }))
             if history.empty: continue
             history.clock = to_numeric(history.clock, errors='coerce')
             history.value = to_numeric(history.value, errors='coerce')
             if kwargs.get('workonly', []):
                 history = history.loc[history['clock'].isin(
                     Zapi.filterWorkTimestamp(history.clock.tolist(),
                                              **kwargs))]
             items.set_value(index, 'max', history.value.max())
             items.set_value(index, 'min', history.value.min())
             items.set_value(index, 'avg', history.value.mean())
     kwargs['items'] = items.fillna(0).to_dict(orient='records')
     return kwargs
def update(test_items_dataframe: pd.DataFrame,
           test_item: TesTItem) -> pd.DataFrame:
    """Updates the test_items_dataframe with the result and exception of the test_item

    Arguments:
        test_items_dataframe {pd.DataFrame} -- A DataFrame of TesTItems to be updated
        test_item {TesTItem} -- The TesTItem to use for the update

    Returns:
        pd.DataFrame -- A new, updated dataframe
    """
    test_items_dataframe = test_items_dataframe.set_index("test")
    test_items_dataframe = test_items_dataframe.set_value(
        test_item.name, "result", test_item.result_str)
    test_items_dataframe = test_items_dataframe.set_value(
        test_item.name, "exception", str(test_item.exception))
    test_items_dataframe = test_items_dataframe.reset_index("test")
    return test_items_dataframe
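
Unlike most examples here, this one relies on set_value returning the DataFrame; a hedged sketch of the same update with .at for pandas 1.0+ (the three columns are taken from the docstring above):

import pandas as pd

def update_at(df: pd.DataFrame, name, result_str, exception) -> pd.DataFrame:
    # .at mutates in place, so reassignment is only needed around set_index/reset_index.
    df = df.set_index("test")
    df.at[name, "result"] = result_str
    df.at[name, "exception"] = str(exception)
    return df.reset_index()

frame = pd.DataFrame({"test": ["t1"], "result": [""], "exception": [""]})
print(update_at(frame, "t1", "passed", None))
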
Example #28
    def _get_markup(self, para, markup_dict=None):
        """get markup """
        if not markup_dict:
            markup_dict = self._MARKUPS
            
        df = DataFrame(markup_dict, index=['Enter', 'Exit']).T
        df['In']=False
        
        sects=[]
        place=0
        while place > -1:
            place = -1
            markup = None
            estr = None
            for mark, enter in df[df.In==False].Enter.iteritems():
                find = para.find(enter)
                if find > -1 and (find<=place or place==-1):
                    if find == place and len(enter) < len(estr):
                        continue
                    place = find
                    markup = mark
                    estr = enter
            for mark, exit in df[df.In==True].Exit.iteritems():
                find = para.find(exit)
                if find > -1 and (find<=place or place==-1):
                    if find == place and len(exit) < len(estr):
                        continue
                    place = find
                    markup = mark
                    estr = exit
        
            if place > -1:
                sects.append([para[:place], df[df.In==True].index.tolist()])
                df.set_value(markup, 'In', not df.get_value(markup, 'In'))
                para = para[place+len(estr):]

        if df.In.any():
            raise ValueError(
                'the markup does not exit from;\n{}'.format(df[df.In==True]))
            
        sects.append([para, []])
                         
        return sects
Example #29
def sum_over_patterns(raw, new_name=None):
    n_years = len(raw.index)
    sums = DataFrame(np.zeros((n_years, 1 + 12)), columns=['year'] + months)
    sums['year'] = raw['year']
    for i_year in range(0, n_years):
        for i_month in range(0, 12):
            columns_for_this_month = [
                col for col in raw.columns[1:]
                if col.split(' ')[0] == months[i_month]
            ]
            sums.set_value(i_year, months[i_month],
                           sum(raw.iloc[i_year][columns_for_this_month]))

    if new_name != None:
        sums.columns = ['year'] + [
            col + ' ' + new_name for col in sums.columns if col != 'year'
        ]

    return sums
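
A hedged sketch of the same per-month column sum done by grouping the column labels on their month prefix, with a two-row synthetic frame in place of raw:

import pandas as pd

raw_demo = pd.DataFrame({'year': [2001, 2002],
                         'Jan a': [1, 2], 'Jan b': [3, 4], 'Feb a': [5, 6]})
value_cols = raw_demo.columns[1:]
# Group the value columns by the month prefix of their label and sum across each group.
sums_demo = raw_demo[value_cols].T.groupby(lambda col: col.split(' ')[0]).sum().T
sums_demo.insert(0, 'year', raw_demo['year'])
print(sums_demo)
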
 def userVectors(self, client):
     # return a one hot dataframe with subreddit as column and user as row
     redditors = [
         redditor['username']
         for redditor in self.client.subRec.users.find()
     ]
     subs = [sub['name'] for sub in self.client.subRec.subs.find()]
     df = DataFrame(0, index=redditors, columns=subs)
     for user in self.client.subRec.users.find():
         df = df.set_value(user['username'], user['subreddit'], 1)
     return df
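
The same one-hot user-by-subreddit matrix can be built with pd.crosstab; a hedged sketch using made-up records in place of the MongoDB cursor:

import pandas as pd

records = [{'username': 'alice', 'subreddit': 'python'},
           {'username': 'bob',   'subreddit': 'dataisbeautiful'},
           {'username': 'alice', 'subreddit': 'pandas'}]
df = pd.DataFrame(records)
# crosstab counts occurrences; clip(upper=1) turns counts into 0/1 indicators.
one_hot = pd.crosstab(df['username'], df['subreddit']).clip(upper=1)
print(one_hot)
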
Example #31
def fix_event_type(df: DataFrame):
    '''
    Not sure yet.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''

    a = time.time()

    colsf = df['id'].ravel()  # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []  # list of unique counts (UNUSED)
    counts_bucket = []  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)  # create dummy variables
    todrop = df.sum() < 50  # get columns where sum of dummy column < 50
    dropcols = df.columns[todrop]  # get those column names
    df = df.drop(dropcols, axis=1)  # drop those columns
    df['num_events'] = 0  # create number of events columns, set to 0
    # print(df.columns)
    print(str(len(unique)))

    for ii in range(0, len(unique)):  # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]  # subset by that ID
        the_dummies = subset.columns != 'id'  # get all columns that do not equal that ID
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist(
        )  # get all of those columns to list
        event_sum = np.sum(aa)  # sum all of those

        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:,subset.columns != 'id'] = aa
        df = df.set_value(subset.index, the_dummies, aa)
        df = df.set_value(subset.index, 'num_events', event_sum)
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b - a)
    return df
Example #32
def exercise_1_b(degree):
    """
    returns the L2 and H1 errors when using lagrange elements of given degree.
    """
    frequencies = [1, 10]
    N_values = [8, 16, 32, 64]

    errors_L2 = DataFrame(index=N_values, columns=frequencies)
    errors_H1 = DataFrame(index=N_values, columns=frequencies)

    for k in frequencies:
        for N in N_values:
            u_numerical, V, omega = solve_system_one(N=N, k=k, degree=degree)
            u_exact = Expression('sin(k*pi*x[0])*cos(k*pi*x[1])',
                                 k=k,
                                 degree=degree)
            L2 = errornorm(u_exact, u_numerical, 'l2', degree_rise=3)
            H1 = errornorm(u_exact, u_numerical, 'h1', degree_rise=3)

            errors_L2.set_value(N, k, L2)
            errors_H1.set_value(N, k, H1)

    return errors_L2, errors_H1
Example #33
def fix_event_type(df: DataFrame):
    '''
    Not sure yet.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''

    a = time.time()

    colsf = df['id'].ravel()            # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []                       # list of unique counts (UNUSED)
    counts_bucket = []                  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)             # create dummy variables
    todrop = df.sum() < 50              # get columns where sum of dummy column < 50
    dropcols = df.columns[todrop]       # get those column names
    df = df.drop(dropcols, axis=1)      # drop those columns
    df['num_events'] = 0                # create number of events columns, set to 0
    # print(df.columns)
    print(str(len(unique)))

    for ii in range(0,len(unique)):     # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]     # subset by that ID
        the_dummies = subset.columns != 'id'        # get all columns that do not equal that ID
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist()  # get all of those columns to list
        event_sum = np.sum(aa)      # sum all of those
        
        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:,subset.columns != 'id'] = aa
        df = df.set_value(subset.index, the_dummies, aa)
        df = df.set_value(subset.index, 'num_events', event_sum)
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b-a)
    return df
Example #34
def fsev_count(df: DataFrame, fsev: int, feature: str, train: bool,
               blist: list, bidx: int):

    colname = 'fsev_' + str(fsev) + '_' + str(feature)
    if train:
        a = df[df['fault_severity'] == fsev]
        b = a[feature].value_counts()[0:60]
        blist = b.tolist()
        bidx = b.index
        bdf = pd.DataFrame(b)
    df[colname] = 0
    # subset = df.loc[df.location.isin(a.index)]
    for i in range(0, len(blist)):
        percentile = blist[i] / np.sum(blist)
        locstr = str(bidx[i])
        subset = df.location == locstr
        df = df.set_value(df.location == locstr, colname, percentile)
    rval = df
    if train:
        rval = [df, blist, bidx]
    return rval
Example #35
def fsev_count(df: DataFrame, fsev: int,
               feature: str, train: bool,
               blist: list, bidx: int):

    colname = 'fsev_' + str(fsev) + '_' + str(feature)
    if train:
        a = df[df['fault_severity'] == fsev]
        b = a[feature].value_counts()[0:60]
        blist = b.tolist()
        bidx = b.index
        bdf = pd.DataFrame(b)
    df[colname] = 0
    # subset = df.loc[df.location.isin(a.index)]
    for i in range(0,len(blist)):
        percentile = blist[i]/np.sum(blist)
        locstr = str(bidx[i])
        subset = df.location == locstr
        df = df.set_value(df.location == locstr, colname, percentile)
    rval = df
    if train:
        rval = [df, blist,bidx]
    return rval
cf_dict=pd.read_csv('SourceData/county_facts_dictionary.csv')
cf_dict=cf_dict.set_index('column_name')
#pivoting and drop Null values for clean and easy analysis
pr_piv= pr[['fips', 'candidate','fraction_votes']].pivot(index='fips', columns='candidate', values='fraction_votes')
pr_piv.drop(' No Preference', axis=1, inplace=True)
pr_piv.drop(' Uncommitted', axis=1, inplace=True)
pr_facts=pd.merge(pr_piv, facts, right_index=True, left_index=True)
pr_facts=pr_facts.dropna()
c=pr[['candidate','party']].drop_duplicates().sort_values(by=['candidate','party'])
t=c[['candidate','party']].apply(tuple, axis=1).tolist()
d=dict(t)

#skipy linregress
l=len(pr_facts.columns)
linregress_unpiv = DataFrame('',index=range(l),columns=['party','candidate','fact','Rvalue','Pvalue','StdError','Slope','Intercept'])
i=0
for c_X in pr_piv.columns:
  for c_Y in cf_dict.index:
    R=linregress(pr_facts[[c_X,c_Y]])
    #
    linregress_unpiv.set_value(i,'party',d[c_X])
    linregress_unpiv.set_value(i,'candidate',c_X)
    linregress_unpiv.set_value(i,'fact',c_Y)
    linregress_unpiv.set_value(i,'Rvalue',R.rvalue)
    linregress_unpiv.set_value(i,'Pvalue',R.pvalue)
    linregress_unpiv.set_value(i,'StdError',R.stderr)
    linregress_unpiv.set_value(i,'Slope',R.slope)
    linregress_unpiv.set_value(i,'Intercept',R.intercept)
    i+=1
linregress_unpiv.to_csv('DataForTableau/primary_results_county_facts_linregress.csv')
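
Instead of growing the result frame with set_value, the same table can be built by collecting one dict per (candidate, fact) pair; a hedged sketch with small synthetic stand-ins for pr_piv and the county facts:

import pandas as pd
from scipy.stats import linregress

pr_piv_demo = pd.DataFrame({'A': [0.1, 0.4, 0.5], 'B': [0.9, 0.6, 0.5]})
facts_demo = pd.DataFrame({'income': [30, 40, 55], 'age': [31, 44, 39]})

rows = []
for c_X in pr_piv_demo.columns:
    for c_Y in facts_demo.columns:
        R = linregress(facts_demo[c_Y], pr_piv_demo[c_X])
        rows.append({'candidate': c_X, 'fact': c_Y,
                     'Rvalue': R.rvalue, 'Pvalue': R.pvalue,
                     'StdError': R.stderr, 'Slope': R.slope,
                     'Intercept': R.intercept})
print(pd.DataFrame(rows))
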
Example #37
    def fit(self, annotations):
        """

        Parameters
        ----------
        annotations : (Annotation, Annotation) iterator

        Returns
        -------


        """

        # possible_match[n, m] is the total possible match duration
        # when there are n A-tracks & m B-tracks
        possible_match = DataFrame()

        # actual_match[n, m] is the total actual match duration
        # when there are n A-tracks & m B-tracks
        actual_match = DataFrame()

        # overlap[n, m] is the total duration
        # when there are n A-tracks & m B-tracks
        overlap = DataFrame()

        for n, (A, B) in enumerate(annotations):

            assert isinstance(A, Annotation), "%r is not an Annotation" % A
            assert isinstance(B, Annotation), "%r is not an Annotation" % B
            if n == 0:
                self.modalityA = A.modality
                self.modalityB = B.modality
            else:
                assert A.modality == self.modalityA, \
                    "bad modality (%r, %r)" % (self.modalityA, A.modality)
                assert B.modality == self.modalityB, \
                    "bad modality (%r, %r)" % (self.modalityB, B.modality)
            assert A.uri == B.uri, \
                "resource mismatch (%r, %r)" % (A.uri, B.uri)

            timeline, a, b = self._AB2ab(A, B)

            for segment in timeline:

                duration = segment.duration

                # number of tracks
                atracks = a.tracks(segment)
                Na = len(atracks)
                btracks = b.tracks(segment)
                Nb = len(btracks)

                if Na == 0 or Nb == 0:
                    continue

                # number of matching tracks
                N = len(a.get_labels(segment) & b.get_labels(segment))

                # increment possible_match & actual_match
                try:
                    p_m = possible_match.get_value(Na, Nb)
                    a_m = actual_match.get_value(Na, Nb)
                    ovl = overlap.get_value(Na, Nb)
                except Exception:
                    p_m = 0.
                    a_m = 0.
                    ovl = 0.

                possible_match = possible_match.set_value(Na, Nb,
                                                          p_m + min(Na, Nb)*duration)
                actual_match = actual_match.set_value(Na, Nb,
                                                      a_m + N*duration)
                overlap = overlap.set_value(Na, Nb, ovl + duration)
import pandas as pd
from pandas import Series,DataFrame
import numpy as np

# source data
pr=pd.read_csv('primary_results.csv')
#pivoting
pr_piv= pr[['fips', 'candidate','fraction_votes']].pivot(index='fips', columns='candidate', values='fraction_votes')
pr_piv.drop(' No Preference', axis=1, inplace=True)
pr_piv.drop(' Uncommitted', axis=1, inplace=True)
pr_piv=pr_piv.dropna()
l=len(pr_piv.index)
pr_unpiv = DataFrame('',index=range(l*14),columns=['fips','fraction_votes','candidate'])

j=0
while j<len(pr_unpiv):
  for i in range(0,l-1):
    for c in pr_piv.columns:
      pr_unpiv.set_value(j, 'fips', pr_piv.index[i])
      pr_unpiv.set_value(j, 'fraction_votes', pr_piv.get_value(pr_piv.index[i],c))
      pr_unpiv.set_value(j, 'candidate', c)
      j+=1
pr_unpiv.to_csv('DataForTableau/primary_results_dropna.csv')
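
The hand-rolled unpivot loop above is what DataFrame.melt does in one call; a hedged sketch with a tiny stand-in for pr_piv:

import pandas as pd

pr_piv_demo = pd.DataFrame({'Clinton': [0.6, 0.4], 'Sanders': [0.4, 0.6]},
                           index=pd.Index([1001, 1003], name='fips'))
# melt turns one column per candidate into (fips, candidate, fraction_votes) rows.
pr_unpiv_demo = (pr_piv_demo.reset_index()
                 .melt(id_vars='fips', var_name='candidate',
                       value_name='fraction_votes'))
print(pr_unpiv_demo)
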
Example #39
def compute_contour_data(contours_bins,
                         contours_saliences,
                         contours_start_times,
                         stepNotes,
                         minF0,
                         hopsize,
                         normalize=True,
                         extra_features=None):
    from pandas import DataFrame, concat
    from numpy import mean, std, array, Inf, zeros
    """ Create contour pandas dataframe uing contour information previouslly extracted with Essentia.
    Initializes DataFrame to have all future columns.
    Parameters
    ----------
    contours_bins: set of bins of the extracted contours
    contours_saliences:  set of saliences of the extracted contours
    contours_start_times:  set of starting times of the extracted contours
    stepNotes: number of bins per semitone
    minF0: minimum F0 in the salience functions
    hopsize: Hop size
    normalize: [True, False] to normalise the features, as performed in Bittner2015
    extra_features: Ncontours * N_features
    set of extra features apart from the ones used by Bittner2015 (pitch, duration, vibrato, salience)

    Returns
    -------
    contour_data : DataFrame
        Pandas data frame with all contour data, to be used for contour classification
    """

    contours_bins = array(contours_bins)
    contours_saliences = array(contours_saliences)
    contours_start_times = array(contours_start_times)
    headers = []

    # Set of headers, containing the first 12 features [0:11] and the first time for each of the contours
    headers[0:12] = [
        'onset', 'offset', 'duration', 'pitch mean', 'pitch std',
        'salience mean', 'salience std', 'salience tot', 'vibrato', 'vib rate',
        'vib extent', 'vib coverage', 'first_time'
    ]

    # Number of contours
    Ncont = len(contours_bins)

    # Find length of longest contour
    maxLen = 0
    for i in range(Ncont):
        maxLen = max(maxLen, len(contours_bins[i]))

    # Header "first_time" can be used to find where the contour features end,
    #  and when the contour info starts (time, bin, salience)

    # Just giving the extra headers some name
    headers[13:] = (array(range(maxLen * 3))).tolist()

    # Initialising dataset, following the format from the hacked VAMP MELODIA plugin from J. Salamon
    contour_data = DataFrame(Inf * zeros([Ncont, len(headers)]),
                             columns=headers)
    contour_data.num_end_cols = 4

    for i in range(Ncont):
        #print i
        # Giving values for each row of the dataframe
        L = len(contours_saliences[i])
        # minF0 instead of 55
        pitches = 55 * 2**((array(contours_bins[i]) / (12. * stepNotes)))
        contour_data.set_value(i, 'onset', contours_start_times[i])
        contour_data.set_value(
            i, 'offset',
            array(contours_start_times[i]) + len(pitches) * hopsize)
        contour_data.set_value(i, 'duration', len(pitches) * hopsize)
        contour_data.set_value(i, 'pitch mean', mean(pitches))
        contour_data.set_value(i, 'pitch std', std(pitches))
        contour_data.set_value(i, 'salience mean',
                               mean(array(contours_saliences[i])))
        contour_data.set_value(i, 'salience std',
                               std(array(contours_saliences[i])))
        contour_data.set_value(i, 'salience tot',
                               sum(array(contours_saliences[i])))

        # In this case, we do not compute vibrato features, so we set them to 0.
        # This could be updated in order to use also vibrato features from contours extracted with Essentia
        contour_data.set_value(i, 'vibrato', 0)
        contour_data.set_value(i, 'vib rate', 0)
        contour_data.set_value(i, 'vib extent', 0)
        contour_data.set_value(i, 'vib coverage', 0)

        # After setting the features, we now give each contour the frame by frame information, e.g for frame0 (fr0), frame 1 (fr1)...
        # time_fr0, pitch_fr0, salience_fr0, time_fr1, pitch_fr1, salience_fr1, time_fr2, pitch_fr2, salience_fr2, ...

        contour_data.iloc[
            i, 12:12 +
            L * 3:3] = contours_start_times[i] + hopsize * array(range(L))
        contour_data.iloc[i, 13:13 + L * 3:3] = pitches
        contour_data.iloc[i, 14:14 + L * 3:3] = array(contours_saliences[i])

    # If extra features are used, they are set before the first_time
    # TODO: replace here with this instead of following line. Maybe pandas does not work here for me


#     contour_data = extend_contour_features(contour_data, extra_features)

    if extra_features is not None:
        sal_features_data = contour_data[headers[0:12]]
        #         frame-by-frame fetures
        frame_by_frame_features_data = contour_data[headers[12:]]
        dfFeatures = concat([sal_features_data, extra_features], axis=1)
        contour_data = concat([dfFeatures, frame_by_frame_features_data],
                              axis=1)

    # All classification labels are initialised (will be updated while performing contour classification).
    # if exist do not create
    if 'overlap' not in contour_data.columns:
        contour_data['overlap'] = -1
    if 'labels' not in contour_data.columns:
        contour_data['labels'] = -1
    if 'melodiness' not in contour_data.columns:
        contour_data['melodiness'] = -1
    if 'mel prob' not in contour_data.columns:
        contour_data['mel prob'] = -1

    # Normalising features
    if normalize:
        contour_data = cu.normalize_features(contour_data)

    print "Contour dataframe created"

    return contour_data
Example #40
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''
    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):

        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        # gde 2014.02.19 - original implementation only worked
        # if there were no missing indices.  Instead use get_value

        #return QVariant(str(self.df.ix[index.row(), index.column()]))
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        str_value = str(self.df.get_value(row, col))
        return QVariant(str_value)

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        if hasattr(value, 'toPyObject'):
            # PyQt4 gets a QVariant
            value = value.toPyObject()
        else:
            # PySide gets an unicode
            dtype = self.df[col].dtype
            if dtype != object:
                value = None if value == '' else dtype.type(value)
        self.df.set_value(row, col, value)
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
Example #41
    answers = filtered_data[filtered_data.question_id ==qid][['user_id','correct']]
    answers.columns = ['user_id','answer']
    users_subset = users.merge(answers,how='inner',on='user_id')

    #small adjustment to mean to remove the effect of the question being analized
    users_subset['mean'] = (users_subset['mean']*
                            users_subset['count']-
                            users_subset['answer'])/(users_subset['count']-1)
    
    for quant in STUDENT_QUANTILES:
        quant2 = score_percentiles[quant]
        means = users_subset.groupby(by=[users_subset['percentile'] > quant2]).agg({'answer':'mean'})
        t = str(int(quant*100))
        try:
            prob_good =means.get_value(True,'answer')
            results.set_value(qid,'good_'+t,prob_good)
        except:
            pass
        try:
            prob_bad = means.get_value(False,'answer')
            results.set_value(qid,'bad_'+t,prob_bad)
        except:
            pass

### Plot the resulting ratios
for quant in STUDENT_QUANTILES:
    t = str(int(quant*100))
    plt.plot(results['bad_'+t],results['good_'+t],'b.')
    plt.plot(np.arange(0,1.1,.1),np.arange(0,1.1,.1),'g-',alpha=.5)
    plt.title("Discrimination: "+t+"th Percentile")
    plt.ylabel("Proportion Right, Good Students")
Beispiel #42
0
pvalue=DataFrame(np.nan,index=index,columns=index)
pvalue.index.names=['Party','Candidate']
pvalue.index.lexsort_depth
pvalue.columns.lexsort_depth
#StdErr
stderr=DataFrame(np.nan,index=index,columns=index)
stderr.index.names=['Party','Candidate']
stderr.index.lexsort_depth
stderr.columns.lexsort_depth
#
for c_X in pr_piv.columns:
  for c_Y in pr_piv.columns:
    R=linregress(pr_piv[[c_X,c_Y]])
    p_X=index.get_loc_level(c_X,1)[1][0]
    p_Y=index.get_loc_level(c_Y,1)[1][0]
    rvalue.set_value((p_Y,c_Y), (p_X,c_X), R.rvalue)
    pvalue.set_value((p_Y,c_Y), (p_X,c_X),R.pvalue)
    stderr.set_value((p_Y,c_Y), (p_X,c_X), R.stderr)


#democrats only
heatmap(rvalue.loc['Democrat']['Democrat'],'dem_rvalue.png')
heatmap(pvalue.loc['Democrat']['Democrat'],'dem_pvalue.png')
heatmap(stderr.loc['Democrat']['Democrat'],'dem_stderr.png')
#republicans only
heatmap(rvalue.loc['Republican']['Republican'],'rep_rvalue.png')
heatmap(pvalue.loc['Republican']['Republican'],'rep_pvalue.png')
heatmap(stderr.loc['Republican']['Republican'],'rep_stderr.png')

#most anticorrelated republicans
RepRvalue_idxmin=rvalue.loc['Republican']['Republican'].idxmin(axis=0)
Beispiel #43
0
    def diag(self):

        df = self.get_results_dataframe(index_by_code = True)
        df_nivvie = df.xs('nivvie')
        df_revdisp = df.xs('revdisp')
        df_rev = df.xs('rev_trav') + df.xs('pen') + df.xs('rev_cap_net') 
        
        df_af = df.xs('af')

        df_pfam = df.xs('pfam') 
        df_mini = df.xs('mini')
        df_logt = df.xs('logt')
        df_impo = df.xs('ppe') + df.xs('impo')
        df_impo.name = "impo+ppe"
        df_public = df.xs('psoc') + df.xs('ppe') + df.xs('impo')
        
        loyer_chef = self.scenario_chef_seul.menage[0]['loyer']
        
        pension_alim_tot = sum([ var['pension_alim'] for var in self.children.values()])
        
        noi = self.children.keys()[0]
        if self.children[noi]["temps_garde"] == 'alternee_pension_non_decl':
            
            df_revdisp['chef'] = ( df_rev['chef'] + df_mini['chef_seul'] + 
                                   df_af['part']/2 + 
                                   df_logt['chef_seul'] - pension_alim_tot +
                                   df_impo['chef'] )
            df_pfam['chef'] = df_af['part']/2
            df_logt['chef'] = df_logt['chef_seul']
            df_mini['chef']  = df_mini['chef_seul']
            df_public['chef'] = ( df_logt['chef_seul'] + df_mini['chef_seul']+ 
                                  df_pfam['chef'] + df_impo['chef'] )
            df_nivvie['chef'] = df_revdisp['chef']/self.uc['chef']
            
            df_revdisp['part'] = ( df_revdisp['part'] - df_af['part']/2 + 
                                   pension_alim_tot )
            df_pfam['part'] -= df_af['part']/2
            df_public['part'] = ( df_logt['part'] + df_mini['part']+ 
                                  df_pfam['part'] + df_impo['part'] )
            df_nivvie['part'] = df_revdisp['part']/self.uc['part'] 
        
        uc_couple = self.uc['couple']
        total_cost_before = ((uc_couple-1.5)/uc_couple)*(df_revdisp['couple'])
        
        public_cost_before = ( df_public['couple'] - df_public['couple_seul'])
        private_cost_before = total_cost_before - public_cost_before
        
        uc_chef = self.uc['chef']
        uc_part = self.uc['part']
        
        total_cost_after_chef = (uc_chef-1)/(uc_chef)*df_revdisp['chef']
        total_cost_after_part = (uc_part-1)/(uc_part)*df_revdisp['part'] 
        
#        total_cost_after = total_cost_after_chef + total_cost_after_part
        
        public_cost_after_chef = df_public['chef'] - df_public['chef_seul']
        public_cost_after_part = df_public['part'] - df_public['part_seul'] 
        
        #public_cost_after = ( public_cost_after_chef + public_cost_after_part )
        #private_cost_after = total_cost_after - public_cost_after
        # private_cost_after_chef = total_cost_after_chef + pension_alim_tot - public_cost_after_chef
        # private_cost_after_part = total_cost_after_part - pension_alim_tot - public_cost_after_part

        private_cost_after_chef = total_cost_after_chef - public_cost_after_chef
        private_cost_after_part = total_cost_after_part - public_cost_after_part
        
        desunion_public_cost = df_public['part'] + df_public['chef'] - df_public['couple'] 
        
        nivvie_loss_couple = df_nivvie[u"couple"]/df_nivvie["couple_seul"] 
        nivvie_loss_chef = df_nivvie[u"chef"]/df_nivvie["chef_seul"]
        nivvie_loss_part = df_nivvie[u"part"]/df_nivvie["part_seul"]
        
        
        df2 = DataFrame( [df_revdisp, df_pfam, df_mini, df_logt, df_impo, df_nivvie])
        df2 = df2[ ['couple', 'part', 'chef'] ]
        df2 = df2.set_value(u"dépense totale pour enfants", 'couple', total_cost_before)
        df2 = df2.set_value(u"dépense totale pour enfants", 'chef', total_cost_after_chef)
        df2 = df2.set_value(u"dépense totale pour enfants", 'part', total_cost_after_part)
        df2 = df2.set_value(u"prise en charge publique de l'enfant", 'couple', public_cost_before)
        df2 = df2.set_value(u"prise en charge publique de l'enfant", 'chef', public_cost_after_chef)
        df2 = df2.set_value(u"prise en charge publique de l'enfant", 'part', public_cost_after_part)
        df2 = df2.set_value(u"prise en charge privée de l'enfant", 'couple', private_cost_before)
        df2 = df2.set_value(u"prise en charge privée de l'enfant", 'chef', private_cost_after_chef)
        df2 = df2.set_value(u"prise en charge privée de l'enfant", 'part', private_cost_after_part)
        df2 = df2.set_value(u"loyer", 'couple', 12*self.scenario.menage[0]['loyer'])    
        df2 = df2.set_value(u"loyer", 'chef', 12*loyer_chef)
        df2 = df2.set_value(u"loyer", 'part', 12*self.scenario_part.menage[0]['loyer'])
        df2 = df2.set_value(u"pension", 'couple', 0)    
        df2 = df2.set_value(u"pension", 'chef', -pension_alim_tot )
        df2 = df2.set_value(u"pension", 'part', pension_alim_tot)
        
        df2 = df2.set_value(u"nivvie_loss", 'couple', nivvie_loss_couple)    
        df2 = df2.set_value(u"nivvie_loss", 'chef', nivvie_loss_chef)
        df2 = df2.set_value(u"nivvie_loss", 'part', nivvie_loss_part)
        df2 = df2.set_value(u"coût public de la désunion", "couple", desunion_public_cost )
        
        df2 = df2.T
        df2.index.name = u"ménage"
        df2 = df2.reset_index() 
        
        return df2
Beispiel #44
0
class Scores(AnnotationMixin, object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------

        """
        A = cls(uri=uri, modality=modality)
        A._df = pivot_table(
            df, values=PYANNOTE_SCORE,
            rows=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], cols=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )
        return A

    def __init__(self, uri=None, modality=None):
        super(Scores, self).__init__()

        index = MultiIndex(
            levels=[[], []], labels=[[], []],
            names=[PYANNOTE_SEGMENT, PYANNOTE_TRACK]
        )

        self._df = DataFrame(index=index, dtype=np.float64)
        self.modality = modality
        self.uri = uri
        self._timelineHasChanged = True

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self._df = self._df.drop(segment, axis=0)
            self._timelineHasChanged = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self._df = self._df.drop((segment, track), axis=0)
            self._timelineHasChanged = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        segment, track, label = key
        return self._df.get_value((segment, track), label)

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return {l: self._df.get_value((segment, track), l) for l in self._df}

    # scores[segment, track, label] = value
    def __setitem__(self, key, value):
        segment, track, label = key
        self._df = self._df.set_value((segment, track), label, value)
        self._timelineHasChanged = True

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self._df.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def itervalues(self):
        """Iterate over annotation as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._df = self._df.sort_index()

        # yield one (segment, track, label) tuple per loop
        labels = self._df.columns
        for (segment, track), columns in self._df.iterrows():
            for label in labels:
                value = columns[label]
                if np.isnan(value):
                    continue
                else:
                    yield segment, track, label, value

    def _rank(self, invert):

        if invert:
            direction = 1.

        else:
            direction = -1.

        def nan_rank(data):

            # replace NaN by -inf or +inf depending on the requested direction
            finite = np.isfinite(data)
            fixed = np.where(finite, direction*data, -direction*np.inf)

            # do the actual argsort
            indices = np.argsort(fixed)
            # get rank from argsort
            rank = np.argsort(indices)

            # special treatment for inverted NaN scores
            # (we want ranks to start at 0 even in case of NaN)
            if invert:
                rank = np.where(finite, rank-(len(data)-np.sum(finite)), np.nan)
            else:
                rank = np.where(finite, rank, np.nan)
            return rank

        return self._df.apply(nan_rank, axis=1)

    def rank(self, invert=False):
        """

        Parameters
        ----------
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        rank : `Scores`

        """
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._rank(invert)
        return A

    def nbest(self, n, invert=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        df = self._df.copy()
        nbest = self._rank(invert) < n
        df[~nbest] = np.nan

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = df

        return A

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._df[list(labels)]

        return A

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.
        """

        annotation = Annotation(uri=self.uri, modality=self.modality)
        if not self:
            return annotation

        best = self.nbest(1, invert=False)

        if posterior:

            # compute unknown posterior
            func = lambda p: 1. - np.nansum(p, axis=1)
            Pu = self.apply(func, new_columns=['_'])

            # threshold best target posterior
            # with unknown posterior and threshold
            for segment, track, label, value in best.itervalues():

                if value < Pu[segment, track, '_'] or value < threshold:
                    label = Unknown()

                annotation[segment, track] = label

        else:

            # threshold best target score with threshold
            for segment, track, label, value in best.itervalues():
                if value < threshold:
                    label = Unknown()
                annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = func(self._df)
        return A

    def apply(self, data_func, new_index=None, new_columns=None):
        """Apply `data_func` on internal numpy array

        Parameters
        ----------
        data_func : func
            Function expecting (index x columns) numpy array as input
        new_index : iterable, optional
            When provided, these will be the index of returned array.
        new_columns : iterable, optional
            When provided, these will be the columns of returned array.
        """
        new_data = data_func(self._df.values)

        if new_index is None:
            new_index = self._df.index

        if new_columns is None:
            new_columns = self._df.columns

        df = DataFrame(
            data=new_data,
            index=new_index,
            columns=new_columns)

        new_scores = self.__class__(uri=self.uri, modality=self.modality)
        new_scores._df = df

        return new_scores

    def _repr_png_(self):
        from pyannote.core.notebook import repr_scores
        return repr_scores(self)
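
# Usage sketch (illustrative only, not from the original file): exercise the
# rank / nbest / to_annotation helpers documented above.  Segment is assumed to
# be importable from pyannote.core, and the class's other dependencies
# (Annotation, Unknown, pivot_table, MultiIndex) are assumed to be in scope as
# in the original module; the score values are made up.
from pyannote.core import Segment

s = Scores(uri='video', modality='speaker')
s[Segment(0, 1), 's1', 'A'] = 0.1
s[Segment(0, 1), 's1', 'B'] = 0.2
s[Segment(0, 1), 's1', 'C'] = 0.3

ranks = s.rank()                  # rank 0 = best-scoring label per (segment, track)
two_best = s.nbest(2)             # keep the two best labels, others become NaN
hypothesis = s.to_annotation(threshold=0.15)  # best label, or Unknown() below 0.15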
Beispiel #45
0
decil, values = mark_weighted_percentiles(nivvie, labels, wprm, method, return_quantiles = True)


df2 = DataFrame({"decile" : decil})
df["decile"] = df2["decile"]



indexes = { "zrstm" : .01, "zchom": .01, "pfamm" : .01} # TODO change 1%
results = DataFrame(index =indexes.keys(), columns = ["total", "pauvre50", "pauvre60"] + ["decile>"+str(decile) for decile in range(0,10)] )

for var, index in indexes.iteritems():
    total = df[var]*index*df["wprm"]*df["champm"]
    pauvre50 = df[var]*index*df["wprm"]*(df["pauvre50m"]<=0)*df["champm"]
    pauvre60 = df[var]*index*df["wprm"]*(df["pauvre60m"]<=0)*df["champm"]
    results.set_value(var, "total", total.sum()/1e6)
    results.set_value(var, "pauvre50", pauvre50.sum()/1e6)
    results.set_value(var, "pauvre60", pauvre60.sum()/1e6)
    for decile in range(0,10):
        temp = df[var]*index*df["wprm"]*(df["decile"]>decile)*df["champm"]
        results.set_value(var, "decile>"+str(decile), temp.sum()/1e6)
        del temp

print results
import os
filename = os.path.join(destination_dir,"desindexation.xls")
print filename
writer = ExcelWriter(str(filename))
results.to_excel(writer)
writer.save()
Beispiel #46
0
pr_facts = pr_facts.dropna()
c = pr[['candidate',
        'party']].drop_duplicates().sort_values(by=['candidate', 'party'])
t = c[['candidate', 'party']].apply(tuple, axis=1).tolist()
d = dict(t)

#scipy linregress
l = len(pr_facts.columns)
linregress_unpiv = DataFrame('',
                             index=range(l),
                             columns=[
                                 'party', 'candidate', 'fact', 'Rvalue',
                                 'Pvalue', 'StdError', 'Slope', 'Intercept'
                             ])
i = 0
for c_X in pr_piv.columns:
    for c_Y in cf_dict.index:
        R = linregress(pr_facts[[c_X, c_Y]])
        #
        linregress_unpiv.set_value(i, 'party', d[c_X])
        linregress_unpiv.set_value(i, 'candidate', c_X)
        linregress_unpiv.set_value(i, 'fact', c_Y)
        linregress_unpiv.set_value(i, 'Rvalue', R.rvalue)
        linregress_unpiv.set_value(i, 'Pvalue', R.pvalue)
        linregress_unpiv.set_value(i, 'StdError', R.stderr)
        linregress_unpiv.set_value(i, 'Slope', R.slope)
        linregress_unpiv.set_value(i, 'Intercept', R.intercept)
        i += 1
linregress_unpiv.to_csv(
    'DataForTableau/primary_results_county_facts_linregress.csv')
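
# Sanity-check sketch for the linregress call pattern used above (synthetic
# data, not from the original analysis).  scipy.stats.linregress accepts a
# single two-column array, taking the first column as x and the second as y,
# and exposes slope, intercept, rvalue, pvalue and stderr on the result.
import numpy as np
from pandas import DataFrame
from scipy.stats import linregress

demo = DataFrame({'x': np.arange(10.0), 'y': 2.0 * np.arange(10.0) + 1.0})
R = linregress(demo[['x', 'y']])
slope, intercept, rvalue, pvalue, stderr = R   # also available as attributes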
Beispiel #47
0
pvalue.columns.names=['Party','Candidate']
pvalue.columns.lexsort_depth
pvalue.index.names=['Fact']
#StdErr
stderr = DataFrame(np.nan,index=cf_dict.index,columns=index)
stderr.columns.names=['Party','Candidate']
stderr.columns.lexsort_depth
stderr.index.names=['Fact']


#
for c_X in pr_piv.columns:
  for c_Y in cf_dict.index:
    R=linregress(pr_facts[[c_X,c_Y]])
    p_X=index.get_loc_level(c_X,1)[1][0]
    rvalue.set_value(c_Y,(p_X,c_X), R.rvalue)
    pvalue.set_value(c_Y,(p_X,c_X), R.pvalue)
    stderr.set_value(c_Y,(p_X,c_X), R.stderr)

#It's a huge image and it's hard to review
heatmap(rvalue,'rvalue_facts.png')
heatmap(pvalue,'pvalue_facts.png')
heatmap(stderr,'stderr_facts.png')

#Let's find the county facts most correlated with the choice of Democrat candidates
#democrats only

DemRvalue=rvalue['Democrat']
DemPvalue=pvalue['Democrat']
DemStdErr=stderr['Democrat']
Beispiel #48
0
par_own_merge.to_csv('nt_final_home_owner.csv', header=True)
llc_props.to_csv('llcs.csv', header=True)
amounts_total_grouped.agg(['count','sum']).to_csv('Ownership_Balance_Totals.csv', header=True)


################################################ output ########################################

#evaluate threshold for most accurate name match
thresholds = range(1,101)
o =  []
a = []

est = DataFrame(columns=('threshold','own_count','addr_count'))
for t in thresholds:
    acc = accuracy(t)
    est.set_value(t,'threshold',t) 
    est.set_value(t,'own_count',acc[0])
    est.set_value(t, 'addr_count',acc[1])

x  = sm.add_constant(est[['own_count', 'addr_count']])
reg = sm.OLS(est['threshold'],x).fit()
# %matplotlib inline   (IPython notebook magic, kept here as a comment)
import pylab

pylab.scatter(est.own_count, est.threshold, est.addr_count)
pylab.plot(thresholds, r)
pylab.plot(r,p)
pylab.legend(['precision','recall'],loc=2)
################################################ end output ###################################

if __name__ == '__main__':
Beispiel #49
0
class Aggregates(object):
    filter_by = None
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
        ))  # TODO: localize
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    totals_df = None
    varlist = None

    def __init__(self, survey_scenario = None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by = None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}

        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except:
                pass

    def compute_diff(self):
        '''
        Compute and add absolute and relative differences to the aggregates frame
        '''

        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]

        if self.show_default:
            ref_dep_label, ref_benef_label = self.labels['dep_default'], self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label, ref_benef_label = self.labels['dep_real'], self.labels['benef_real']
        else:
            return

        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]

        self.aggr_frame[self.labels['dep_diff_rel']] = (dep - ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' % (now.strftime('%d-%m-%Y'), now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" % str(self.simulation.input_table.survey_year),
            ])

    def get_aggregate(self, variable, filter_by = None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
                   name of the variable aggregated according to its entity
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
        # amounts and beneficiaries from current data and default data if exists
        # Build weights for each entity
        data = DataFrame(
            {
                variable: simulation.calculate_add(variable),
                weight_name: simulation.calculate(weight_name),
                }
            )
        data_default = None

        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default
        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame(
                {
                    variable: simulation.calculate(variable),
                    weight_name: simulation.calculate(weight_name),
                    filter_by_name: simulation.calculate(filter_by_name),
                    }
                )
            data_default = None
            filter_indicator = filtered_data[filter_by_name]
        m_b = {}

        weight = data[weight_name] * filter_indicator
        for name, data in datasets.iteritems():
            amount = data[variable]
            benef = data[variable].values != 0
            try:
                total_amount = int(round(sum(amount * weight) / 10 ** 6))
            except:
                total_amount = nan
            try:
                total_benef = int(round(sum(benef * weight) / 10 ** 3))
            except:
                total_benef = nan

            m_b[name] = [total_amount, total_benef]

        return m_b

    def load_amounts_from_file(self, filename = None, year = None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            # default to the packaged administrative totals file
            filename = os.path.join(DATA_DIR, "amounts.h5")

        try:
            store = HDFStore(filename)

            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            self.totals_df = DataFrame(data = {
                "amount": df_a[year] / 10 ** 6,
                "benef": df_b[year] / 1000,
                })
            row = DataFrame({'amount': nan, 'benef': nan}, index = ['logt'])
            self.totals_df = self.totals_df.append(row)

            # Add some additional totals
            for col in ['amount', 'benef']:
                # Deals with logt
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)

                # Deals with rsa rmi
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)

                # Deals with irpp, csg, crds
                for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                    if col in ['amount']:
                        val = - self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except:
            #  raise Exception(" No administrative data available for year " + str(year))
            import warnings
            warnings.warn("No administrative data available for year %s in file %s" % (str(year), filename))
            self.totals_df = None
            return

    def save_table(self, directory = None, filename = None, table_format = None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'

        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)

        fname = os.path.join(directory, filename)

        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index= False, header= True)
                descr = self.create_description()
                descr.to_excel(writer, "description", index = False, header=False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, index = False, header = True)
        except Exception, e:
            raise Exception("Aggregates: Error saving file", str(e))
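
# Hedged usage sketch (not part of the original module).  It assumes the
# snippet's missing module-level imports (collections, os, datetime, nan,
# DataFrame, HDFStore, ExcelWriter, DATA_DIR) are in place; the injected table
# and its column names are placeholders for what compute() would normally build
# from a survey scenario.
from pandas import DataFrame

agg = Aggregates()
agg.aggr_frame = DataFrame({"Mesure": ["af"], "Depenses (millions d'euros)": [123.4]})
agg.save_table(directory = ".", filename = "aggregates_demo.csv")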