Example #1
def Impute(data_as_DataFrame, kNNGraph, Method = IgnoringNan.mean, target = None ):
    """Impute(data_as_DataFrame,Graph) -> pandas DataFrame with nan's imputed
    
    Imputation is via Graph Neighborhoods of kNNGraph
    Method is applied to the neighborhood array of values for each
    vertex that contains a NaN
    
    Note: data_as_DataFrame can also be a numpy array 
    """
    
    try:
        # already a DataFrame: it has columns and an index
        data_as_DataFrame.columns
        data_as_DataFrame.index
        DFrame = data_as_DataFrame.copy()
    except AttributeError:
        # plain numpy array: wrap it in a DataFrame
        DFrame = DataFrame( data_as_DataFrame )
        
    cols = DFrame.columns
    inds = DFrame.index
    Data = DFrame.as_matrix()
    
    m,n = DFrame.shape
    for i in range(m):
        nbrs = kNNGraph.neighbors(i)
        for j in range(n):
            if( isnan( Data[i,j] ) ):
                nbr_values = array( [Data[nbr,j] for nbr in nbrs] )
                DFrame.set_value( inds[i], cols[j], int( Method( nbr_values ) ) )
    return DFrame
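A minimal usage sketch, under stated assumptions: the graph argument only needs a neighbors(i) method returning row positions, numpy's nanmean stands in for IgnoringNan.mean, isnan/array/DataFrame are imported in the module that defines Impute, and pandas is old enough to still provide as_matrix/set_value (both removed in pandas 1.0, where to_numpy() and .at are the replacements).

import numpy as np
from pandas import DataFrame

class ListGraph:
    # toy stand-in for kNNGraph: neighbors(i) returns the row positions of i's neighbours
    def __init__(self, nbrs):
        self.nbrs = nbrs
    def neighbors(self, i):
        return self.nbrs[i]

data = DataFrame([[1.0, 2.0], [np.nan, 4.0], [3.0, 6.0]], columns=['x', 'y'])
graph = ListGraph({0: [1, 2], 1: [0, 2], 2: [0, 1]})
imputed = Impute(data, graph, Method=np.nanmean)   # row 1, column 'x' becomes int(mean(1.0, 3.0)) == 2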
Example #2
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''
    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is a full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(), index.column()]))

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role=Qt.EditRole):
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        if hasattr(value, 'toPyObject'):
            # PyQt4 gets a QVariant
            value = value.toPyObject()
        else:
            # PySide gets a unicode string
            dtype = self.df[col].dtype
            if dtype != object:
                value = None if value == '' else dtype.type(value)
        self.df.set_value(row, col, value)
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
Example #3
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''
    def __init__(self):
        super(DataFrameModel,self).__init__()
        self.df = DataFrame()

    def setDataFrame(self,dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is a full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------


    def headerData(self,section,orientation,role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]                
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                return str(self.df.index.tolist()[section])
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        return QVariant(str(self.df.ix[index.row(),index.column()]))

    def flags(self, index):
            flags = super(DataFrameModel, self).flags(index)
            flags |= Qt.ItemIsEditable
            return flags

    def setData(self, index, value, role):
        self.df.set_value(self.df.index[index.row()],
                          self.df.columns[index.column()],
                          value.toPyObject())
        return  True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
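A hedged wiring sketch for this model: PyQt4-era imports to match the QVariant/.ix calls above, with an illustrative two-column frame; none of this comes from the original example.

import sys
from pandas import DataFrame
from PyQt4.QtGui import QApplication, QTableView

app = QApplication(sys.argv)
model = DataFrameModel()
model.setDataFrame(DataFrame({'a': [1, 2], 'b': [3, 4]}))
model.signalUpdate()            # full (inefficient) refresh after swapping in a new frame
view = QTableView()
view.setModel(model)
view.show()
sys.exit(app.exec_())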
Example #4
def gonzales(data , k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    # choose the first point as the initial center
    # (random sampling is left commented out below)
    #center0 = points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times; the extra center appended in the last cycle is dropped afterwards, leaving k centers
    for k_cycle in range(1,k+1):
        # variables to track the next center: the point with the maximum distance to its closest center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables used to find the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
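An illustrative call, inferred from the DataFrame construction at the top of gonzales (column 0 is a point id, the remaining columns are coordinates); it assumes numpy, math, scipy.spatial and DataFrame are imported where gonzales is defined, plus an older pandas with .ix/.as_matrix.

import numpy as np

ids = np.arange(20).reshape(-1, 1)                   # column 0: point ids, used as the index
coords = np.random.rand(20, 2) * 10                  # columns 1-2: x, y coordinates
centers = gonzales(np.hstack([ids, coords]), k=3)    # k x 2 array of chosen center coordinates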
Example #5
def median_over_months(raw):
    n_years = len(raw.index)
    patterns = unique([date.split(' ')[1] for date in raw.columns[1:]])
    n_patterns = len(patterns)
    medians = DataFrame(np.zeros((n_years,1+n_patterns)), columns=['year']+patterns)
    medians['year'] = raw['year']
    for i_year in range(0, n_years):
        for i_pattern in range(0, n_patterns):
            columns_for_this_day = [col for col in raw.columns[1:] if col.split(' ')[1] == patterns[i_pattern]]
            medians.set_value(i_year, patterns[i_pattern], median(raw.iloc[i_year][columns_for_this_day]))
    return medians
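The expected input layout is not shown; judging from the split on ' ', it is a 'year' column followed by columns named '<month> <pattern>'. A toy frame built on that assumption (unique, median, np and DataFrame must already be imported where the function lives, and set_value requires an older pandas):

raw = DataFrame(
    [[2000, 1.0, 3.0, 5.0, 7.0],
     [2001, 2.0, 4.0, 6.0, 8.0]],
    columns=['year', 'Jan rain', 'Feb rain', 'Jan snow', 'Feb snow'])
medians = median_over_months(raw)   # one column per pattern ('rain', 'snow'), median taken over the months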
Example #6
def sum_over_patterns(raw, new_name=None):
     n_years = len(raw.index)
     sums = DataFrame(np.zeros((n_years,1+12)), columns=['year']+months)
     sums['year'] = raw['year']
     for i_year in range(0, n_years):
         for i_month in range(0, 12):
             columns_for_this_month = [col for col in raw.columns[1:] if col.split(' ')[0] == months[i_month]]
             sums.set_value(i_year, months[i_month], sum(raw.iloc[i_year][columns_for_this_month]))
     
     if new_name is not None:
         sums.columns = ['year'] +  [col+' '+new_name for col in sums.columns if col != 'year']
         
     return sums
Example #7
    def compute_tf_idf_queries(self):
        # Find total number of document
        results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, total_word, path FROM documents')
        tmp = results.fetchall()
        documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
        documents_df['tf_idf'] = 0.0

        no_docterm = {}

        for query in self.queries:
            no_docterm[query] = 0

        for index, row in documents_df.iterrows():
            path = row['path']
            with codecs.open(path, 'rt') as f:
                text = f.read()
                for query in self.queries:
                    if query in text.decode('utf-8').lower():
                        no_docterm[query] += 1

        for query in self.queries:
            for index, row in documents_df.iterrows():
                total_word = row['total_word']
                path = row['path']

                with codecs.open(path, 'rt') as f:
                    text = f.read()

                tf_idf = self._compute_tf_idf_queries(text, total_word, total_doc, no_docterm[query])
                cur_tf_idf = documents_df.get_value(index, 'tf_idf')
                documents_df.set_value(index, 'tf_idf', cur_tf_idf + tf_idf)

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df['tf_idf'] = 0.0

        for index, row in df.iterrows():
            did = row['did']
            tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
            df.set_value(index, 'tf_idf', tf_idf)

        del df['did']
        df = df.groupby(['e_type', 'entity']).sum().reset_index()
        return df
Example #8
def fix_event_type(df: DataFrame):
    '''
    Not sure yet.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''

    a = time.time()

    colsf = df['id'].ravel()            # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []                       # list of unique counts (UNUSED)
    counts_bucket = []                  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)             # create dummy variables
    todrop = df.sum() < 50              # get columns where sum of dummy column < 50
    dropcols = df.columns[todrop]       # get those column names
    df = df.drop(dropcols, axis=1)      # drop those columns
    df['num_events'] = 0                # create number of events columns, set to 0
    # print(df.columns)
    print(str(len(unique)))

    for ii in range(0,len(unique)):     # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]     # subset by that ID
        the_dummies = subset.columns != 'id'        # boolean mask of the columns other than 'id'
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist()  # get all of those columns to list
        event_sum = np.sum(aa)      # sum all of those
        
        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:,subset.columns != 'id'] = aa
        df = df.set_value(subset.index, the_dummies, aa)
        df = df.set_value(subset.index, 'num_events', event_sum)
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b-a)
    return df
Example #9
def fsev_count(df: DataFrame, fsev: int,
               feature: str, train: bool,
               blist: list, bidx: int):

    colname = 'fsev_' + str(fsev) + '_' + str(feature)
    if train:
        a = df[df['fault_severity'] == fsev]
        b = a[feature].value_counts()[0:60]
        blist = b.tolist()
        bidx = b.index
        bdf = pd.DataFrame(b)
    df[colname] = 0
    # subset = df.loc[df.location.isin(a.index)]
    for i in range(0,len(blist)):
        percentile = blist[i]/np.sum(blist)
        locstr = str(bidx[i])
        subset = df.location == locstr
        df = df.set_value(df.location == locstr, colname, percentile)
    rval = df
    if train:
        rval = [df, blist,bidx]
    return rval
cf_dict=pd.read_csv('SourceData/county_facts_dictionary.csv')
cf_dict=cf_dict.set_index('column_name')
#pivot and drop null values for clean and easy analysis
pr_piv= pr[['fips', 'candidate','fraction_votes']].pivot(index='fips', columns='candidate', values='fraction_votes')
pr_piv.drop(' No Preference', axis=1, inplace=True)
pr_piv.drop(' Uncommitted', axis=1, inplace=True)
pr_facts=pd.merge(pr_piv, facts, right_index=True, left_index=True)
pr_facts=pr_facts.dropna()
c=pr[['candidate','party']].drop_duplicates().sort_values(by=['candidate','party'])
t=c[['candidate','party']].apply(tuple, axis=1).tolist()
d=dict(t)

#scipy linregress
l=len(pr_facts.columns)
linregress_unpiv = DataFrame('',index=range(l),columns=['party','candidate','fact','Rvalue','Pvalue','StdError','Slope','Intercept'])
i=0
for c_X in pr_piv.columns:
  for c_Y in cf_dict.index:
    R=linregress(pr_facts[[c_X,c_Y]])
    #
    linregress_unpiv.set_value(i,'party',d[c_X])
    linregress_unpiv.set_value(i,'candidate',c_X)
    linregress_unpiv.set_value(i,'fact',c_Y)
    linregress_unpiv.set_value(i,'Rvalue',R.rvalue)
    linregress_unpiv.set_value(i,'Pvalue',R.pvalue)
    linregress_unpiv.set_value(i,'StdError',R.stderr)
    linregress_unpiv.set_value(i,'Slope',R.slope)
    linregress_unpiv.set_value(i,'Intercept',R.intercept)
    i+=1
linregress_unpiv.to_csv('DataForTableau/primary_results_county_facts_linregress.csv')
pvalue=DataFrame(np.nan,index=index,columns=index)
pvalue.index.names=['Party','Candidate']
pvalue.index.lexsort_depth
pvalue.columns.lexsort_depth
#StdErr
stderr=DataFrame(np.nan,index=index,columns=index)
stderr.index.names=['Party','Candidate']
stderr.index.lexsort_depth
stderr.columns.lexsort_depth
#
for c_X in pr_piv.columns:
  for c_Y in pr_piv.columns:
    R=linregress(pr_piv[[c_X,c_Y]])
    p_X=index.get_loc_level(c_X,1)[1][0]
    p_Y=index.get_loc_level(c_Y,1)[1][0]
    rvalue.set_value((p_Y,c_Y), (p_X,c_X), R.rvalue)
    pvalue.set_value((p_Y,c_Y), (p_X,c_X),R.pvalue)
    stderr.set_value((p_Y,c_Y), (p_X,c_X), R.stderr)


#democrats only
heatmap(rvalue.loc['Democrat']['Democrat'],'dem_rvalue.png')
heatmap(pvalue.loc['Democrat']['Democrat'],'dem_pvalue.png')
heatmap(stderr.loc['Democrat']['Democrat'],'dem_stderr.png')
#republicans only
heatmap(rvalue.loc['Republican']['Republican'],'rep_rvalue.png')
heatmap(pvalue.loc['Republican']['Republican'],'rep_pvalue.png')
heatmap(stderr.loc['Republican']['Republican'],'rep_stderr.png')

#most anticorrelated republicans
RepRvalue_idxmin=rvalue.loc['Republican']['Republican'].idxmin(axis=0)
Example #12
class Scores(AnnotationMixin, object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------

        """
        A = cls(uri=uri, modality=modality)
        A._df = pivot_table(
            df, values=PYANNOTE_SCORE,
            rows=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], cols=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )
        return A

    def __init__(self, uri=None, modality=None):
        super(Scores, self).__init__()

        index = MultiIndex(
            levels=[[], []], labels=[[], []],
            names=[PYANNOTE_SEGMENT, PYANNOTE_TRACK]
        )

        self._df = DataFrame(index=index, dtype=np.float64)
        self.modality = modality
        self.uri = uri
        self._timelineHasChanged = True

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self._df = self._df.drop(segment, axis=0)
            self._timelineHasChanged = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self._df = self._df.drop((segment, track), axis=0)
            self._timelineHasChanged = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        segment, track, label = key
        return self._df.get_value((segment, track), label)

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return {l: self._df.get_value((segment, track), l) for l in self._df}

    # scores[segment, track, label] = value
    def __setitem__(self, key, value):
        segment, track, label = key
        self._df = self._df.set_value((segment, track), label, value)
        self._timelineHasChanged = True

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self._df.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def itervalues(self):
        """Iterate over annotation as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._df = self._df.sort_index()

        # yield one (segment, track, label) tuple per loop
        labels = self._df.columns
        for (segment, track), columns in self._df.iterrows():
            for label in labels:
                value = columns[label]
                if np.isnan(value):
                    continue
                else:
                    yield segment, track, label, value

    def _rank(self, invert):

        if invert:
            direction = 1.

        else:
            direction = -1.

        def nan_rank(data):

            # replace NaN by -inf or +inf depending on the requested direction
            finite = np.isfinite(data)
            fixed = np.where(finite, direction*data, -direction*np.inf)

            # do the actual argsort
            indices = np.argsort(fixed)
            # get rank from argsort
            rank = np.argsort(indices)

            # special treatment for inverted NaN scores
            # (we want ranks to start at 0 even in case of NaN)
            if invert:
                rank = np.where(finite, rank-(len(data)-np.sum(finite)), np.nan)
            else:
                rank = np.where(finite, rank, np.nan)
            return rank

        return self._df.apply(nan_rank, axis=1)

    def rank(self, invert=False):
        """

        Parameters
        ----------
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        rank : `Scores`

        """
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._rank(invert)
        return A

    def nbest(self, n, invert=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        df = self._df.copy()
        nbest = self._rank(invert) < n
        df[~nbest] = np.nan

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = df

        return A

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._df[list(labels)]

        return A

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.
        """

        annotation = Annotation(uri=self.uri, modality=self.modality)
        if not self:
            return annotation

        best = self.nbest(1, invert=False)

        if posterior:

            # compute unknown posterior
            func = lambda p: 1. - np.nansum(p, axis=1)
            Pu = self.apply(func, new_columns=['_'])

            # threshold best target posterior
            # with unknown posterior and threshold
            for segment, track, label, value in best.itervalues():

                if value < Pu[segment, track, '_'] or value < threshold:
                    label = Unknown()

                annotation[segment, track] = label

        else:

            # threshold best target score with threshold
            for segment, track, label, value in best.itervalues():
                if value < threshold:
                    label = Unknown()
                annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = func(self._df)
        return A

    def apply(self, data_func, new_index=None, new_columns=None):
        """Apply `data_func` on internal numpy array

        Parameters
        ----------
        data_func : func
            Function expecting (index x columns) numpy array as input
        new_index : iterable, optional
            When provided, these will be the index of returned array.
        new_columns : iterable, optional
            When provided, these will be the columns of returned array.
        """
        new_data = data_func(self._df.values)

        if new_index is None:
            new_index = self._df.index

        if new_columns is None:
            new_columns = self._df.columns

        df = DataFrame(
            data=new_data,
            index=new_index,
            columns=new_columns)

        new_scores = self.__class__(uri=self.uri, modality=self.modality)
        new_scores._df = df

        return new_scores

    def _repr_png_(self):
        from pyannote.core.notebook import repr_scores
        return repr_scores(self)
Example #13
    def diag(self):

        df = self.get_results_dataframe(index_by_code = True)
        df_nivvie = df.xs('nivvie')
        df_revdisp = df.xs('revdisp')
        df_rev = df.xs('rev_trav') + df.xs('pen') + df.xs('rev_cap_net') 
        
        df_af = df.xs('af')

        df_pfam = df.xs('pfam') 
        df_mini = df.xs('mini')
        df_logt = df.xs('logt')
        df_impo = df.xs('ppe') + df.xs('impo')
        df_impo.name = "impo+ppe"
        df_public = df.xs('psoc') + df.xs('ppe') + df.xs('impo')
        
        loyer_chef = self.scenario_chef_seul.menage[0]['loyer']
        
        pension_alim_tot = sum([ var['pension_alim'] for var in self.children.values()])
        
        noi = self.children.keys()[0]
        if self.children[noi]["temps_garde"] == 'alternee_pension_non_decl':
            
            df_revdisp['chef'] = ( df_rev['chef'] + df_mini['chef_seul'] + 
                                   df_af['part']/2 + 
                                   df_logt['chef_seul'] - pension_alim_tot +
                                   df_impo['chef'] )
            df_pfam['chef'] = df_af['part']/2
            df_logt['chef'] = df_logt['chef_seul']
            df_mini['chef']  = df_mini['chef_seul']
            df_public['chef'] = ( df_logt['chef_seul'] + df_mini['chef_seul']+ 
                                  df_pfam['chef'] + df_impo['chef'] )
            df_nivvie['chef'] = df_revdisp['chef']/self.uc['chef']
            
            df_revdisp['part'] = ( df_revdisp['part'] - df_af['part']/2 + 
                                   pension_alim_tot )
            df_pfam['part'] -= df_af['part']/2
            df_public['part'] = ( df_logt['part'] + df_mini['part']+ 
                                  df_pfam['part'] + df_impo['part'] )
            df_nivvie['part'] = df_revdisp['part']/self.uc['part'] 
        
        uc_couple = self.uc['couple']
        total_cost_before = ((uc_couple-1.5)/uc_couple)*(df_revdisp['couple'])
        
        public_cost_before = ( df_public['couple'] - df_public['couple_seul'])
        private_cost_before = total_cost_before - public_cost_before
        
        uc_chef = self.uc['chef']
        uc_part = self.uc['part']
        
        total_cost_after_chef = (uc_chef-1)/(uc_chef)*df_revdisp['chef']
        total_cost_after_part = (uc_part-1)/(uc_part)*df_revdisp['part'] 
        
#        total_cost_after = total_cost_after_chef + total_cost_after_part
        
        public_cost_after_chef = df_public['chef'] - df_public['chef_seul']
        public_cost_after_part = df_public['part'] - df_public['part_seul'] 
        
        #public_cost_after = ( public_cost_after_chef + public_cost_after_part )
        #private_cost_after = total_cost_after - public_cost_after
        # private_cost_after_chef = total_cost_after_chef + pension_alim_tot - public_cost_after_chef
        # private_cost_after_part = total_cost_after_part - pension_alim_tot - public_cost_after_part

        private_cost_after_chef = total_cost_after_chef - public_cost_after_chef
        private_cost_after_part = total_cost_after_part - public_cost_after_part
        
        desunion_public_cost = df_public['part'] + df_public['chef'] - df_public['couple'] 
        
        nivvie_loss_couple = df_nivvie[u"couple"]/df_nivvie["couple_seul"] 
        nivvie_loss_chef = df_nivvie[u"chef"]/df_nivvie["chef_seul"]
        nivvie_loss_part = df_nivvie[u"part"]/df_nivvie["part_seul"]
        
        
        df2 = DataFrame( [df_revdisp, df_pfam, df_mini, df_logt, df_impo, df_nivvie])
        df2 = df2[ ['couple', 'part', 'chef'] ]
        df2 = df2.set_value(u"dépense totale pour enfants", 'couple', total_cost_before)
        df2 = df2.set_value(u"dépense totale pour enfants", 'chef', total_cost_after_chef)
        df2 = df2.set_value(u"dépense totale pour enfants", 'part', total_cost_after_part)
        df2 = df2.set_value(u"prise en charge publique de l'enfant", 'couple', public_cost_before)
        df2 = df2.set_value(u"prise en charge publique de l'enfant", 'chef', public_cost_after_chef)
        df2 = df2.set_value(u"prise en charge publique de l'enfant", 'part', public_cost_after_part)
        df2 = df2.set_value(u"prise en charge privée de l'enfant", 'couple', private_cost_before)
        df2 = df2.set_value(u"prise en charge privée de l'enfant", 'chef', private_cost_after_chef)
        df2 = df2.set_value(u"prise en charge privée de l'enfant", 'part', private_cost_after_part)
        df2 = df2.set_value(u"loyer", 'couple', 12*self.scenario.menage[0]['loyer'])    
        df2 = df2.set_value(u"loyer", 'chef', 12*loyer_chef)
        df2 = df2.set_value(u"loyer", 'part', 12*self.scenario_part.menage[0]['loyer'])
        df2 = df2.set_value(u"pension", 'couple', 0)    
        df2 = df2.set_value(u"pension", 'chef', -pension_alim_tot )
        df2 = df2.set_value(u"pension", 'part', pension_alim_tot)
        
        df2 = df2.set_value(u"nivvie_loss", 'couple', nivvie_loss_couple)    
        df2 = df2.set_value(u"nivvie_loss", 'chef', nivvie_loss_chef)
        df2 = df2.set_value(u"nivvie_loss", 'part', nivvie_loss_part)
        df2 = df2.set_value(u"coût public de la désunion", "couple", desunion_public_cost )
        
        df2 = df2.T
        df2.index.name = u"ménage"
        df2 = df2.reset_index() 
        
        return df2
pvalue.columns.names=['Party','Candidate']
pvalue.columns.lexsort_depth
pvalue.index.names=['Fact']
#StdErr
stderr = DataFrame(np.nan,index=cf_dict.index,columns=index)
stderr.columns.names=['Party','Candidate']
stderr.columns.lexsort_depth
stderr.index.names=['Fact']


#
for c_X in pr_piv.columns:
  for c_Y in cf_dict.index:
    R=linregress(pr_facts[[c_X,c_Y]])
    p_X=index.get_loc_level(c_X,1)[1][0]
    rvalue.set_value(c_Y,(p_X,c_X), R.rvalue)
    pvalue.set_value(c_Y,(p_X,c_X), R.pvalue)
    stderr.set_value(c_Y,(p_X,c_X), R.stderr)

#It's a huge image and it's hard to review
heatmap(rvalue,'rvalue_facts.png')
heatmap(pvalue,'pvalue_facts.png')
heatmap(stderr,'stderr_facts.png')

#Let's find the facts most correlated with the Democrat candidates' vote shares
#democrats only

DemRvalue=rvalue['Democrat']
DemPvalue=pvalue['Democrat']
DemStdErr=stderr['Democrat']
Example #15
decil, values = mark_weighted_percentiles(nivvie, labels, wprm, method, return_quantiles = True)


df2 = DataFrame({"decile" : decil})
df["decile"] = df2["decile"]



indexes = { "zrstm" : .01, "zchom": .01, "pfamm" : .01} # TODO change 1%
results = DataFrame(index =indexes.keys(), columns = ["total", "pauvre50", "pauvre60"] + ["decile>"+str(decile) for decile in range(0,10)] )

for var, index in indexes.iteritems():
    total = df[var]*index*df["wprm"]*df["champm"]
    pauvre50 = df[var]*index*df["wprm"]*(df["pauvre50m"]<=0)*df["champm"]
    pauvre60 = df[var]*index*df["wprm"]*(df["pauvre60m"]<=0)*df["champm"]
    results.set_value(var, "total", total.sum()/1e6)
    results.set_value(var, "pauvre50", pauvre50.sum()/1e6)
    results.set_value(var, "pauvre60", pauvre60.sum()/1e6)
    for decile in range(0,10):
        temp = df[var]*index*df["wprm"]*(df["decile"]>decile)*df["champm"]
        results.set_value(var, "decile>"+str(decile), temp.sum()/1e6)
        del temp

print results
import os
filename = os.path.join(destination_dir,"desindexation.xls")
print filename
writer = ExcelWriter(str(filename))
results.to_excel(writer)
writer.save()
Example #16
    def fit(self, annotations):
        """

        Parameters
        ----------
        annotations : (Annotation, Annotation) iterator

        Returns
        -------


        """

        # possible_match[n, m] is the total possible match duration
        # when there are n A-tracks & m B-tracks
        possible_match = DataFrame()

        # actual_match[n, m] is the total actual match duration
        # when there are n A-tracks & m B-tracks
        actual_match = DataFrame()

        # overlap[n, m] is the total duration
        # when there are n A-tracks & m B-tracks
        overlap = DataFrame()

        for n, (A, B) in enumerate(annotations):

            assert isinstance(A, Annotation), "%r is not an Annotation" % A
            assert isinstance(B, Annotation), "%r is not an Annotation" % B
            if n == 0:
                self.modalityA = A.modality
                self.modalityB = B.modality
            else:
                assert A.modality == self.modalityA, \
                    "bad modality (%r, %r)" % (self.modalityA, A.modality)
                assert B.modality == self.modalityB, \
                    "bad modality (%r, %r)" % (self.modalityB, B.modality)
            assert A.uri == B.uri, \
                "resource mismatch (%r, %r)" % (A.uri, B.uri)

            timeline, a, b = self._AB2ab(A, B)

            for segment in timeline:

                duration = segment.duration

                # number of tracks
                atracks = a.tracks(segment)
                Na = len(atracks)
                btracks = b.tracks(segment)
                Nb = len(btracks)

                if Na == 0 or Nb == 0:
                    continue

                # number of matching tracks
                N = len(a.get_labels(segment) & b.get_labels(segment))

                # increment possible_match & actual_match
                try:
                    p_m = possible_match.get_value(Na, Nb)
                    a_m = actual_match.get_value(Na, Nb)
                    ovl = overlap.get_value(Na, Nb)
                except Exception as e:
                    p_m = 0.
                    a_m = 0.
                    ovl = 0.

                possible_match = possible_match.set_value(Na, Nb,
                                                          p_m + min(Na, Nb)*duration)
                actual_match = actual_match.set_value(Na, Nb,
                                                      a_m + N*duration)
                overlap = overlap.set_value(Na, Nb, ovl + duration)
class Aggregates(object):
    filter_by = None
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
        ))  # TODO: localize
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    totals_df = None
    varlist = None

    def __init__(self, survey_scenario = None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by = None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}

        for var in self.varlist:
            # amounts and beneficiaries from current data, and from default data if it exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except:
                pass

    def compute_diff(self):
        '''
        Computes and adds relative differences
        '''

        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]

        if self.show_default:
            ref_dep_label, ref_benef_label = self.labels['dep_default'], self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label, ref_benef_label = self.labels['dep_real'], self.labels['benef_real']
        else:
            return

        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]

        self.aggr_frame[self.labels['dep_diff_rel']] = (dep - ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' % (now.strftime('%d-%m-%Y'), now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" % str(self.simulation.input_table.survey_year),
            ])

    def get_aggregate(self, variable, filter_by = None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
                   name of the variable aggregated according to its entity
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
        # amounts and beneficiaries from current data, and from default data if it exists
        # Build weights for each entity
        data = DataFrame(
            {
                variable: simulation.calculate_add(variable),
                weight_name: simulation.calculate(weight_name),
                }
            )
        data_default = None

        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default
        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame(
                {
                    variable: simulation.calculate(variable),
                    weight_name: simulation.calculate(weight_name),
                    filter_by_name: simulation.calculate(filter_by_name),
                    }
                )
            data_default = None
            filter_indicator = filtered_data[filter_by_name]
        m_b = {}

        weight = data[weight_name] * filter_indicator
        for name, data in datasets.iteritems():
            amount = data[variable]
            benef = data[variable].values != 0
            try:
                total_amount = int(round(sum(amount * weight) / 10 ** 6))
            except:
                total_amount = nan
            try:
                total_benef = int(round(sum(benef * weight) / 10 ** 3))
            except:
                total_benef = nan

            m_b[name] = [total_amount, total_benef]

        return m_b

    def load_amounts_from_file(self, filename = None, year = None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            data_dir = DATA_DIR

        try:
            filename = os.path.join(data_dir, "amounts.h5")
            store = HDFStore(filename)

            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            self.totals_df = DataFrame(data = {
                "amount": df_a[year] / 10 ** 6,
                "benef": df_b[year] / 1000,
                })
            row = DataFrame({'amount': nan, 'benef': nan}, index = ['logt'])
            self.totals_df = self.totals_df.append(row)

            # Add some additional totals
            for col in ['amount', 'benef']:
                # Deals with logt
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)

                # Deals with rsa rmi
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)

                # Deals with irpp, csg, crds
                for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                    if col in ['amount']:
                        val = - self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except:
            #  raise Exception(" No administrative data available for year " + str(year))
            import warnings
            warnings.warn("No administrative data available for year %s in file %s" % (str(year), filename))
            self.totals_df = None
            return

    def save_table(self, directory = None, filename = None, table_format = None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'

        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)

        fname = os.path.join(directory, filename)

        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index= False, header= True)
                descr = self.create_description()
                descr.to_excel(writer, "description", index = False, header=False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, "aggregates", index= False, header = True)
        except Exception as e:
            raise Exception("Aggregates: Error saving file", str(e))
import pandas as pd
from pandas import Series,DataFrame
import numpy as np

# source data
pr=pd.read_csv('primary_results.csv')
#pivoting
pr_piv= pr[['fips', 'candidate','fraction_votes']].pivot(index='fips', columns='candidate', values='fraction_votes')
pr_piv.drop(' No Preference', axis=1, inplace=True)
pr_piv.drop(' Uncommitted', axis=1, inplace=True)
pr_piv=pr_piv.dropna()
l=len(pr_piv.index)
pr_unpiv = DataFrame('',index=range(l*14),columns=['fips','fraction_votes','candidate'])

j=0
while j<len(pr_unpiv):
  for i in range(0,l-1):
    for c in pr_piv.columns:
      pr_unpiv.set_value(j, 'fips', pr_piv.index[i])
      pr_unpiv.set_value(j, 'fraction_votes', pr_piv.get_value(pr_piv.index[i],c))
      pr_unpiv.set_value(j, 'candidate', c)
      j+=1
pr_unpiv.to_csv('DataForTableau/primary_results_dropna.csv')
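For reference, the hand-rolled unpivot loop above can be written with pandas' own reshaping, which avoids the manual index bookkeeping (same content, though row and column order may differ):

pr_unpiv = pd.melt(pr_piv.reset_index(), id_vars='fips',
                   var_name='candidate', value_name='fraction_votes')
pr_unpiv.to_csv('DataForTableau/primary_results_dropna.csv')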
Example #19
    answers = filtered_data[filtered_data.question_id ==qid][['user_id','correct']]
    answers.columns = ['user_id','answer']
    users_subset = users.merge(answers,how='inner',on='user_id')

    #small adjustment to the mean to remove the effect of the question being analyzed
    users_subset['mean'] = (users_subset['mean']*
                            users_subset['count']-
                            users_subset['answer'])/(users_subset['count']-1)
    
    for quant in STUDENT_QUANTILES:
        quant2 = score_percentiles[quant]
        means = users_subset.groupby(by=[users_subset['percentile'] > quant2]).agg({'answer':'mean'})
        t = str(int(quant*100))
        try:
            prob_good =means.get_value(True,'answer')
            results.set_value(qid,'good_'+t,prob_good)
        except:
            pass
        try:
            prob_bad = means.get_value(False,'answer')
            results.set_value(qid,'bad_'+t,prob_bad)
        except:
            pass

### Plot the resulting ratios
for quant in STUDENT_QUANTILES:
    t = str(int(quant*100))
    plt.plot(results['bad_'+t],results['good_'+t],'b.')
    plt.plot(np.arange(0,1.1,.1),np.arange(0,1.1,.1),'g-',alpha=.5)
    plt.title("Discrimination: "+t+"th Percentile")
    plt.ylabel("Proportion Right, Good Students")