def find_degree_vector(dfi):
    # column sums ('coh') and row sums ('doc'), each wrapped as {degree: (label, kind)}
    results = [{dof: (dfi.columns[i], 'coh')} for i, dof in enumerate(dfi.sum(axis=0))]
    results.extend({dof: (dfi.index[i], 'doc')} for i, dof in enumerate(dfi.sum(axis=1)))
    # sort by the degree key; sorting the dicts themselves raises TypeError in Python 3
    return sorted(results, key=lambda d: next(iter(d)))
Example #2
def propNoteGraph(data_test,b_u,b_i,mu,L,R):
    # Build per-rating tables of correct (BON) / incorrect (MAUVAIS) predictions for plotting
    index_note = np.arange(1,6)
    count_1 = np.zeros([5,2])
    count_2 = np.zeros([5,2])
    notes = DataFrame(count_1,index=index_note,columns=['BON','MAUVAIS'])
    notes_naif = DataFrame(count_2,index=index_note,columns=['BON','MAUVAIS'])
    
    for r in range(data_test.shape[0]):
#        r_pred = round(mu + b_u[data_test.user_id.values[r]] + b_i[data_test.movie_id.values[r]] + X[data_test.user_id.values[r],data_test.movie_id.values[r]])           
        mean = mu + b_u[data_test[r,0]] + b_i[data_test[r,1]]        
        r_pred = round(mean + np.dot(L[data_test[r,0],:],R[data_test[r,1],:]))          
        r_pred = min(5,r_pred)
        r_pred = max(1,r_pred)
        r_true = int(round(mean+data_test[r,2]))
        r_naif = round(mean)

        if r_naif == r_true:
            notes_naif.loc[r_true, 'BON'] += 1
        else:
            notes_naif.loc[r_true, 'MAUVAIS'] += 1

        if r_pred == r_true:
            notes.loc[r_true, 'BON'] += 1
        else:
            notes.loc[r_pred, 'MAUVAIS'] += 1
                
    notes_naif_prop = notes_naif.div(notes_naif.sum(1),axis=0)
    notes_prop = notes.div(notes.sum(1),axis=0)
    
    notes_naif_VS_algo = pd.concat([notes_prop.BON,notes_naif_prop.BON], axis=1)
    notes_naif_VS_algo.columns = ['ALGO','NAIF']
    return notes_naif_VS_algo
Example #3
def hmm_build(alphabet, aln, threshold, sigma):
    '''Given alphabet, multiple alignment aln, insertion threshold and pseudocount sigma,
    return the profile HMM transition and emission matrix.'''
    
    aln_cols = list(zip(*(aln)))
    m, n = len(aln), len(aln_cols)       # m sequences, n columns

    # indices of columns where '-' count is below threshold
    match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold]

    # state names
    k = len(match_cols)   # k states
    states_ = ['M{0} D{0} I{0}'.format(i).split() for i in range(1, k + 1)]
    states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E']
    
    # building matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states) 

    for seq in aln:  # iterate through each sequence
        state_ix = 0
        last_state = 'S'
        for i in range(n):
            if i in match_cols:
                state_ix += 1
                if seq[i] != '-':
                    current_state = 'M' + str(state_ix)
                    emissions.loc[current_state, seq[i]] += 1
                else:
                    current_state = 'D' + str(state_ix)
                
                transitions.loc[last_state, current_state] += 1
                last_state = current_state
            
            elif seq[i] != '-':
                current_state = 'I' + str(state_ix)
                transitions.loc[last_state, current_state] += 1
                emissions.loc[current_state, seq[i]] += 1
                last_state = current_state
                            
        transitions.loc[last_state, 'E'] += 1

    # scale rows to [0, 1]
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)
    
    #add pseudocounts
    transitions.iloc[:2, 1:4] += sigma
    transitions.iloc[-4:-1, -2:] += sigma
    for i in range(k):
        transitions.iloc[i*3-1:i*3+2, i*3+1:i*3+4] += sigma
        emissions.iloc[i*3+1:i*3+3, :] += sigma
    emissions.iloc[-2, :] += sigma
    
    # scale again
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

    return transitions, emissions
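A quick way to exercise hmm_build is a toy alignment; the alphabet, sequences and parameters below are made-up inputs (and assume `from pandas import DataFrame` is in scope, as in the snippet):

# Hypothetical toy input: three symbols, five aligned sequences.
alphabet = ['A', 'B', 'C']
aln = ['A-A', 'ABA', 'A-C', 'AB-', '-BA']
transitions, emissions = hmm_build(alphabet, aln, threshold=0.4, sigma=0.01)
print(transitions)   # (3k + 3) x (3k + 3) row-normalized transition matrix, here 9 x 9
print(emissions)     # one row per state, one column per alphabet symbol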
Example #4
def descriptiveStatsDataFrame():
    df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7,10]], index=['a','b','c','d'], columns=['one','two'])
    print (df)
    print ('Column Sum: \n{}'.format(df.sum(axis=0)))
    print ('Row Sum: \n{}'.format(df.sum(axis=1)))
    print ('Do not skip NA: \n{}'.format(df.sum(axis=1, skipna=False)))
    print ('Index with min Value: \n{}'.format(df.idxmin()))
    print ('Summary Statistic: \n{}'.format(df.describe()))
Example #5
def aggregate_chunks(mod_features_df, modality):
    without_info_df = mod_features_df.query('field != "info"')
    cnt_df = DataFrame([list(mod_features_df.loc[('info', 'count'), :].values)] * len(without_info_df),
                       index=without_info_df.index)
    agg_df = without_info_df * cnt_df
    agg_df = DataFrame(agg_df.sum(axis=1) / cnt_df.sum(axis=1), index=without_info_df.index)
    agg_df['modality'] = modality
    agg_df.set_index('modality', append=True, inplace=True)
    agg_df = agg_df.reorder_levels(['modality', 'field', 'feature'])
    return agg_df
Example #6
def summaryStatDataFrame():
    df = DataFrame(np.arange(12).reshape(4,3),
                   index=[['a','a','b','b'],[1,2,1,2]],
                   columns=[['Ohio','Ohio','Colorado'],
                            ['Green', 'Red','Green']])
    df.index.names = ['key1','key2']
    df.columns.names = ['state','color']
    print (df)
    # df.sum(level=...) was removed in pandas 2.0; groupby(level=...) is the equivalent
    print ('Sum of key1: \n{}'.format(df.groupby(level='key1').sum()))
    print ('Sum of key2: \n{}'.format(df.groupby(level='key2').sum()))
    print ('Sum of state: \n{}'.format(df.T.groupby(level='state').sum().T))
    print ('Sum of color: \n{}'.format(df.T.groupby(level='color').sum().T))
Example #7
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath\
                 (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath\
                 (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
Example #8
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {}\
                 (threshold = {}, only research-threads)".format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
Example #9
def analyze():
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()
    
    print("got %d signals from %d devices" % (len(signals), len(devices)))

    signals = signals.groupby(["frequency", "id"]).size()
    signals = signals.reindex(MultiIndex.from_product([SPECTRUM, devices],
                                                      names=signals.index.names),
                              fill_value=0)
    signals = signals.unstack("id")
    
    # let's only keep frequencies with all signals present
    candidates = signals.dropna()
    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total":   candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    appropriate_freq = candidates.sort_values(["weakest", "total"],
                                              ascending=False).index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")
    
    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter

    p=signals.plot(kind="Area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
Example #10
def getNgrams(query, corpus, startYear, endYear, smoothing):
    # `corpora` (name -> corpus id) and `filename` are assumed to be defined
    # at module level in the original script.
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing)

    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    res = re.findall('var data = (.*?);\\n', req.text)
    if res:
        data = {qry['ngram']: qry['timeseries']
                for qry in literal_eval(res[0])}
        df = DataFrame(data)
        df_sum = df.sum(axis=1)
        final_sum = df_sum.loc[[0]]
        final_sum.to_csv(filename, mode='a', header=False, index=False)
        print('Data saved to %s' % filename)
    else:
        df = DataFrame()
    return req.url, params['content'], df
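A minimal call sketch, supplying the module-level names the function assumes (the corpus-id mapping and output path here are hypothetical placeholders):

# Hypothetical module-level setup assumed by getNgrams above.
import re
import requests
from ast import literal_eval
from pandas import DataFrame

corpora = {'eng_2012': 15}        # assumed mapping of corpus name to Google Ngram corpus id
filename = 'ngrams_output.csv'    # assumed output path for the summed timeseries

url, content, df = getNgrams('Albert Einstein', 'eng_2012', 1900, 2000, smoothing=3)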
Example #11
    def _get_most_frequent_word(lower_rank_bound: int,
                                dtm_data_frame: pd.DataFrame) -> pd.DataFrame:
        """Get the most frequent words in final_matrix and words.

        The new count matrix will consists of only the most frequent words in
        the whole corpus.
        :param lower_rank_bound: the lowest rank to remain in the matrix
                                 (the rank is determined by the word's number
                                 of appearance in the whole corpus)
                                 (ranked from high to low)
        :param dtm_data_frame: the dtm in the form of panda data frame.
                                the indices(rows) are segment names
                                the columns are words.
        :return:
            dtm data frame with only the most frequent words
        """
        # get the word count of each word in the corpus (a panda series)
        corpus_word_count: pd.Series = dtm_data_frame.sum(axis='index')

        # sort the word list
        sorted_word_count: pd.Series \
            = corpus_word_count.sort_values(ascending=False)

        # get the first "lower_rank_bound" number of item
        most_frequent_counts: pd.Series \
            = sorted_word_count.head(lower_rank_bound)

        # get the most frequent words (the index of the count)
        most_frequent_words = most_frequent_counts.index

        return dtm_data_frame[most_frequent_words]
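A hedged usage sketch with a made-up two-segment DTM, treating the helper as a plain function:

import pandas as pd

# Hypothetical DTM: rows are segments, columns are words, values are counts.
dtm = pd.DataFrame({'the': [4, 6], 'cat': [1, 0], 'sat': [2, 3]},
                   index=['segment_1', 'segment_2'])
top_two = _get_most_frequent_word(lower_rank_bound=2, dtm_data_frame=dtm)
# top_two keeps only the 'the' and 'sat' columns (corpus counts 10 and 5).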
Example #12
def hmm_build(alphabet, aln, threshold):
    '''given alphabet, multiple alignment aln, and insertion threshold,
    return the profile HMM transition and emission matrix.'''
    
    aln_cols = list(zip(*(aln)))
    m, n = len(aln), len(aln_cols)       # m sequences, n columns

    # indices of columns where '-' count is below threshold
    match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold]

    # state names
    k = len(match_cols)   # k states
    states_ = [('M'+ str(i), 'D' + str(i), 'I' + str(i)) for i in range(1, k + 1)]
    states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E']

    # building matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states) 

    for seq in aln:  # iterate through each sequence
        state_ix = 0
        last_state = 'S'
        for i in range(n):
            if i in match_cols:
                state_ix += 1
                if seq[i] != '-':
                    current_state = 'M' + str(state_ix)
                    emissions.loc[current_state, seq[i]] += 1
                else:
                    current_state = 'D' + str(state_ix)
                
                transitions.loc[last_state, current_state] += 1
                last_state = current_state
            
            elif seq[i] != '-':
                current_state = 'I' + str(state_ix)
                transitions.loc[last_state, current_state] += 1
                emissions.loc[current_state, seq[i]] += 1
                last_state = current_state
                            
        transitions.loc[last_state, 'E'] += 1

    # normalize rows
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

    return transitions, emissions
Example #13
    def agg(self):
        dframe = DataFrame(index=self.column.index)

        dframe = self._build_dframe(dframe, self.columns)
        column_names = [self._name_for_idx(i) for i in range(0, 2)]
        dframe = dframe.dropna(subset=column_names)

        dframe = DataFrame([dframe.sum().to_dict()])

        return self._add_calculated_column(dframe)
Example #14
    def agg(self):
        dframe = DataFrame(index=[0])

        columns = [
            Series([col]) for col in [self.column.sum(), len(self.column)]]

        dframe = self._build_dframe(dframe, columns)
        dframe = DataFrame([dframe.sum().to_dict()])

        return self._add_calculated_column(dframe)
Example #15
def numpy_dot():
    '''
    Imagine a point system in which each country is awarded 4 points for each
    gold medal,  2 points for each silver medal, and one point for each
    bronze medal.

    Using the numpy.dot function, create a new dataframe called
    'olympic_points_df' that includes:
        a) a column called 'country_name' with the country name
        b) a column called 'points' with the total number of points the country
           earned at the Sochi olympics.

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'country_name':countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}


    df = DataFrame(olympic_medal_counts)



    gold_points = df[['gold']].applymap(lambda x: x*4)
    silver_points = df[['silver']].applymap(lambda x: x*2)
    bronze_points = df[['bronze']].applymap(lambda x: x*1)

    medal_points = DataFrame({'gold_points': gold_points.iloc[:, 0],
                              'silver_points': silver_points.iloc[:, 0],
                              'bronze_points': bronze_points.iloc[:, 0]})

    medal_sums = medal_points.sum(axis=1)

    #instructors solution
    #medal_counts = df[['gold', 'silver', 'bronze']]
    #points = numpy.dot(medal_counts, [4, 2, 1])

    olympic_points_df = DataFrame({'country_name': countries,
                                   'points': medal_sums})

    return olympic_points_df
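For reference, the commented-out instructor solution above points to a vectorized route; a hedged sketch of that inner block, reusing the df and countries built inside numpy_dot, would look like:

# Sketch of the vectorized variant hinted at by the instructor comment.
import numpy
medal_counts = df[['gold', 'silver', 'bronze']]            # df as built in numpy_dot
points = numpy.dot(medal_counts, [4, 2, 1])                # 4/2/1 points per gold/silver/bronze
olympic_points_df = DataFrame({'country_name': countries, 'points': points})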
Example #16
    def test_stale_cached_series_bug_473(self):

        # this is chained, but ok
        with option_context('chained_assignment', None):
            Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
                          columns=('e', 'f', 'g', 'h'))
            repr(Y)
            Y['e'] = Y['e'].astype('object')
            Y['g']['c'] = np.NaN
            repr(Y)
            result = Y.sum()  # noqa
            exp = Y['g'].sum()  # noqa
            assert pd.isna(Y['g']['c'])
Example #17
class I8Merge(object):

    params = ['inner', 'outer', 'left', 'right']
    param_names = ['how']

    def setup(self, how):
        low, high, n = -1000, 1000, 10**6
        self.left = DataFrame(np.random.randint(low, high, (n, 7)),
                              columns=list('ABCDEFG'))
        self.left['left'] = self.left.sum(axis=1)
        self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1)
        self.right = self.right.reset_index(drop=True)
        self.right['right'] *= -1

    def time_i8merge(self, how):
        merge(self.left, self.right, how=how)
Example #18
def generate_probability_vector_result(output_path):

    cluster_frame = pd.read_csv(output_path + '/clusters.csv', header=None)
    cluster_frame = cluster_frame.set_index(cluster_frame.iloc[:, 0]).iloc[:, 1:]
    cluster_array = cluster_frame.values

    points_frame = pd.read_csv(output_path + '/points.csv', header=None)
    # points_frame = points_frame.drop_duplicates()
    points_array = points_frame.values

    distance_matrix = pw.euclidean_distances(cluster_array, points_array)
    distance_matrix = distance_matrix.T
    distance_frame = DataFrame(distance_matrix)
    # print(distance_frame)
    # print(distance_frame.sum(axis=1))
    distance_frame = distance_frame.div(distance_frame.sum(axis=1), axis=0)
    distance_frame.to_csv(output_path + '/probability.csv')
Example #19
def edbSave():
    'Fetch the EDB codes from the user clipboard and call the API to retrieve the indicator data'

    # Get the codes from the clipboard and the user-entered start and end dates
    codes = getCodeFromClipboard()
    start = sDate()
    end = eDate()

    data = w.edb(codes, start, end, "Fill=Previous")
    datachg = [d.strftime('%y-%m-%d') for d in data.Times]
    df = DataFrame(data.Data, index=data.Codes, columns=datachg).T
    print('-' * 85)
    print(df)
    print('-' * 85)
    print('Summary statistics:')
    print(df.describe())
    print("sum", " " * 3, str(df.sum()).split(sep="    ")[1].rjust(10))
    return df
Example #20
def calc_happiness(order, guest_dict):
    df = DataFrame(columns=order, index=order)

    for idx,guest in enumerate(order[:-1]):
        # print "{} -> {}: {}".format(
        #     guest,
        #     order[idx+1],
        #     gd[guest][order[idx+1]]
        #     )

        df.loc[order[idx+1], guest] = guest_dict[guest][order[idx+1]]
        df.loc[guest, order[idx+1]] = guest_dict[order[idx+1]][guest]

    df.loc[order[-1], order[0]] = guest_dict[order[0]][order[-1]]
    df.loc[order[0], order[-1]] = guest_dict[order[-1]][order[0]]


    return df.sum().sum()
Example #21
def predict_random_category(y_test, n=1000):
    """ Uses bootstrapping to compute the expected prediction by chance for each category.
    Parameters:
        y_test (array): Labels
        n (int): number of randomizations.
    Returns:
        mean, sd: per-class chance-level accuracy (in %) and its standard deviation."""

    # Create a data frame with random predictions.
    random_ = DataFrame({i: shuffle_predict(y_test) for i in range(n)})
    random_.index = y_test

    # Average over the n randomizations, then aggregate per true class.
    random_ = 1.0 * random_.sum(axis=1) / n
    grouped = random_.groupby(level=0)
    mean = grouped.mean() * 100.0
    sd = grouped.std() * 100.0
    return mean, sd
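The snippet relies on a shuffle_predict helper that is not shown here; one plausible stand-in, assuming it should return a 0/1 chance-hit indicator per sample, is:

import numpy as np

def shuffle_predict(y_test):
    # Assumed helper: permute the labels and mark where the permutation
    # happens to agree with the true labels (1 = chance hit, 0 = miss).
    y = np.asarray(y_test)
    return (np.random.permutation(y) == y).astype(int)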
Example #22
def two_column_summary(df, index, column, do_totals=True, do_prob=False):
    """Returns a DataFrame contingency (frequency) summary table for two columns.

    arguments:
    df        -- input DataFrame
    index     -- the column used to summarize vertically (rows of the table)
    column    -- the column used to summarize horizontally (columns of the table)
    do_totals -- adds a row and column summarizing the total along each axis
    do_prob   -- return probabilities instead of frequencies
    """

    # test input
    if not (column in df.columns):
        raise ValueError("[two_column_summary] '%s' no a valid column name" % column)
    if not (index in df.columns):
        raise ValueError("[two_column_summary] '%s' no a valid column name" % index)

    # group for each column
    unique_col_values = df[column].unique()
    cols = []
    for v in unique_col_values:
        mask = df[column]==v
        cols.append(df[mask].groupby(index))

    # glue groups back together
    df_summary = DataFrame()
    for idx, c in enumerate(cols):
        d = c.count()
        d.columns = [unique_col_values[idx]]
        df_summary = pandas.concat([df_summary, d], axis=1)

    # add total row and column
    if do_totals:
        df_summary['total'] = df_summary.apply(sum, axis=1)
        df_summary.loc['total'] = df_summary.sum()

    # convert counts into probabilities
    if do_prob:
        df_summary = df_summary / df_summary.loc['total', 'total']

    return df_summary
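A small usage sketch on toy data (hypothetical column names; the imports match what the function body relies on):

import pandas
from pandas import DataFrame   # the function body uses both `pandas` and the bare DataFrame name

# Hypothetical survey table: one row per respondent.
df = pandas.DataFrame({'sex':    ['F', 'M', 'F', 'M', 'F'],
                       'smoker': ['no', 'no', 'yes', 'yes', 'no']})
print(two_column_summary(df, index='sex', column='smoker'))
# Rows are 'sex' values, columns are 'smoker' values, plus a 'total' row and column.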
Example #23
 def plot_centre_crowd(self, thresh=2, show_threads=False, **kwargs):
     """Plotting evolution of number of participants close to centre"""
     project, show, _ = ac.handle_kwargs(**kwargs)
     data = self.__get_centre_distances(thresh, split=False)
     data_close = DataFrame({
         '6 hours': data[data <= .25].count(axis=1),
         '12 hours': data[(data <= .5) & (data > .25)].count(axis=1),
         '24 hours': data[(data <= 1) & (data > .5)].count(axis=1)},
                            columns=['6 hours', '12 hours', '24 hours'])
     plt.style.use(SETTINGS['style'])
     y_max = data_close.sum(axis=1).max()
     _, axes = plt.subplots()
     data_close.plot(kind="area", ax=axes, stacked=True,
                     color=['darkslategray', 'steelblue', 'lightgray'])
     axes.set_yticks(range(1, y_max + 1))
     axes.set_ylabel("Number of participants")
     axes.set_title("Crowd close to the centre of discussion in {}".format(
         project))
     axes.xaxis.set_ticks_position('bottom')
     axes.yaxis.set_ticks_position('left')
     if show_threads:
         self.__show_threads(axes)
     ac.show_or_save(show)
Example #24
    def process(self, start_time: datetime, end_time: datetime, input:DataFrame):
        if str(self.name) not in '+-*/':
            raise ValueError("Unknown math function: " + str(self.name))

        ret = DataFrame()

        # two args means we're doing A + B
        if len(self._args) == 2:
            left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0]
            right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1]

            for l_col in left.columns:
                for r_col in right.columns:
                    if self.name == '+':
                        t = left[l_col] + right[r_col]
                    elif self.name == '-':
                        t = left[l_col] - right[r_col]
                    elif self.name == '*':
                        t = left[l_col] * right[r_col]
                    elif self.name == '/':
                        t = left[l_col] / right[r_col]
                    else:
                        raise ValueError("Unknown operator: " + str(self.name))

                    t = DataFrame(t)
                    t.columns = [l_col + self.name + r_col]

                    print(left.head())
                    print(right.head())
                    print(t.head())
                    ret = ret.combine_first(t)

        else:  # everything is in the input DataFrame
            ret = DataFrame(input.sum(axis=0))
            ret.columns = [' + '.join(input.columns)]

        return ret
Example #25
def fix_event_type(df: DataFrame):
    '''
    Not sure yet.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''

    a = time.time()

    colsf = df['id'].ravel()            # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []                       # list of unique counts (UNUSED)
    counts_bucket = []                  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)             # create dummy variables
    todrop = df.sum() < 50              # get columns where sum of dummy column < 50
    dropcols = df.columns[todrop]       # get those column names
    df = df.drop(dropcols, axis=1)      # drop those columns
    df['num_events'] = 0                # create number of events columns, set to 0
    # print(df.columns)
    print(str(len(unique)))

    for ii in range(0,len(unique)):     # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]     # subset by that ID
        the_dummies = subset.columns != 'id'        # get all columns that do not equal that ID
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist()  # get all of those columns to list
        event_sum = np.sum(aa)      # sum all of those
        
        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:,subset.columns != 'id'] = aa
        df.loc[subset.index, the_dummies] = aa          # DataFrame.set_value was removed; .loc is the equivalent
        df.loc[subset.index, 'num_events'] = event_sum
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b-a)
    return df
Example #26
for i, group in enumerate(Groups):
    TotalCells_Area[SGroup[i]+'_Mean'] = TotalCells_Area[group].mean(axis =1)
    TotalCells_Area[SGroup[i]+'_Values']= TotalCells_Area[group].count(axis=1)
    TotalCells_Area[SGroup[i]+'_Stdev'] = TotalCells_Area[group].std(axis=1)
    TotalCells_Area[SGroup[i]+'_Serror'] = TotalCells_Area[SGroup[i]+'_Stdev']/np.sqrt(TotalCells_Area[SGroup[i]+'_Values'])
## Saving Table

TotalCells.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2  p16 p30 p60 Septotemporal\SGZ\Total_Cells_C57_SW_SGZ.csv')
SGZArea.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2  p16 p30 p60 Septotemporal\SGZ\SGZArea_C57_SW_SGZ.csv')
TotalCells_Area.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2  p16 p30 p60 Septotemporal\SGZ\Total_Cells_Area_C57_SW_SGZ.csv')

TotalCells[Dictionary['C57_p30']]
TotalCells[Dictionary['C57']]
#Calculating Density for C57 and C57_p30
 
TotalCellsSum = TotalCells.sum()
SGZAreaSum = SGZArea.sum()     
Density = TotalCellsSum/SGZAreaSum
Density =  Density[0:13]
DensityTable = Series([Density[0:4].mean(),Density[4:9].mean(),Density[9:13].mean()], index =['C57 P16','C57 P30','C57 P60'])
C57_p16_Error =Density[0:4].std()/sqrt(Density[0:4].count())
C57_p30_Error = Density[4:9].std()/sqrt(Density[4:9].count())
C57_Error = Density[9:13].std()/sqrt(Density[9:13].count())

#Density.to_csv('C:\Users\KBermudez-Hernandez\Documents\Dropbox\Figures\Figure1 p16 p30 p60 density\Prox_density_bar_graph\P16 P30 P60 Prox1 Density.csv')

#Plotting Density Graph
plt.figure()   
DensityTable.plot(kind='bar',yerr=[C57_p16_Error,C57_p30_Error,C57_Error])
plt.ylabel('Density of Prox1 in SGZ')
plt.xticks(rotation=0)
Example #27
class Scores(object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------

        """
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )

        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=labels,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):

        super(Scores, self).__init__()

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index(
                [s + (t, ) for s, t in annotation.itertracks()],
                name=names)

        else:
            annotation = Annotation(uri=uri, modality=modality)
            index = MultiIndex(levels=[list() for name in names],
                               labels=[list() for name in names],
                               names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        dtype = float if values is None else values.dtype  # np.float was removed from NumPy

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline())

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline())

    def itersegments(self):
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """

        return self.annotation_.new_track(segment, candidate=None, prefix=None)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self.dataframe_.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def _reindexIfNeeded(self):

        if not self.hasChanged_:
            return

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        new_index = Index(
            [s + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """
        """

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]
        new_index = Index(
            [s + (t, ) for s, t in annotation.itertracks()],
            name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func, axis=0):

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """

        ranked = self.copy()
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.NaN)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )

        else:

            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True, other=np.NaN)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""

        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            keep = [new_annotation.has_track(segment, track)
                    for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                            label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                         label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        from .notebook import repr_scores
        return repr_scores(self)
Example #28
import numpy as np
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.loc['b']
## Summarizing and Computing Descriptive Statistics
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web  # note: pandas.io.data later moved to the separate pandas_datareader package
all_data = {}
Example #29
print "df.head - first 5 rows"
print df.head()
import os
os.remove(r'./births1880.txt')

uqNames = df['Names'].unique()

print "df['names'].unique()"
print uqNames

print "df.names.describe()"
print df['Names'].describe()

df = df.groupby("Names")  #group by name
print df
df = df.sum() # applys sum to each groupBy obj
print df
#above is equivalent os select sum(births) from df group by names;


Sorted = df.sort(columns="Births", ascending=False)
print Sorted.head(1)

#or
df['Births'].max()

#Create Graph
df['Births'].plot(kind="bar")

print "The most popular name"
df.sort(columns='Births', ascending=False)
for i, group  in enumerate(Groups):
    TotalCells_Area[SGroup[i]+'_Mean'] = TotalCells_Area[group].mean(axis =1)
    TotalCells_Area[SGroup[i]+'_Values']= TotalCells_Area[group].count(axis=1)
    TotalCells_Area[SGroup[i]+'_Stdev'] = TotalCells_Area[group].std(axis=1)
    TotalCells_Area[SGroup[i]+'_Serror'] = TotalCells_Area[SGroup[i]+'_Stdev']/np.sqrt(TotalCells_Area[SGroup[i]+'_Values'])
## Saving Table

TotalCells.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\Total_Cells_CreBax_C_H_Hilus.csv')
HilusArea.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\HilusArea_CreBax_C_H_Hilus.csv')
TotalCells_Area.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\Total_Cells_Area_C_H_CreBax_Hilus.csv')



#Calculating Density of All
 
TotalCellsSum = TotalCells.sum()
HilusAreaSum = HilusArea.sum()     
Density = TotalCellsSum/HilusAreaSum

DensityTable = Series([Density[P30_Positive_Tam].mean(),Density[P30_Negative_Tam].mean(),Density[P60_Positive_Tam].mean(),Density[P60_Negative_Tam].mean()], index =['P30_Positive_Tam','P30_Negative_Tam', 'P60_Positive_Tam' ,'P60_Negative_Tam'])
P30_Positive_Tam_Error =Density[P30_Positive_Tam].std()/sqrt(Density[P30_Positive_Tam].count())
P30_Negative_Tam_Error = Density[P30_Negative_Tam].std()/sqrt(Density[P30_Negative_Tam].count())
P60_Positive_Tam_Error = Density[P60_Positive_Tam].std()/sqrt(Density[P60_Positive_Tam].count())
P60_Negative_Tam_Error = Density[P60_Negative_Tam].std()/sqrt(Density[P60_Negative_Tam].count())



#Density.to_csv('C:\Users\KBermudez-Hernandez\Documents\Dropbox\Figures\Figure1 p16 p30 p60 density\Prox_density_bar_graph\P16 P30 P60 Prox1 Density.csv')

#Plotting Density Graph
plt.figure()