def propNoteGraph(data_test,b_u,b_i,mu,L,R):
    # Build the per-rating comparison between the algorithm's and the naive predictions
    index_note = np.arange(1,6)
    count_1 = np.zeros([5,2])
    count_2 = np.zeros([5,2])
    notes = DataFrame(count_1,index=index_note,columns=['BON','MAUVAIS'])
    notes_naif = DataFrame(count_2,index=index_note,columns=['BON','MAUVAIS'])
    
    for r in range(data_test.shape[0]):
#        r_pred = round(mu + b_u[data_test.user_id.values[r]] + b_i[data_test.movie_id.values[r]] + X[data_test.user_id.values[r],data_test.movie_id.values[r]])           
        mean = mu + b_u[data_test[r,0]] + b_i[data_test[r,1]]        
        r_pred = round(mean + np.dot(L[data_test[r,0],:],R[data_test[r,1],:]))          
        r_pred = min(5,r_pred)
        r_pred = max(1,r_pred)
        r_true = int(round(mean+data_test[r,2]))
        r_naif = round(mean)

        if r_naif == r_true:
            notes_naif.loc[r_true, 'BON'] += 1
        else:
            notes_naif.loc[r_true, 'MAUVAIS'] += 1

        if r_pred == r_true:
            notes.loc[r_true, 'BON'] += 1
        else:
            notes.loc[r_pred, 'MAUVAIS'] += 1
                
    notes_naif_prop = notes_naif.div(notes_naif.sum(1),axis=0)
    notes_prop = notes.div(notes.sum(1),axis=0)
    
    notes_naif_VS_algo = pd.concat([notes_prop.BON,notes_naif_prop.BON], axis=1)
    notes_naif_VS_algo.columns = ['ALGO','NAIF']
    return notes_naif_VS_algo
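# A minimal usage sketch for propNoteGraph (inputs invented for illustration):
# data_test rows are laid out as [user_id, item_id, centred rating], as the loop above assumes.
import numpy as np
import pandas as pd
from pandas import DataFrame

data_test = np.array([[0, 1, 1], [1, 0, 0], [1, 1, -1]])  # hypothetical toy test set
mu = 3.0
b_u = np.array([0.1, -0.2])      # per-user bias
b_i = np.array([0.0, 0.3])       # per-item bias
L = np.random.rand(2, 4)         # user latent factors
R = np.random.rand(2, 4)         # item latent factors
print(propNoteGraph(data_test, b_u, b_i, mu, L, R))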
def find_degree_vector(dfi):
    # Pair each column sum ('coh') and each row sum ('doc') with its label,
    # then sort the single-key dicts by their degree value.
    results = [{dof: (dfi.columns[i], 'coh')} for i, dof in enumerate(dfi.sum(axis=0))]
    temp = [{dof: (dfi.index[i], 'doc')} for i, dof in enumerate(dfi.sum(axis=1))]
    results.extend(temp)
    # results = [{i[0]: (i[1], 'coh')} for i in dfi.sum()]
    return sorted(results, key=lambda d: next(iter(d)))
Example #3
def hmm_build(alphabet, aln, threshold, sigma):
    '''Given alphabet, multiple alignment aln, insertion threshold and pseudocount sigma,
    return the profile HMM transition and emission matrix.'''
    
    aln_cols = list(zip(*(aln)))
    m, n = len(aln), len(aln_cols)       # m sequences, n columns

    # indices of columns where '-' count is below threshold
    match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold]

    # state names
    k = len(match_cols)   # k states
    states_ = ['M{0} D{0} I{0}'.format(i).split() for i in range(1, k + 1)]
    states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E']
    
    # building matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states) 

    for seq in aln:  # iterate through each sequence
        state_ix = 0
        last_state = 'S'
        for i in range(n):
            if i in match_cols:
                state_ix += 1
                if seq[i] != '-':
                    current_state = 'M' + str(state_ix)
                    emissions.loc[current_state, seq[i]] += 1
                else:
                    current_state = 'D' + str(state_ix)
                
                transitions.loc[last_state, current_state] += 1
                last_state = current_state
            
            elif seq[i] != '-':
                current_state = 'I' + str(state_ix)
                transitions.loc[last_state, current_state] += 1
                emissions.loc[current_state, seq[i]] += 1
                last_state = current_state
                            
        transitions.loc[last_state, 'E'] += 1

    # normalize rows so the probabilities in each row sum to 1
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)
    
    # add pseudocounts
    transitions.iloc[:2, 1:4] += sigma
    transitions.iloc[-4:-1, -2:] += sigma
    for i in range(k):
        transitions.iloc[i*3-1:i*3+2, i*3+1:i*3+4] += sigma
        emissions.iloc[i*3+1:i*3+3, :] += sigma
    emissions.iloc[-2, :] += sigma
    
    # renormalize after adding pseudocounts
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

    return transitions, emissions
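# A small usage sketch (inputs invented for illustration): three aligned sequences over a
# two-letter alphabet, insertion threshold 0.35 and pseudocount 0.01. The middle column has
# too many gaps, so it becomes an insert column rather than a match state.
from pandas import DataFrame   # hmm_build above expects DataFrame in scope

alphabet = ['A', 'B']
aln = ['A-A', 'ABA', 'A-B']
trans, emit = hmm_build(alphabet, aln, threshold=0.35, sigma=0.01)
print(trans)   # transition probabilities over S, I0, M1/D1/I1, M2/D2/I2, E
print(emit)    # per-state emission probabilities of 'A' and 'B'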
Example #4
def aggregate_chunks(mod_features_df, modality):
    without_info_df = mod_features_df.query('field != "info"')
    cnt_df = DataFrame([list(mod_features_df.loc[('info', 'count'), :].values)] * len(without_info_df),
                       index=without_info_df.index)
    agg_df = without_info_df * cnt_df
    agg_df = DataFrame(agg_df.sum(axis=1) / cnt_df.sum(axis=1), index=without_info_df.index)
    agg_df['modality'] = modality
    agg_df.set_index('modality', append=True, inplace=True)
    agg_df = agg_df.reorder_levels(['modality', 'field', 'feature'])
    return agg_df
def summaryStatDataFrame():
    df = DataFrame(np.arange(12).reshape(4,3),
                   index=[['a','a','b','b'],[1,2,1,2]],
                   columns=[['Ohio','Ohio','Colorado'],
                            ['Green', 'Red','Green']])
    df.index.names = ['key1','key2']
    df.columns.names = ['state','color']
    print (df)
    print ('Sum of key1: \n{}'.format(df.sum(level='key1')))
    print ('Sum of key2: \n{}'.format(df.sum(level='key2')))
    print ('Sum of state: \n{}'.format(df.sum(level='state', axis = 1)))
    print ('Sum of color: \n{}'.format(df.sum(level='color', axis = 1)))
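# Note: DataFrame.sum(level=...) as used above was removed in pandas 2.0; a sketch of the
# same four summaries written with groupby, assuming the same df built in the function above.
def summaryStatDataFrameGroupby(df):
    print('Sum of key1: \n{}'.format(df.groupby(level='key1').sum()))
    print('Sum of key2: \n{}'.format(df.groupby(level='key2').sum()))
    print('Sum of state: \n{}'.format(df.T.groupby(level='state').sum().T))
    print('Sum of color: \n{}'.format(df.T.groupby(level='color').sum().T))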
Example #6
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath\
                 (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath\
                 (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
Example #7
    def _get_most_frequent_word(lower_rank_bound: int,
                                dtm_data_frame: pd.DataFrame) -> pd.DataFrame:
        """Get the most frequent words in final_matrix and words.

        The new count matrix will consist of only the most frequent words in
        the whole corpus.
        :param lower_rank_bound: the lowest rank to remain in the matrix
                                 (the rank is determined by the word's number
                                 of appearance in the whole corpus)
                                 (ranked from high to low)
        :param dtm_data_frame: the dtm in the form of panda data frame.
                                the indices(rows) are segment names
                                the columns are words.
        :return:
            dtm data frame with only the most frequent words
        """
        # get the word count of each word in the corpus (a panda series)
        corpus_word_count: pd.Series = dtm_data_frame.sum(axis='index')

        # sort the word list
        sorted_word_count: pd.Series \
            = corpus_word_count.sort_values(ascending=False)

        # get the first "lower_rank_bound" number of item
        most_frequent_counts: pd.Series \
            = sorted_word_count.head(lower_rank_bound)

        # get the most frequent words (the index of the count)
        most_frequent_words = most_frequent_counts.index

        return dtm_data_frame[most_frequent_words]
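# To make the ranking concrete, a standalone sketch of the same sort-and-head idea on a
# hypothetical 2-segment, 4-word DTM (not a call to the method above, which lives on a class
# not shown here).
import pandas as pd

dtm = pd.DataFrame([[3, 0, 1, 2],
                    [1, 1, 0, 5]],
                   index=['seg1', 'seg2'],
                   columns=['alpha', 'beta', 'gamma', 'delta'])
top_words = dtm.sum(axis='index').sort_values(ascending=False).head(2).index
print(dtm[top_words])   # keeps only 'delta' (7 occurrences) and 'alpha' (4)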
def getNgrams(query, corpus, startYear, endYear, smoothing):
    # 'corpora' (corpus name -> id) and 'filename' (output CSV path) are expected
    # to be defined at module level.
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing)

    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    res = re.findall('var data = (.*?);\\n', req.text)
    if res:
        data = {qry['ngram']: qry['timeseries']
                for qry in literal_eval(res[0])}
        df = DataFrame(data)
        df_sum = df.sum(axis=1)
        final_sum = df_sum.loc[[0]]
    else:
        df = DataFrame()
        final_sum = df.sum(axis=1)  # empty Series when nothing was parsed

    final_sum.to_csv(filename, mode='a', header=False, index=False)
    print('Data saved to %s' % filename)
    return req.url, params['content'], df
Example #9
def analyze():
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()
    
    print("got %d signals from %d devices" % (len(signals), len(devices)))

    signals = signals.groupby(["frequency", "id"]).size()
    signals = signals.reindex(MultiIndex.from_product([SPECTRUM, devices],
                                                      names=signals.index.names),
                              fill_value=0)
    signals = signals.unstack("id")
    
    # let's only keep frequencies with all signals present
    candidates = signals.dropna()
    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total":   candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    appropriate_freq = candidates.sort_values(by=["weakest", "total"],
                                              ascending=False).index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")
    
    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter

    p=signals.plot(kind="Area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
Example #10
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {}\
                 (threshold = {}, only research-threads)".format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
Example #11
def hmm_build(alphabet, aln, threshold):
    '''given alphabet, multiple alignment aln, and insertion threshold,
    return the profile HMM transition and emission matrix.'''
    
    aln_cols = list(zip(*(aln)))
    m, n = len(aln), len(aln_cols)       # m sequences, n columns

    # indices of columns where '-' count is below threshold
    match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold]

    # state names
    k = len(match_cols)   # k states
    states_ = [('M'+ str(i), 'D' + str(i), 'I' + str(i)) for i in range(1, k + 1)]
    states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E']

    # building matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states) 

    for seq in aln:  # iterate through each sequence
        state_ix = 0
        last_state = 'S'
        for i in range(n):
            if i in match_cols:
                state_ix += 1
                if seq[i] != '-':
                    current_state = 'M' + str(state_ix)
                    emissions.loc[current_state, seq[i]] += 1
                else:
                    current_state = 'D' + str(state_ix)
                
                transitions.loc[last_state, current_state] += 1
                last_state = current_state
            
            elif seq[i] != '-':
                current_state = 'I' + str(state_ix)
                transitions.loc[last_state, current_state] += 1
                emissions.loc[current_state, seq[i]] += 1
                last_state = current_state
                            
        transitions.loc[last_state, 'E'] += 1

    # normalize rows
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

    return transitions, emissions
Example #12
    def agg(self):
        dframe = DataFrame(index=[0])

        columns = [
            Series([col]) for col in [self.column.sum(), len(self.column)]]

        dframe = self._build_dframe(dframe, columns)
        dframe = DataFrame([dframe.sum().to_dict()])

        return self._add_calculated_column(dframe)
Example #13
    def agg(self):
        dframe = DataFrame(index=self.column.index)

        dframe = self._build_dframe(dframe, self.columns)
        column_names = [self._name_for_idx(i) for i in xrange(0, 2)]
        dframe = dframe.dropna(subset=column_names)

        dframe = DataFrame([dframe.sum().to_dict()])

        return self._add_calculated_column(dframe)
def numpy_dot():
    '''
    Imagine a point system in which each country is awarded 4 points for each
    gold medal,  2 points for each silver medal, and one point for each
    bronze medal.

    Using the numpy.dot function, create a new dataframe called
    'olympic_points_df' that includes:
        a) a column called 'country_name' with the country name
        b) a column called 'points' with the total number of points the country
           earned at the Sochi olympics.

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'country_name':countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}


    df = DataFrame(olympic_medal_counts)



    gold_points = df[['gold']].applymap(lambda x: x*4)
    silver_points = df[['silver']].applymap(lambda x: x*2)
    bronze_points = df[['bronze']].applymap(lambda x: x*1)

    medal_points = DataFrame({'gold_points': gold_points.iloc[:, 0],
                              'silver_points': silver_points.iloc[:, 0],
                              'bronze_points': bronze_points.iloc[:, 0]})

    medal_sums = medal_points.sum(axis=1)

    #instructors solution
    #medal_counts = df[['gold', 'silver', 'bronze']]
    #points = numpy.dot(medal_counts, [4, 2, 1])

    olympic_points_df = DataFrame({'country_name': countries,
                                   'points': medal_sums})

    return olympic_points_df
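# For comparison, a short sketch of the numpy.dot route the docstring asks for (essentially
# the instructor's solution quoted in the comments above), assuming the same df as built in
# numpy_dot(); the helper name is made up for illustration.
import numpy as np
from pandas import DataFrame

def olympic_points_with_dot(df):
    # weight gold/silver/bronze by 4/2/1 in a single matrix-vector product
    points = np.dot(df[['gold', 'silver', 'bronze']], [4, 2, 1])
    return DataFrame({'country_name': df['country_name'], 'points': points})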
Example #15
    def test_transpose_empty_preserves_datetimeindex(self):
        # GH#41382
        df = DataFrame(index=DatetimeIndex([]))

        expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None)

        result1 = df.T.sum().index
        result2 = df.sum(axis=1).index

        tm.assert_index_equal(result1, expected)
        tm.assert_index_equal(result2, expected)
Example #16
 def test_nan_int_timedelta_sum(self):
     # GH 27185
     df = DataFrame(
         {
             "A": Series([1, 2, NaT], dtype="timedelta64[ns]"),
             "B": Series([1, 2, np.nan], dtype="Int64"),
         }
     )
     expected = Series({"A": Timedelta(3), "B": 3})
     result = df.sum()
     tm.assert_series_equal(result, expected)
Example #17
def uncondExact2x2DF(df: pd.DataFrame, **kwargs) -> pd.Series:
    from rpy2.robjects.packages import importr

    assert df.shape == (2, 2), "Input dataframe must be of shape 2x2"
    exact2x2 = importr("exact2x2")
    c1 = int(df.iloc[0, 0])
    c2 = int(df.iloc[1, 0])
    n1, n2 = [int(x) for x in df.sum(axis=1)]

    res_d = exact2x2.uncondExact2x2(c1, n1, c2, n2, **kwargs)

    return pd.Series(res_d)
    def show_ordered_ssm (
            self,
            df:pd.DataFrame, 
            name:str = 'NotProvided', 
            ticks_interval:int =10, 
            figsize:Tuple[int,int] =(10,9), 
            isAnnot:bool=False
        ) -> pd.DataFrame:
    
        """ 
        Get the Self-Similarity Matrix in an ordered form, where the first column has the highest sum.

        Parameters
        ----------
            :param df : The Self-Similarity Matrix in pandas DataFrame format, where indices and columns are the same, i.e. caseIDs.
            :param name : The name to be shown on the plot title. (default: NotProvided).
            :param ticks_interval : The interval of ticks for the Self-Similarity heatmap.
            :param figsize : Figure size of the Heatmap plot (default: (10, 9)).
            :param isAnnot : True, will annotate each cell with its similarity value (default: False).

        Plots
        -----
            Heatmap : heatmap of the ordered Self-Similarity Matrix.

        Returns
        -------
            DataFrame : Ordered Self-Similarity Matrix. NaN represents that a caseID was not compared for the similarity.
        """

        ordered_series = df.sum( axis=1).sort_values( ascending=False)
        lis = ordered_series.index.tolist()

        df_1 = df[lis]
        df_temp = df_1.reindex(lis)

        plt.figure( figsize=figsize)

        ax = sns.heatmap(
            df_temp, 
            cmap='viridis', 
            xticklabels=ticks_interval, 
            yticklabels=ticks_interval, 
            fmt='g', 
            annot=isAnnot, 
            annot_kws={'size': 9}
        )
        
        ax.invert_xaxis()

        plt.yticks(rotation=0) 
        plt.title('Self-Similarity Matrix (ordered) for : '+name)

        return df_temp
Example #19
def filter_empty_trajectories(data: pandas.DataFrame) -> pandas.DataFrame:
    # If the input files have whitespace characters in a line, they'll be imported as additional trajectories with 0% at every timepoint.
    # So basically remove any trajectories which are 0% at all timepoints.

    # Need to first isolate the columns dedicated to the experimental timepoints
    row_totals = data.sum(axis=1)
    index_to_drop = row_totals[row_totals == 0.0].index

    index_to_keep = [i for i in data.index if i not in index_to_drop]
    data = data.reindex(index_to_keep)

    return data
Example #20
class DetectionEvaluation(BaseEvaluation):
    """
    DetectionEvaluations have a different number of predictions from the
    number of ground truth annotations. An example would be detecting lung
    nodules in a CT volume, or malignant cells in a pathology slide.
    """
    def merge_ground_truth_and_predictions(self):
        self._cases = concat(
            [self._ground_truth_cases, self._predictions_cases],
            keys=["ground_truth", "predictions"])

    def cross_validate(self):
        expected_keys = set(self._ground_truth_cases[self._join_key])
        submitted_keys = set(self._predictions_cases[self._join_key])

        missing = expected_keys - submitted_keys
        if missing:
            self._raise_missing_predictions_error(missing=missing)

        extra = submitted_keys - expected_keys
        if extra:
            self._raise_extra_predictions_error(extra=extra)

    def score(self):
        cases = set(self._ground_truth_cases[self._join_key])

        self._case_results = DataFrame()

        for idx, case in enumerate(cases):
            self._case_results = self._case_results.append(self.score_case(
                idx=idx,
                case=self._cases.loc[self._cases[self._join_key] == case],
            ),
                                                           ignore_index=True)
        self._aggregate_results = self.score_aggregates()

    def score_aggregates(self):
        aggregate_results = super().score_aggregates()

        totals = self._case_results.sum()

        for s in totals.index:
            aggregate_results[s]["sum"] = totals[s]

        tp = aggregate_results["true_positives"]["sum"]
        fp = aggregate_results["false_positives"]["sum"]
        fn = aggregate_results["false_negatives"]["sum"]

        aggregate_results["precision"] = tp / (tp + fp)
        aggregate_results["recall"] = tp / (tp + fn)
        aggregate_results["f1_score"] = 2 * tp / ((2 * tp) + fp + fn)

        return aggregate_results
Example #21
 def test_reduce_mixed_frame(self):
     # GH 6806
     df = DataFrame({
         "bool_data": [True, True, False, False, False],
         "int_data": [10, 20, 30, 40, 50],
         "string_data": ["a", "b", "c", "d", "e"],
     })
     df.reindex(columns=["bool_data", "int_data", "string_data"])
     test = df.sum(axis=0)
     tm.assert_numpy_array_equal(test.values,
                                 np.array([2, 150, "abcde"], dtype=object))
     tm.assert_series_equal(test, df.T.sum(axis=1))
Example #22
def add_total_row(df: pd.DataFrame) -> pd.DataFrame:
    """
    Добавляет строчку с total суммами по столбцам к итоговой таблице
    :param df: финальный датафрейм с тремя конкатенированными таблицами
    :return: датафрейм с добавленной строкой сумм
    """
    sumrow = pd.DataFrame(columns=df.columns)
    indx = sumrow.index
    sumrow = sumrow.append(df.sum(numeric_only=True), ignore_index=True)
    sumrow.index = indx.union(["Total"])
    df = pd.concat([sumrow, df])
    return df
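# DataFrame.append was removed in pandas 2.0; a sketch of an equivalent total row built with
# pd.concat only (same assumptions about df as above; the function name is made up).
def add_total_row_concat(df: pd.DataFrame) -> pd.DataFrame:
    sums = df.sum(numeric_only=True)
    sumrow = pd.DataFrame([sums], index=["Total"]).reindex(columns=df.columns)
    return pd.concat([sumrow, df])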
Example #23
    def _get_priors_and_counts(self, df: pd.DataFrame) -> (float, float, int, int):

        sums = df.sum(axis=0)
        total_success_count = sums['success_case_count']
        total_non_success_count = sums['non_success_count']
        total = total_success_count + total_non_success_count

        # P(success) & P(not success)
        p_success = total_success_count / total
        p_non_success = total_non_success_count / total

        return p_success, p_non_success, total_success_count, total_non_success_count
Example #24
def merge_profiles(name, output):

    while True:

        n = GetRunningTasks(name)
        if n == 0:
            print("%s, Merging profiles ... " % current_time())
            files = glob.glob("%s/*/*gene_abund.tab" % (output))

            if files:
                files = sorted(files)
                dict_merge = {}
                for f in files:
                    with open(f) as handle:
                        for line in islice(handle, 1, None):
                            line = line.strip().split("\t")
                            k_map = line[1]
                            k_RNA = f.split("/")[-2]
                            count = float(line[8])  # TPM
                            if k_map in dict_merge:
                                dict_merge[k_map][k_RNA] = count

                            else:
                                tmp_dic = {}
                                tmp_dic[k_RNA] = count
                                dict_merge[k_map] = tmp_dic

                df = DataFrame(dict_merge).T
                df = df.fillna(value=0)  ### fill NA to 0
                df_sum = DataFrame(df.sum(axis=1), columns=['sum'])
                df = df.join(df_sum)
                df = df.sort_values(by="sum", ascending=False)  ### sort by sum
                df.drop(['sum'], axis=1, inplace=True)
                merge_out = os.path.join(output, "merge_gene_TPM.txt")
                df.to_csv(merge_out,
                          sep="\t",
                          header=True,
                          index=True,
                          index_label="gene",
                          float_format="%.2f")
                return merge_out

            else:
                print(
                    "\n### Merge profiles failed: no *gene_abund.tab files exist in %s/*/ \n"
                    % (output))
                exit(1)
        else:
            print("%s, Waitiing for task finished, remaining %d tasks" %
                  (current_time(), n))
            time.sleep(10)
Example #25
def compute_countries_sto_multipliers(years: List[int], countries: List[str],
                                      sto_inflows_df: pd.DataFrame,
                                      ror_inflows_df: pd.DataFrame,
                                      ror_capacity_ds: pd.Series) -> pd.Series:
    """
     Computing STO multipliers mapping cell runoff to approximated hourly-sampled reservoir inflows.

     Parameters
     ----------

     years: List[int]
        List of years.
     countries: List[str]
        ISO codes of the countries for which we want to obtain STO multipliers.
     sto_inflows_df: pd.DataFrame
        Data frame with STO (GWh) inflow time series for each geographical unit across the time horizon considered.
     ror_inflows_df: pd.DataFrame
        Data frame with ROR (p.u.) capacity factors for each geographical unit across the time horizon considered.
     ror_capacity_ds: pd.Series
        Series with ROR hydro capacities (GW) for each geographical unit considered.

     Returns
     -------
     sto_multipliers_ds: pd.Series
         STO multipliers per country.
     """

    # Compute yearly per country ror electricity production
    ror_inflows_yearly = ror_inflows_df.groupby(
        ror_inflows_df.index.year).sum()
    ror_production_yearly = ror_inflows_yearly.multiply(
        ror_capacity_ds.dropna(), axis=1).transpose()
    ror_production_yearly_per_country = ror_production_yearly.groupby(
        ror_production_yearly.index.str[:2]).sum()

    # Get total hydro-electric production and remove ROR production to get STO production
    sto_production_yearly_per_country = get_hydro_production(
        years=years, countries=countries)
    countries_with_ror = set(countries).intersection(
        set(ror_production_yearly_per_country.index))
    sto_production_yearly_per_country.loc[countries_with_ror] -= \
        ror_production_yearly_per_country.loc[countries_with_ror]
    # For some countries (like LV and IE), computed ROR potential is bigger than the Eurostat total hydro generation
    # leading to negative STO production values so we clip it.
    sto_production_per_country = sto_production_yearly_per_country.clip(
        lower=0.).sum(axis=1)

    sto_inflows_per_country = sto_inflows_df.sum().groupby(
        sto_inflows_df.columns.str[:2]).sum()
    sto_multipliers_ds = sto_production_per_country / sto_inflows_per_country

    return sto_multipliers_ds
Example #26
def weights_sum_to_one(weights: pd.DataFrame):
    sum_weights = weights.sum(axis=1)
    sum_weights[sum_weights==0.0] = 0.0001
    weight_multiplier = 1.0 / sum_weights
    weight_multiplier_array = np.array([weight_multiplier]*len(weights.columns))
    weight_values = weights.values

    normalised_weights_np = weight_multiplier_array.transpose() * weight_values
    normalised_weights = pd.DataFrame(normalised_weights_np,
                                      columns = weights.columns,
                                      index = weights.index)

    return normalised_weights
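# A quick usage sketch on a hypothetical two-asset weights frame; the 0.0001 floor above keeps
# the all-zero row from causing a division by zero.
import numpy as np
import pandas as pd

weights = pd.DataFrame({'asset_a': [0.2, 0.0], 'asset_b': [0.2, 0.0]},
                       index=pd.date_range('2021-01-01', periods=2))
print(weights_sum_to_one(weights))
# first row is rescaled to 0.5 / 0.5; the all-zero row stays at 0.0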
Example #27
def get_tail_labels(df: pd.DataFrame, ql=[0.03, 1.]) -> list:
    """
    Find the underrepresented targets.
    Underrepresented targets are those which are observed less often than the median occurrence.
    Targets beyond a quantile limit are filtered.
    """
    irlbl = df.sum(axis=0)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) &
                  ((irlbl < irlbl.quantile(ql[1])))]  # Filtering
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    tail_labels = irlbl[irlbl > threshold_irlbl].index.tolist()
    return tail_labels
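# A worked illustration on a hypothetical multilabel indicator frame (column sums 10, 10, 5, 2 and 1).
import pandas as pd

y = pd.DataFrame({
    'a': [1] * 10,
    'b': [1] * 10,
    'c': [1] * 5 + [0] * 5,
    'd': [1] * 2 + [0] * 8,
    'e': [1] * 1 + [0] * 9,
})
# the 3% quantile drops 'e', the upper quantile drops 'a' and 'b';
# of the remaining labels only 'd' exceeds the median imbalance ratio
print(get_tail_labels(y))   # ['d']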
Example #28
def test_sum_timedelta64_skipna_false():
    # GH#17235
    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
    arr[-1, -1] = "Nat"

    df = DataFrame(arr)

    result = df.sum(skipna=False)
    expected = Series([pd.Timedelta(seconds=12), pd.NaT])
    tm.assert_series_equal(result, expected)

    result = df.sum(axis=0, skipna=False)
    tm.assert_series_equal(result, expected)

    result = df.sum(axis=1, skipna=False)
    expected = Series([
        pd.Timedelta(seconds=1),
        pd.Timedelta(seconds=5),
        pd.Timedelta(seconds=9),
        pd.NaT,
    ])
    tm.assert_series_equal(result, expected)
def order_optimum(df: DataFrame, k1: Union[int, float], k2: Union[int, float],
                  z: Union[int, float], depth: int):
    """
    :param df: DataFrame, stock moving data
    :param k1: int or float, in case of perishable product - storage cost
        + purchase price, else - only storage cost
    :param k2: int or float, selling price
    :return: int or float, optimal order quantity
    """
    if depth == 1:
        return stock_optimum(df, k1, k2) - z

    return stock_optimum(df, k1, k2) - z + depth * df.sum(axis=1).mean()
    def test_stale_cached_series_bug_473(self):

        # this is chained, but ok
        with option_context('chained_assignment', None):
            Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
                          columns=('e', 'f', 'g', 'h'))
            repr(Y)
            Y['e'] = Y['e'].astype('object')
            Y['g']['c'] = np.nan
            repr(Y)
            result = Y.sum()  # noqa
            exp = Y['g'].sum()  # noqa
            assert pd.isna(Y['g']['c'])
Example #31
def gauge_chart_histogram_cross(responses, categories):
	datVals = str(dic['Data_values'][responses.name==dic['Spring_2017_Question_Code']].values[0]).split(';')
	# Blank values are pulled in as NaN
	if datVals == ['nan']:
		datVals = ['']
	description = responses.describe()
	h = 2*(description['75%']-description['25%'])/np.power(description['count'], 1./3.)
	nbins = int(np.round((description['max']-description['min'])/h))
	bins = np.histogram(responses, nbins)[1]
	stack = DataFrame(columns=np.arange(nbins), index=categories.columns)
	for i in range(len(stack)):
		stack.iloc[i] = np.histogram(responses[categories.iloc[:, i]], bins)[0]
	(100*stack.T/stack.sum(1)).T[::-1].plot(kind='barh', stacked=True, width=1,
		edgecolor='w', legend=False, align='edge', figsize=(12, 6))
	plt.title("\n".join(wrap(str(dic['Question_Text']\
			[responses.name==dic['Spring_2017_Question_Code']].values[0]), 88)), size='medium')
	for i in range(len(stack.T)):
		plt.axvline(np.cumsum((stack.T/stack.sum(1)).T.iloc[0])[::-1][i]*100, color='lightgray')
	plt.axis('tight')
	plt.xticks(np.arange(0, 101, 10), ('%i%% '*11 % tuple(np.arange(0, 101, 10))).split())
	plt.legend(bbox_to_anchor=(0.55, -0.05, 0.5, 0), ncol=len(datVals), fontsize='small')
	plt.subplots_adjust(left=0.18, right=0.92)
Example #32
def check_lineage(genotypes: pandas.DataFrame, lineages: pandas.Series):

    ancestor = get_ancestor_series(genotypes, 0.97)
    genotypes = genotypes[[
        i for i in genotypes.columns if i not in ancestor.index
    ]]
    frequencies = genotypes.sum()

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.plot(frequencies)

    plt.show()
def _compute_quantile_accuracies(heatmap: pd.DataFrame) -> Tuple[float, float]:
    """Computes the accuracy within 1st and 2nd quantile."""
    # TODO(): Add overall accuracy result.
    # Create filters to calculate accuracy within 1st and 2nd quantile.
    mask_1st_quantile = (np.eye(*heatmap.shape) + np.eye(*heatmap.shape, k=1) +
                         np.eye(*heatmap.shape, k=-1))
    mask_2nd_quantile = (mask_1st_quantile + np.eye(*heatmap.shape, k=2) +
                         np.eye(*heatmap.shape, k=-2))
    # Calculate accuracy.
    all_sum = heatmap.sum().sum()
    accuracy_1st_quantile = (heatmap * mask_1st_quantile).sum().sum() / all_sum
    accuracy_2nd_quantile = (heatmap * mask_2nd_quantile).sum().sum() / all_sum
    return accuracy_1st_quantile, accuracy_2nd_quantile
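# A small sketch on a hypothetical 3x3 heatmap: the first mask keeps the diagonal plus the two
# adjacent off-diagonals, the second widens that band by one more step.
import numpy as np
import pandas as pd

heatmap = pd.DataFrame([[5, 1, 0],
                        [1, 6, 1],
                        [2, 1, 7]])
acc1, acc2 = _compute_quantile_accuracies(heatmap)
print(acc1, acc2)   # 22/24 of the mass lies within one step of the diagonal; acc2 covers all of it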
Example #34
def left_right():
    low, high, n = -1 << 10, 1 << 10, 1 << 20
    left = DataFrame(np.random.randint(low, high, (n, 7)),
                     columns=list("ABCDEFG"))
    left["left"] = left.sum(axis=1)

    # one-2-one match
    i = np.random.permutation(len(left))
    right = left.iloc[i].copy()
    right.columns = right.columns[:-1].tolist() + ["right"]
    right.index = np.arange(len(right))
    right["right"] *= -1
    return left, right
Example #36
 def c_input(self, o_iot: pd.DataFrame, q_iot: pd.DataFrame, p_tau: float):
     factor = 1 / (1 - (o_iot.sum(axis=0) / q_iot) * p_tau)
     A = np.array([
         self.indicator("q", i) -
         (factor.loc[i] *
          (np.sum([self.indicator("d", j, i) for j in Sector], axis=0) +
           self.indicator("x", M.I, i) + self.indicator("xtilde", M.L, i) +
           self.indicator("xtilde", M.K, i))) for i in Sector
     ])
     # normalization
     normalization = np.array([1 / q_iot.loc[i] for i in Sector])
     A = np.multiply(A, normalization[:, None])
     self.update_constraint(Bound(None, None, A, np.zeros(A.shape[0])))
Example #37
    def operate(cls, value: DataFrame) -> Series:
        """
        Apply the sum operation across a DataFrame's columns.

        :param value: The DataFrame of distributions to sum across.
        """
        if isinstance(value, DataFrame):
            result = value.sum(axis=1)
            names_csv = ', '.join(value.columns.to_list())
            result.name = f'sum({names_csv})'
            return result
        else:
            raise TypeError('value for Sum aggregator must be DataFrame')
def weighted_average_of_impurity(df: pd.DataFrame, impurity_func: Callable[[pd.Series], float]) -> float:
    col_sum = df.sum(axis=1)                    # weight of each row: its total count
    col_gini = df.apply(impurity_func, axis=1)  # impurity of each row
    res = sum(col_gini * col_sum / sum(col_sum))
    print('<<<<Frame Gini>>>>')
    print('Sum of Columns')
    print(col_sum)
    print('Gini of Columns')
    print(col_gini)
    print('Result')
    print(res)

    return res
Example #39
    def _normalise(df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalises dataframe

        Args:
            df: Raw dataframe

        Returns:
            Normalised dataframe
        """
        norm_df = df / df.sum(axis=0)
        norm_df.fillna(1.0 / df.shape[0], inplace=True)
        return norm_df
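# The normalisation itself is just column-wise division by the column sums, with all-zero
# columns falling back to a uniform distribution; a standalone sketch of that behaviour.
import pandas as pd

df = pd.DataFrame({'a': [1, 3], 'b': [0, 0]})
norm = df / df.sum(axis=0)
norm = norm.fillna(1.0 / df.shape[0])
print(norm)   # column 'a' becomes [0.25, 0.75]; the all-zero column 'b' becomes [0.5, 0.5]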
Example #40
def get_deseq2_stats(df: pd.DataFrame,
                     subsets: List[List[T]],
                     min_total_row_count: int = 0) -> pd.DataFrame:
    """Use the R bioconductor package 'limma' to perform a differential
    expression analysis of count like data (e.g. miRNA). See package
    documentation for more details.
    :param df: Matrix of counts, where each column is a sample and each row
    a feature.
    :param subsets: The two subsets to compare with each other.
    :param min_total_row_count: Drop rows that have fewer than
        min_total_row_count reads in total
    :return: Results of the analysis in form of a Dataframe (p, logFC, ...)
    """
    logger.debug("Computing deseq2 stats")
    if len(subsets) != 2:
        error = "This method currently only supports exactly two " \
                "subsets as this is the most common use case. Support " \
                "for more subsets will be added later."
        logger.exception(error)
        raise ValueError(error)
    # flatten subset
    flattened_subsets = [x for subset in subsets for x in subset]
    # discard columns that are not in a subset
    df = df[flattened_subsets]
    # filter rows with too few reads
    total_row_counts = df.sum(axis=1)
    keep = total_row_counts[total_row_counts >= min_total_row_count].index
    df = df.loc[keep]
    # pandas df -> R df
    r_count_data = pandas2ri.py2ri(df)
    # py2ri is stupid and makes too many assumptions.
    # These two lines restore the column order
    r_count_data.colnames = list(OrderedDict.fromkeys(flattened_subsets))
    r_count_data = r_count_data.rx(robj.StrVector(flattened_subsets))

    # see package documentation
    condition = ['s{}'.format(i) for i, subset in enumerate(subsets)
                 for _ in subset]
    r_condition = robj.FactorVector(robj.StrVector(condition))
    r_col_data = r['DataFrame'](condition=r_condition)
    r_design = robj.Formula('~ condition')
    r_design.environment['condition'] = r_condition
    r_dds = r['DESeqDataSetFromMatrix'](r_count_data, r_col_data, r_design)
    r_dds = r['DESeq'](r_dds, parallel=True)
    r_res = r['results'](r_dds)

    # R result table to Python pandas
    r_res = r['as.data.frame'](r_res)
    results = pandas2ri.ri2py(r_res)
    results.insert(0, 'feature', list(r['row.names'](r_res)))
    return results
def plot(tvec: pd.DataFrame, data: pd.DataFrame, combineZones: bool):
    '''
    Function to plot the data
    :param tvec: Time vector for all the data measurements
    :param combineZones: Whether or not to sum all the zones into 1 value
    :param data: The data to plot
    :return: Does not return anything, will show a plot on the screen.
    '''
    if tvec.empty or data.empty:
        raise Exception("The dataset is empty")
    if isinstance(tvec, pd.Index):
        x = tvec
        xLabel = f"Time ({x.name})"
    else:
        x = pd.to_datetime(tvec)
        xLabel = "Time (default)"
    if combineZones:
        if len(data) < 25:
            plt.bar(x, data.sum(axis=1))
        else:
            plt.plot(x, data.sum(axis=1))
        plt.xlabel(xLabel)
        plt.ylabel("Consumption (Wh)")
        plt.title("Energy consumption (all zones)")
        plt.show()
    else:
        fig, axs = plt.subplots(2, 2, figsize=(16, 10))
        axs = axs.flatten()
        for index, column in enumerate(data):
            if len(data) < 25:
                axs[index].bar(x, data[column])
                axs[index].set(xlabel=xLabel, ylabel="Consumption (Wh)")
                axs[index].set_title(f"Energy consumption ({column})")
            else:
                axs[index].plot(x, data[column])
                axs[index].set(xlabel=xLabel, ylabel="Consumption (Wh)")
                axs[index].set_title(f"Energy consumption ({column})")
        plt.show()
Example #42
def _get_fovdf(dbcon, evalset: str, min_ps_per_fov: int, min_fovs_per_p: int):
    # get fov metadata
    fovmetas = read_sql_query(
        f"""
            SELECT  "fovname", "participants_{evalset}" AS "participants"
            FROM "fov_meta"
            WHERE "participants_{evalset}" NOT NULL
        ;""", dbcon)
    fovnames = list(set(fovmetas.loc[:, 'fovname']))
    fovmetas.index = fovmetas.loc[:, 'fovname']

    # init dataframe of fovs and who annotated them
    fovdf = DataFrame(0, index=fovnames, columns=ir.NPs)
    for fovname, row in fovmetas.iterrows():
        for p in row['participants'].split(','):
            if p in ir.NPs:
                fovdf.loc[fovname, p] = 1

    # only keep participants and fovs if > a certain threshold
    fovdf = fovdf.loc[:, fovdf.sum(axis=0) >= min_fovs_per_p]
    fovdf = fovdf.loc[fovdf.sum(axis=1) >= min_ps_per_fov, :]

    return fovdf, fovnames
Example #43
 def c_demand(self, p_delta: Mapping[Tuple[Sector, FinalUse], float],
              ytilde_iot: pd.DataFrame):
     const = np.array([
         np.sum([p_delta[i, u] * ytilde_iot.loc[i, u] for u in FinalUse])
         for i in Sector
     ])
     A = np.array([self.indicator("y", i) for i in Sector])
     normalization = np.array(
         [1 / ytilde_iot.sum(axis=1).loc[i] for i in Sector])
     const = np.multiply(
         const,
         normalization)  # if normalized, should all be 1 if p_delta is 1
     A = np.multiply(A, normalization[:, None])
     return Bound(A, const, None, None)
Example #44
 def getDF(self):
     # Converts to data frame
     col = ["ER+", "ER-", "Control"]
     keys = self.setKeys()
     keys.insert(0, "Total")
     ret = DataFrame(zeros((len(keys), len(col)), dtype=int),
                     columns=col,
                     index=keys)
     for k in keys:
         ret.loc[k, "ER+"] = self.pos.count(k)
         ret.loc[k, "ER-"] = self.neg.count(k)
         ret.loc[k, "Control"] = self.control.count(k)
     ret.loc["Total"] = ret.sum()
     return ret
Example #45
def assert_melt(df: pd.DataFrame, eval_metric: str = "percent_strong") -> None:
    pair_ids = set_pair_ids()
    df = df.loc[:, [pair_ids[x]["index"] for x in pair_ids]]
    index_sums = df.sum().tolist()

    assert_error = "Stop! The eval_metric provided in 'metric_melt()' is incorrect!"
    assert_error = "{err} This is a fatal error providing incorrect results".format(
        err=assert_error)
    if eval_metric == "percent_strong":
        assert index_sums[0] != index_sums[1], assert_error
    elif eval_metric == "precision_recall":
        assert index_sums[0] == index_sums[1], assert_error
    elif eval_metric == "grit":
        assert index_sums[0] == index_sums[1], assert_error
Example #46
class I8Merge(object):

    params = ['inner', 'outer', 'left', 'right']
    param_names = ['how']

    def setup(self, how):
        low, high, n = -1000, 1000, 10**6
        self.left = DataFrame(np.random.randint(low, high, (n, 7)),
                              columns=list('ABCDEFG'))
        self.left['left'] = self.left.sum(axis=1)
        self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1)
        self.right = self.right.reset_index(drop=True)
        self.right['right'] *= -1

    def time_i8merge(self, how):
        merge(self.left, self.right, how=how)
def generate_probability_vector_result(output_path):

    cluster_frame = pd.read_csv(output_path + '/clusters.csv', header=None)
    cluster_frame = cluster_frame.set_index(cluster_frame.iloc[:, 0]).iloc[:, 1:]
    cluster_array = cluster_frame.values

    points_frame = pd.read_csv(output_path + '/points.csv', header=None)
    # points_frame = points_frame.drop_duplicates()
    points_array = points_frame.values

    distance_matrix = pw.euclidean_distances(cluster_array, points_array)
    distance_matrix = distance_matrix.T
    distance_frame = DataFrame(distance_matrix)
    # print(distance_frame)
    # print(distance_frame.sum(axis=1))
    distance_frame = distance_frame.div(distance_frame.sum(axis=1), axis=0)
    distance_frame.to_csv(output_path + '/probability.csv')
Example #48
def edbSave():
    'Fetch the EDB codes from the clipboard and call the API to retrieve the data for those EDB indicators'

    # get the codes from the clipboard along with the start and end dates entered by the user
    codes = getCodeFromClipboard()
    start = sDate()
    end = eDate()

    data = w.edb(codes, start, end, "Fill=Previous")
    datachg = [d.strftime('%y-%m-%d') for d in data.Times]
    df = DataFrame(data.Data, index=data.Codes, columns=datachg).T
    print('-' * 85)
    print(df)
    print('-' * 85)
    print('Summary statistics:')
    print(df.describe())
    print("sum", " " * 3, str(df.sum()).split(sep="    ")[1].rjust(10))
    return df
def calc_happiness(order, guest_dict):
    df = DataFrame(columns=order, index=order)

    for idx,guest in enumerate(order[:-1]):
        # print "{} -> {}: {}".format(
        #     guest,
        #     order[idx+1],
        #     gd[guest][order[idx+1]]
        #     )

        df.loc[order[idx+1], guest] = guest_dict[guest][order[idx+1]]
        df.loc[guest, order[idx+1]] = guest_dict[order[idx+1]][guest]

    df.loc[order[-1], order[0]] = guest_dict[order[0]][order[-1]]
    df.loc[order[0], order[-1]] = guest_dict[order[-1]][order[0]]


    return df.sum().sum()
def predict_random_category(y_test, n=1000):
    """ Uses boostrapping to compute the expected prediction by chance for each category.
    Parameters:
        y_test (array): Labels
        n (int): the number of times randomize.
    Returns:
        Series containg the accuracy for each class.."""

    # Create a data frame with random predictions.
    random_ = DataFrame({i: shuffle_predict(y_test) for i in range(n)})
    random_.index = y_test

    # Calculate the chance-level accuracy per class.
    random_ = 1.0 * random_.sum(axis=1) / n
    grouped = random_.groupby(level=0)
    mean = grouped.mean() * 100.0
    sd = grouped.std() * 100.0
    return mean, sd
def two_column_summary(df, index, column, do_totals=True, do_prob=False):
    """returns a DataFrame contingency (frequency) summary table for two columns.

    arguments:
    df        -- input DataFrame
    index     -- the column used to summarize vertically (rows)
    column    -- the column used to summarize horizontally (columns)
    do_totals -- adds a row and a column with the totals along each axis
    do_prob   -- return probabilities instead of frequencies
    """

    # test input
    if column not in df.columns:
        raise ValueError("[two_column_summary] '%s' is not a valid column name" % column)
    if index not in df.columns:
        raise ValueError("[two_column_summary] '%s' is not a valid column name" % index)

    # group for each column
    unique_col_values = df[column].unique()
    cols = []
    for v in unique_col_values:
        mask = df[column]==v
        cols.append(df[mask].groupby(index))

    # glue groups back together
    df_summary = DataFrame()
    for idx, c in enumerate(cols):
        d = c.count()
        d.columns = [unique_col_values[idx]]
        df_summary = pandas.concat([df_summary, d], axis=1)

    # add total
    if do_totals:
        df_summary['total']    = df_summary.apply(sum, axis=1)
        df_summary.loc['total'] = df_summary.sum()

    # convert frequencies into probabilities
    if do_prob:
        df_summary = df_summary/df_summary.loc['total', 'total']

    return df_summary
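# A minimal usage sketch with a small hypothetical frame (one 'size'/'color' observation per combination).
import pandas
from pandas import DataFrame

df = DataFrame({'size':  ['S', 'S', 'M', 'M', 'L', 'L'],
                'color': ['red', 'blue', 'red', 'blue', 'red', 'blue']})
print(two_column_summary(df, index='size', column='color'))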
 def plot_centre_crowd(self, thresh=2, show_threads=False, **kwargs):
     """Plotting evolution of number of participants close to centre"""
     project, show, _ = ac.handle_kwargs(**kwargs)
     data = self.__get_centre_distances(thresh, split=False)
     data_close = DataFrame({
         '6 hours': data[data <= .25].count(axis=1),
         '12 hours': data[(data <= .5) & (data > .25)].count(axis=1),
         '24 hours': data[(data <= 1) & (data > .5)].count(axis=1)},
                            columns=['6 hours', '12 hours', '24 hours'])
     plt.style.use(SETTINGS['style'])
     y_max = data_close.sum(axis=1).max()
     _, axes = plt.subplots()
     data_close.plot(kind="area", ax=axes, stacked=True,
                     color=['darkslategray', 'steelblue', 'lightgray'])
     axes.set_yticks(range(1, y_max + 1))
     axes.set_ylabel("Number of participants")
     axes.set_title("Crowd close to the centre of discussion in {}".format(
         project))
     axes.xaxis.set_ticks_position('bottom')
     axes.yaxis.set_ticks_position('left')
     if show_threads:
         self.__show_threads(axes)
     ac.show_or_save(show)
Example #53
    def process(self, start_time: datetime, end_time: datetime, input:DataFrame):
        if str(self.name) not in '+-*/':
            raise ValueError("Unknown math function: " + str(self.name))

        ret = DataFrame()

        # two args means we're doing A + B
        if len(self._args) == 2:
            left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0]
            right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1]

            for l_col in left.columns:
                for r_col in right.columns:
                    if self.name == '+':
                        t = left[l_col] + right[r_col]
                    elif self.name == '-':
                        t = left[l_col] - right[r_col]
                    elif self.name == '*':
                        t = left[l_col] * right[r_col]
                    elif self.name == '/':
                        t = left[l_col] / right[r_col]
                    else:
                        raise ValueError("Unknown operator: " + str(self.name))

                    t = DataFrame(t)
                    t.columns = [l_col + self.name + r_col]

                    print(left.head())
                    print(right.head())
                    print(t.head())
                    ret = ret.combine_first(t)

        else:  # everything is in the input DataFrame
            ret = DataFrame(input.sum(axis=0))
            ret.columns = [' + '.join(input.columns)]

        return ret
Example #54
def fix_event_type(df: DataFrame):
    '''
    Collapse dummy-encoded event columns to one row per id and add a
    num_events column with the total number of events for that id.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''

    a = time.time()

    colsf = df['id'].ravel()            # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []                       # list of unique counts (UNUSED)
    counts_bucket = []                  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)             # create dummy variables
    todrop = df.sum() < 50              # get columns where sum of dummy column < 50
    dropcols = df.columns[todrop]       # get those column names
    df = df.drop(dropcols, axis=1)      # drop those columns
    df['num_events'] = 0                # create number of events columns, set to 0
    # print(df.columns)
    print(str(len(unique)))

    for ii in range(0,len(unique)):     # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]     # subset by that ID
        the_dummies = subset.columns != 'id'        # boolean mask selecting every column except 'id'
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist()  # get all of those columns to list
        event_sum = np.sum(aa)      # sum all of those
        
        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:,subset.columns != 'id'] = aa
        df.loc[subset.index, the_dummies] = aa
        df.loc[subset.index, 'num_events'] = event_sum
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b-a)
    return df
for i, group  in enumerate(Groups):
    TotalCells_Area[SGroup[i]+'_Mean'] = TotalCells_Area[group].mean(axis =1)
    TotalCells_Area[SGroup[i]+'_Values']= TotalCells_Area[group].count(axis=1)
    TotalCells_Area[SGroup[i]+'_Stdev'] = TotalCells_Area[group].std(axis=1)
    TotalCells_Area[SGroup[i]+'_Serror'] = TotalCells_Area[SGroup[i]+'_Stdev']/np.sqrt(TotalCells_Area[SGroup[i]+'_Values'])
## Saving Table

TotalCells.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2  p16 p30 p60 Septotemporal\SGZ\Total_Cells_C57_SW_SGZ.csv')
SGZArea.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2  p16 p30 p60 Septotemporal\SGZ\SGZArea_C57_SW_SGZ.csv')
TotalCells_Area.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2  p16 p30 p60 Septotemporal\SGZ\Total_Cells_Area_C57_SW_SGZ.csv')

TotalCells[Dictionary['C57_p30']]
TotalCells[Dictionary['C57']]
# Calculating Density for C57 and C57_p30
 
TotalCellsSum = TotalCells.sum()
SGZAreaSum = SGZArea.sum()     
Density = TotalCellsSum/SGZAreaSum
Density =  Density[0:13]
DensityTable = Series([Density[0:4].mean(),Density[4:9].mean(),Density[9:13].mean()], index =['C57 P16','C57 P30','C57 P60'])
C57_p16_Error = Density[0:4].std()/np.sqrt(Density[0:4].count())
C57_p30_Error = Density[4:9].std()/np.sqrt(Density[4:9].count())
C57_Error = Density[9:13].std()/np.sqrt(Density[9:13].count())

#Density.to_csv('C:\Users\KBermudez-Hernandez\Documents\Dropbox\Figures\Figure1 p16 p30 p60 density\Prox_density_bar_graph\P16 P30 P60 Prox1 Density.csv')

#Plotting Density Graph
plt.figure()   
DensityTable.plot(kind='bar',yerr=[C57_p16_Error,C57_p30_Error,C57_Error])
plt.ylabel('Density of Prox1 in SGZ')
plt.xticks(rotation=0)
Example #56
print "df.head - first 5 rows"
print df.head()
import os
os.remove(r'./births1880.txt')

uqNames = df['Names'].unique()

print "df['names'].unique()"
print uqNames

print "df.names.describe()"
print df['Names'].describe()

df = df.groupby("Names")  #group by name
print df
df = df.sum() # applys sum to each groupBy obj
print df
#above is equivalent os select sum(births) from df group by names;


Sorted = df.sort(columns="Births", ascending=False)
print Sorted.head(1)

#or
df['Births'].max()

#Create Graph
df['Births'].plot(kind="bar")

print "The most popular name"
df.sort(columns='Births', ascending=False)
Example #57
class Scores(object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------

        """
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )

        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=labels,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):

        super(Scores, self).__init__()

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index(
                [s + (t, ) for s, t in annotation.itertracks()],
                name=names)

        else:
            annotation = Annotation(uri=uri, modality=modality)
            index = MultiIndex(levels=[list() for name in names],
                               codes=[list() for name in names],
                               names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        dtype = float if values is None else values.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline())

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline())

    def itersegments(self):
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """

        return self.annotation_.new_track(segment, candidate=candidate, prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self.dataframe_.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def _reindexIfNeeded(self):

        if not self.hasChanged_:
            return

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        new_index = Index(
            [s + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """Return a copy of the scores in which tracks are renamed by the
        underlying annotation's retrack() method."""

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]
        new_index = Index(
            [s + (t, ) for s, t in annotation.itertracks()],
            name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func, axis=0):
        """Apply `func` along the given axis of the underlying score dataframe."""

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            When False (default), the highest score gets rank 0 and the
            lowest score gets rank N-1.

        Returns
        -------
        rank : `Scores`

        """

        ranked = self.copy()
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            When False (default), the highest score gets rank 0 and the
            lowest score gets rank N-1.

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.NaN)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label that has the highest score.
            If that score is smaller than `threshold`, the label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are interpreted as posterior probabilities in
            open-set identification. The top label is kept only if its
            posterior exceeds both the unknown posterior and `threshold`;
            otherwise the label is replaced with an `Unknown` instance.
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )

        else:

            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True, other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""

        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Remarks
        -------
        In 'intersection' mode, every effort is made to keep the track names
        unchanged. However, when two original segments are cropped into the
        same resulting segment, conflicting track names are modified to make
        sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            keep = [new_annotation.has_track(segment, track)
                    for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                            label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                         label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        from .notebook import repr_scores
        return repr_scores(self)
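
# ---------------------------------------------------------------------------
# Hedged usage sketch (an assumption, not part of the pyannote source above):
# it relies on Segment / Annotation / Unknown being importable in the original
# module, as the class itself assumes. Roughly:
#
#     s = Scores(uri='video', modality='speaker')
#     s[Segment(0, 1), 's1', 'A'] = 0.1
#     s[Segment(0, 1), 's1', 'B'] = 0.8
#     s[Segment(2, 3), 's1', 'A'] = 0.7
#     s[Segment(2, 3), 's1', 'B'] = 0.2
#     best = s.nbest(1)                            # keep only the top label per track
#     hypothesis = s.to_annotation(threshold=0.5)  # scores below 0.5 become Unknown()
# ---------------------------------------------------------------------------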
for i, group in enumerate(Groups):
    TotalCells_Area[SGroup[i]+'_Mean'] = TotalCells_Area[group].mean(axis=1)
    TotalCells_Area[SGroup[i]+'_Values'] = TotalCells_Area[group].count(axis=1)
    TotalCells_Area[SGroup[i]+'_Stdev'] = TotalCells_Area[group].std(axis=1)
    TotalCells_Area[SGroup[i]+'_Serror'] = TotalCells_Area[SGroup[i]+'_Stdev']/np.sqrt(TotalCells_Area[SGroup[i]+'_Values'])
## Saving Table

TotalCells.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\Total_Cells_CreBax_C_H_Hilus.csv')
HilusArea.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\HilusArea_CreBax_C_H_Hilus.csv')
TotalCells_Area.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\Total_Cells_Area_C_H_CreBax_Hilus.csv')



#Calculating Density of All
 
TotalCellsSum = TotalCells.sum()
HilusAreaSum = HilusArea.sum()     
Density = TotalCellsSum/HilusAreaSum

DensityTable = Series(
    [Density[P30_Positive_Tam].mean(), Density[P30_Negative_Tam].mean(),
     Density[P60_Positive_Tam].mean(), Density[P60_Negative_Tam].mean()],
    index=['P30_Positive_Tam', 'P30_Negative_Tam', 'P60_Positive_Tam', 'P60_Negative_Tam'])
P30_Positive_Tam_Error = Density[P30_Positive_Tam].std()/sqrt(Density[P30_Positive_Tam].count())
P30_Negative_Tam_Error = Density[P30_Negative_Tam].std()/sqrt(Density[P30_Negative_Tam].count())
P60_Positive_Tam_Error = Density[P60_Positive_Tam].std()/sqrt(Density[P60_Positive_Tam].count())
P60_Negative_Tam_Error = Density[P60_Negative_Tam].std()/sqrt(Density[P60_Negative_Tam].count())
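
# Hedged aside (helper not in the original script): the four error terms above
# all apply the same standard-error-of-the-mean formula, std / sqrt(n); a small
# hypothetical helper makes that explicit:
def standard_error(values):
    # standard error of the mean for a pandas Series (count() is NaN-aware)
    return values.std() / np.sqrt(values.count())
# e.g. P30_Positive_Tam_Error = standard_error(Density[P30_Positive_Tam])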



#Density.to_csv('C:\Users\KBermudez-Hernandez\Documents\Dropbox\Figures\Figure1 p16 p30 p60 density\Prox_density_bar_graph\P16 P30 P60 Prox1 Density.csv')

#Plotting Density Graph
plt.figure()   
Exemple #59
0
import numpy as np
from pandas import DataFrame, Series

df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.loc['b']  # .ix was removed; .loc selects by label
## Summarizing and Computing Descriptive Statistics
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()  # column sums
df.sum(axis=1)  # row-by-row sums (NaN skipped by default)
df
(7.10 - 4.5)/2  # mean of row 'b' when NaN is skipped
df.mean(axis=1, skipna=False)  # NaN propagates when skipna=False
df
df.idxmax()
df
df.cumsum()  # accumulation
df.describe()  # multiple summary statistics in one shot
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
from pandas_datareader import data as web  # pandas.io.data was removed; pandas-datareader provides the same interface
all_data = {}
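# Hedged sketch (an assumption; the original example is truncated here): with
# pandas_datareader standing in for the removed pandas.io.data module, the
# usual correlation/covariance workflow on returns looks roughly like this,
# using hypothetical tickers:
#     for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
#         all_data[ticker] = web.DataReader(ticker, 'yahoo', '2000-01-01', '2010-01-01')
#     price = DataFrame({tic: d['Adj Close'] for tic, d in all_data.items()})
#     returns = price.pct_change()
#     returns.corr()  # pairwise correlation of the return columns
#     returns.cov()   # pairwise covariance of the return columns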
Exemple #60
-1
def descriptiveStatsDataFrame():
    df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7, 10]],
                   index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
    print(df)
    print('Column Sum: \n{}'.format(df.sum(axis=0)))
    print('Row Sum: \n{}'.format(df.sum(axis=1)))
    print('Do not skip NA: \n{}'.format(df.sum(axis=1, skipna=False)))
    print('Index with min Value: \n{}'.format(df.idxmin()))
    print('Summary Statistic: \n{}'.format(df.describe()))
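
# Hedged usage note (not part of the original snippet): the function takes no
# arguments and only prints its results, so it is simply invoked as:
#     descriptiveStatsDataFrame()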