Code Example #1
def make_USrepresentative_df():
    representative_df = pd.DataFrame()

    df = pd.read_html(URLS['dem_USrepresentative'])[0]
    df.columns = ['county', 'candidate1', 'candidate2',
                  'candidate3', 'candidate4', 'candidate5', 'candidate6']
    df['county'] = df['county'].fillna('') 
    splits = df[df.county.str.startswith('DISTRICT')].index.tolist()
    splits.append(df.shape[0])
    
    for split in range(len(splits) - 1):
        df_ = df.iloc[splits[split]:splits[split+1]]
        df_ = df_.drop(df_.index[0])
        df_.columns = df_.iloc[0]
        df_ = df_.drop(df_.index[0])
        df_.columns = ['county'] + list(df_.columns[1:])
        df_ = df_.dropna(subset=[df_.columns.values[1]])
        df_ = df_.dropna(axis=1)
        
        df_ = pd.melt(df_, id_vars=['county'], value_vars=list(df_.columns[1:]))
        df_.columns = ['county', 'candidate', 'votes']
        df_ = df_[df_['county'] != '']


        df_['party'] = 'Democratic'
        df_['candidate'] = df_['candidate'].str.lstrip('*')
        df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '', regex=True)
        df_['candidate'] = df_['candidate'].str.rstrip('()')
        df_['office'] = 'US Representative'
        representative_df = representative_df.append(df_)

    df = pd.read_html(URLS['rep_USrepresentative'])[0]
    df.columns = ['county', 'candidate1', 'candidate2',
                  'candidate3', 'candidate4', 'candidate5']
    df['county'] = df['county'].fillna('') 
    splits = df[df.county.str.startswith('DISTRICT')].index.tolist()
    splits.append(df.shape[0])
    
    for split in range(len(splits) - 1):
        df_ = df.iloc[splits[split]:splits[split+1]]
        df_ = df_.drop(df_.index[0])
        df_.columns = df_.iloc[0]
        df_ = df_.drop(df_.index[0])
        df_.columns = ['county'] + list(df_.columns[1:])
        df_ = df_.dropna(subset=[df_.columns.values[1]])
        df_ = df_.dropna(axis=1)
        
        df_ = pd.melt(df_, id_vars=['county'], value_vars=list(df_.columns[1:]))
        df_.columns = ['county', 'candidate', 'votes']
        df_ = df_[df_['county'] != '']


        df_['party'] = 'Republican'
        df_['candidate'] = df_['candidate'].str.lstrip('*')
        df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '', regex=True)
        df_['candidate'] = df_['candidate'].str.rstrip('()')
        df_['office'] = 'US Representative'
        representative_df = representative_df.append(df_)

    return representative_df
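Note: the loops above grow the result with DataFrame.append, which has been deprecated and removed in recent pandas releases. An equivalent, version-safe pattern is to collect the per-district frames in a list and concatenate once at the end; the snippet below is a minimal sketch with made-up data, not part of the original function.

import pandas as pd

# Hypothetical stand-ins for the per-district frames (df_) built in the loops above.
pieces = [
    pd.DataFrame({'county': ['Ada'], 'candidate': ['Smith'], 'votes': [10]}),
    pd.DataFrame({'county': ['Boise'], 'candidate': ['Jones'], 'votes': [20]}),
]
# A single concat at the end replaces repeated representative_df.append(df_).
representative_df = pd.concat(pieces, ignore_index=True)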
Code Example #2
File: production.py  Project: ajerneck/thatsfordinner
def most_probable_words(model, vocabulary, num_words):
    """
    Return a DataFrame of the most probable words for each topic,
    given a model, vocabulary, and number of words.
    """
    ## create array of vocabulary, sorted by topic
    ## probabilities, one row for each topic.
    vocab = np.asarray(vocabulary)[np.argsort(model.topic_word_)]
    wp = np.sort(model.topic_word_)

    ## select n most probable words, which are the right-most
    ## columns in the vocab array.
    words = vocab[:, -num_words:-1]

    words = pd.DataFrame(words.T)
    words['rank'] = words.index
    words = pd.melt(words, id_vars='rank')

    word_probs = wp[:, -num_words:-1]
    word_probs = pd.DataFrame(word_probs.T)
    word_probs['rank'] = word_probs.index
    word_probs = pd.melt(word_probs, id_vars='rank')

    ww = words.merge(word_probs, on=['rank', 'variable'])

    ww.columns = ['rank', 'topic', 'word', 'prob']
    return ww
Code Example #3
File: test_study.py  Project: bobbybabra/flotilla
    def test_tidy_splicing_with_expression(self, test_study):
        test = test_study.tidy_splicing_with_expression

        common_id = 'common_id'
        sample_id = 'sample_id'
        event_name = 'event_name'

        splicing_common_id = test_study.splicing.feature_data[
            test_study.splicing.feature_expression_id_col]

        # Tidify splicing
        splicing = test_study.splicing.data
        splicing_index_name = test_study._maybe_get_axis_name(splicing, axis=0)
        splicing_columns_name = test_study._maybe_get_axis_name(splicing, axis=1)

        splicing_tidy = pd.melt(splicing.reset_index(),
                                id_vars=splicing_index_name,
                                value_name='psi',
                                var_name=splicing_columns_name)
        rename_columns = {}
        if splicing_index_name == 'index':
            rename_columns[splicing_index_name] = sample_id
        if splicing_columns_name == 'columns':
            rename_columns[splicing_columns_name] = event_name
            splicing_columns_name = event_name
        splicing_tidy = splicing_tidy.rename(columns=rename_columns)

        # Create a column of the common id on which to join splicing
        # and expression
        splicing_names = splicing_tidy[splicing_columns_name]
        if isinstance(splicing_names, pd.Series):
            splicing_tidy[common_id] = splicing_tidy[
                splicing_columns_name].map(splicing_common_id)
        else:
            splicing_tidy[common_id] = [
                test_study.splicing.feature_renamer(x)
                for x in splicing_names.itertuples(index=False)]

        splicing_tidy = splicing_tidy.dropna()

        # Tidify expression
        expression = test_study.expression.data_original
        expression_index_name = test_study._maybe_get_axis_name(expression, axis=0)
        expression_columns_name = test_study._maybe_get_axis_name(expression, axis=1)

        expression_tidy = pd.melt(expression.reset_index(),
                                  id_vars=expression_index_name,
                                  value_name='expression',
                                  var_name=common_id)
        # This will only do anything if there is a column named "index" so
        # no need to check anything
        expression_tidy = expression_tidy.rename(columns={'index': sample_id})
        expression_tidy = expression_tidy.dropna()

        splicing_tidy.set_index([sample_id, common_id], inplace=True)
        expression_tidy.set_index([sample_id, common_id], inplace=True)

        true = splicing_tidy.join(expression_tidy, how='inner').reset_index()
        
        pdt.assert_frame_equal(test, true)
Code Example #4
File: flask_app.py  Project: supwest/capstone
def get_song_recs(ratings, n_features):

    '''
    Takes user new movie ratings from website user and returns 
    recommended song titles
    '''

    path_to_songs_sf = '/home/cully/Documents/capstone/data/flask_songs_sf'
    path_to_movies_sf = '/home/cully/Documents/capstone/data/flask_movies_sf'
    songs_sf = gl.load_sframe(path_to_songs_sf)
    songs_df = songs_sf.to_dataframe()
    value_vars = [x for x in songs_df.columns if x != 'id']
    ids = [x for x in songs_df.index]
    if 'id' not in songs_df.columns:
        songs_df.insert(0, 'id', ids)
    songs_melted = gl.SFrame(pd.melt(songs_df, id_vars = 'id', value_vars=value_vars))
    songs_rec = gl.factorization_recommender.create(songs_melted, user_id = 'id', item_id='variable', target='value', num_factors = n_features)
    _, _, songs_item_intercept, songs_item_factors, songs_intercept = get_rec_coeffs(songs_rec)
    movies_sf = gl.load_sframe(path_to_movies_sf)
    movies_df = movies_sf.to_dataframe()
    
    value_vars = [x for x in movies_df.columns if x != 'id']

    new_ratings = {movie_dict[name]:int(ratings[name]) for name in ratings}
    new_df = pd.DataFrame.from_dict([new_ratings], orient='columns').replace(-1,np.nan)
    movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True)
    ids = [str(i) for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id', item_id='variable', target='value', num_factors=n_features)
    movies_user_intercept, movies_user_factors, _, _, movies_intercept = get_rec_coeffs(movies_rec)
    comb = np.dot(np.array(movies_user_factors)[-1], np.array(songs_item_factors).T)
    return songs_df.columns[1:][np.argsort(comb)[::-1]]
Code Example #5
File: problem1.py  Project: jeremyzyang/mit_6867
def wrapper(name):
    global pltsize
    Xt, Yt=loadData(name, 'train')
    Xv, Yv=loadData(name, 'validate')
    w = Train(Xt, Yt, 0)
    print 'Classification Error (TR): ', classifyErr(LRPredict(w, Xt), Yt, 0.5), name
    print 'Classification Error (VAL): ', classifyErr(LRPredict(w, Xv), Yv, 0.5), name
    t1 = 'Classification Error vs Decision Boundary - ' + name + ': Training'
    t2 = 'Classification Error vs Decision Boundary - ' + name + ': Validation'
    plotCEDB(w, Xt, Yt, '')
    plotCEDB(w, Xv, Yv, '')
    t1 = 'Logistic Regression - ' + name + ': Training'
    t2 = 'Logistic Regression - ' + name + ': Validation'
    plotDecisionBoundary(w, Xt, Yt, LRPredict, [0.5], '')
    plotDecisionBoundary(w, Xv, Yv, LRPredict, [0.5], '')
    l = array(linspace(0,100,101))
    tErr, tClass, vErr, vClass = GridL(Xt, Yt, Xv, Yv, l)
    DF1 = pd.DataFrame({'TR': pd.Series(tClass), 'VAL': pd.Series(vClass), 'Lambda': pd.Series(l)})
    DF1 = pd.melt(DF1,id_vars=['Lambda'])
    DF2 = pd.DataFrame({'TR': pd.Series(tErr), 'VAL': pd.Series(vErr), 'Lambda': pd.Series(l)})
    DF2 = pd.melt(DF2,id_vars=['Lambda'])
    title1 = 'Classification Error vs Lambda - ' + name
    title2 = 'Logisitic Loss vs Lambda - ' + name
    p1 = ggplot(DF1, aes(x='Lambda', y='value', color='variable')) + geom_line(size=4) + ggtitle('') + ylab('Error') + theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
    print p1
    p2 = ggplot(DF2, aes(x='Lambda', y='value', color='variable')) + geom_line(size=4) + ggtitle('') + ylab('Error') + theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
    print p2
Code Example #6
def find_avg_dataframe(df, log=None, value_vars=list()):
    try:
        avg_col = None
        for col in df.columns:
            if 'average' in str(col):
                avg_col = col
        if avg_col is not None:
            df_avg = pd.melt(df, id_vars=['year'], value_vars=[avg_col])
            if len(value_vars) == 0:
                all_columns = list()
                for col in df.columns:
                    all_columns.append(col)
                all_columns.remove(avg_col)
                all_columns.remove('year')
                value_vars = all_columns
            else:
                if avg_col in value_vars:
                    value_vars.remove(avg_col)

            df_lng = pd.melt(df, id_vars=['year'], value_vars=value_vars)
            
            print("Found average dataframe")
	    
            return df_avg, df_lng
    except KeyError as ke:
        if log:
            logging.error(str(ke))
        else:
            print("Could not find average dataframe")
    return pd.DataFrame(), pd.DataFrame()
def main():
    """Load up all the performances and do some stats"""
    log_files = []
    performances = []
    for local_file in os.listdir("data"):
        if local_file.endswith(".log"):
            log_files.append("data/" + local_file)
    print("Loading the performances.")
    for log in log_files:
        performances.append(MetatonePerformanceLog(log))

    ## Also load up the experiment design dataframe to merge with the data!
    experiment_design = pd.read_csv("2015-MetatoneStudy-ExperimentDesign.csv", index_col='time', parse_dates=True)


    print("Finding the lengths.")
    performer_length_dict = {}
    for perf in performances:
        performer_length_dict.update(perf.performer_lengths())
    performance_length_frame = pd.DataFrame.from_dict(performer_length_dict, orient="index")
    performance_length_frame['time'] = performance_length_frame.index
    performers = performances[0].performers().tolist()
    long_performance_lengths = pd.melt(performance_length_frame, id_vars=['time'], value_vars=performers)
    long_performance_lengths = long_performance_lengths.replace({'variable':DEVICE_SEATS})
    long_performance_lengths.to_csv("performance_lengths.csv")

    print("Creating Gesture Scores.")
    for perf in performances:
        perf.print_gesture_score() ## Prints out a gesture-score pdf for reference.

    print("Creating performance info dataframe.")
    perf_data = {}
    for perf in performances:
        perf_data.update({perf.first_touch_timestamp():{
            "raw_new_ideas":perf.raw_new_ideas,
            "new_idea_changes":perf.count_new_idea_interface_changes(),
            "button_presses":perf.count_button_interface_changes(),
            "flux":perf.ensemble_flux(),
            "entropy":perf.ensemble_entropy()
        }})
    performance_data = pd.DataFrame.from_dict(perf_data, orient = "index")
    performance_data.to_csv("performance_data.csv")

    print("Creating perfomer button press dataframe")
    performer_presses = {}
    for perf in performances:
        performer_presses.update(perf.button_interface_changes_by_performer())
    button_changes_frame = pd.DataFrame.from_dict(performer_presses,orient = "index")
    button_experiment_frame = pd.concat([experiment_design,button_changes_frame], axis = 1)
    performers = performances[0].performers().tolist()
    button_experiment_frame['time'] = button_experiment_frame.index

    long_button_frame = pd.melt(button_experiment_frame, id_vars=['time', 'perf_number', 'group', 'performance', 'button', 'server', 'overall'],
        value_vars=performers,
        var_name='seat',
        value_name='button_presses')
    long_button_frame = long_button_frame.replace({'seat':DEVICE_SEATS})
    long_button_frame['performer'] = np.vectorize(lambda x, y: PARTICIPANTS[x][y])(long_button_frame['group'], long_button_frame['seat'])
    long_button_frame.to_csv("button_presses_per_performer.csv")
Code Example #8
    def sales_to_db(self, kk_nullfall, kk_planfall):
        '''store the sales matrices in database'''
        # sum up the sales, join them on index into one dataframe, and replace missing entries
        # (e.g. no entries for planned markets in nullfall -> sales = 0)
        sales_nullfall = kk_nullfall.sum(axis=1)
        sales_planfall = kk_planfall.sum(axis=1)
        df_sales_null = pd.DataFrame(sales_nullfall, columns=['umsatz_nullfall'])
        df_sales_plan = pd.DataFrame(sales_planfall, columns=['umsatz_planfall'])
        df_sales = df_sales_null.join(df_sales_plan, how='outer')
        df_sales.fillna(0, inplace=True)
        df_sales['id'] = df_sales.index
        df_sales['umsatz_differenz'] = ((df_sales['umsatz_planfall'] /
                                         df_sales['umsatz_nullfall']) * 100 - 100)
        df_sales.fillna(0, inplace=True)

        self.parent_tbx.dataframe_to_table('Maerkte', df_sales, pkeys=['id'])

        # invert the pivoted tables
        kk_nullfall['id_markt'] = kk_nullfall.index
        kk_planfall['id_markt'] = kk_planfall.index
        df_nullfall = pd.melt(kk_nullfall,
                              value_name='kk_strom_nullfall',
                              id_vars='id_markt')
        df_planfall = pd.melt(kk_planfall,
                              value_name='kk_strom_planfall',
                              id_vars='id_markt')

        # join the results to the cell table
        cells = self.parent_tbx.table_to_dataframe('Beziehungen_Maerkte_Zellen')
        del cells['kk_strom_nullfall']
        del cells['kk_strom_planfall']
        cells = cells.merge(df_nullfall,
                            on=['id_siedlungszelle', 'id_markt'], how='left')
        cells = cells.merge(df_planfall,
                            on=['id_siedlungszelle', 'id_markt'], how='left')
        cells.fillna(0, inplace=True)
        cells.sort_values(by = ['id_markt', 'id_siedlungszelle'], inplace=True)


        # should be identical, but take both anyway
        sum_null = cells.groupby('id_siedlungszelle',
                                 as_index=False)['kk_strom_nullfall'].sum()
        sum_plan = cells.groupby('id_siedlungszelle',
                                 as_index=False)['kk_strom_planfall'].sum()
        cells = cells.merge(sum_null, on=['id_siedlungszelle'],
                            suffixes=('', '_sum'))
        cells = cells.merge(sum_plan, on=['id_siedlungszelle'],
                            suffixes=('', '_sum'))
        cells['kk_bindung_nullfall'] = cells['kk_strom_nullfall'] * 100 / cells['kk_strom_nullfall_sum']
        cells['kk_bindung_planfall'] = cells['kk_strom_planfall'] * 100 / cells['kk_strom_planfall_sum']

        # deletion of old entries and inserting is faster than updating
        self.parent_tbx.delete_rows_in_table('Beziehungen_Maerkte_Zellen')
        #column_values = {}
        #for col in cells.columns:
            #column_values[col] = cells[col].values
        arcpy.AddMessage(u'Schreibe Kenngrößen in Datenbank...')
        self.parent_tbx.insert_dataframe_in_table(
            'Beziehungen_Maerkte_Zellen', cells)
Code Example #9
def create_line_plot(plot_title, y_label, df, log, value_vars=list()):
    #variable_colors = dict()
    #colors = ['red', 'blue', 'green', 'orange', 'yellow', 'purple', 'black', 'cyan']
    #colors_to_hex = { 'red': '#FF0000', 'blue': '#00000FF', 'green': '#00FF00', 'orange': '#CC79A7', 'yellow': '#AAAA00', 'purple': '#AA00AA', 'black': '#FFFFFF', 'cyan': '#00AAFF' }
    #colors_to_col = dict()
    #color_index = 0
    #for col in df.columns:
        #if col != 'year':
            #variable_colors[col] = colors[color_index % len(colors)]
            #colors_to_col[colors[color_index % len(colors)]] = col
            #color_index += 1

    # Transform the columns into id, variable, and values columns, using the year column as the id
    df_lng = None
    try:
        df_aes_basis = pd.melt(df, id_vars=['year'])
        df_lng = pd.melt(df, id_vars=['year'], value_vars=value_vars)
    except KeyError as ke:
        if log:
            logging.error(str(ke))
        return None

    #df_avg, df_lng = find_avg_dataframe(df, log, value_vars)
    #if len(df_avg) == 0 or len(df_lng) == 0:
        #return None
    #color_list = list()
    #for row_index, row in df_lng.iterrows():
    #    color_list.append(variable_colors[row.variable])
    #
    #df_colors = pd.DataFrame(color_list, index=df_lng.index, columns=['color_mapping'])
    #df_lng = pd.concat([df_lng, df_colors], axis=1, join_axes=[df_lng.index])
    #

    plot = ggplot(aes(x='year', y='value', color='variable'), data=df_lng)

    #plot.add_to_legend(legend_type='color', legend_dict=colors_to_col)

    #print plot.data._get_numeric_data().columns

    #selected_color_list = list()
    #for col in value_vars:
        #selected_color_list.append(variable_colors[col])

    #plot.manual_color_list = selected_color_list

    #data_assigned_visual_mapping = assign_visual_mapping(data=df_aes_basis, aes=aes(x='year', y='value', color='variable'), gg=plot)
    #print data_assigned_visual_mapping

    plot += geom_line(aes(x='year', y='value', color='variable'), data=df_lng)
    plot += ggtitle(plot_title)
    plot += xlab('Year')
    plot += ylab(y_label)

    fig = plot.draw()

    return fig
Code Example #10
File: run.py  Project: ajerneck/thatsfordinner
def save_data_for_frontend(model, vectorizer, df):

    doc_ids = np.argsort(model.doc_topic_, axis=0)[-5:-1,:].T
    doc_probs = np.sort(model.doc_topic_, axis=0)[-5:-1,:].T
    topic_total_probs = np.sum(doc_probs, axis=1)
 
    ## extract and prepare most probable words.
    ## split bigrams and take the unique set of the resulting word list.
    w = p.most_probable_words(model, vectorizer.get_feature_names(), 10)
    word_data = collections.defaultdict(list)
    for topic, g in w.groupby('topic'):
        word_data[topic] = ', '.join([w.capitalize() for w in p.unique(itertools.chain(*g.sort('prob', ascending=False)['word'].str.split(' ').values))])
        # word_data[topic] = ', '.join([str(g['prob'].sum())] + [w.capitalize() for w in p.unique(itertools.chain(*g.sort('prob', ascending=False)['word'].str.split(' ').values))])
    # for k,v in word_data.iteritems():
    #     print k
    #     print topic_total_probs[k]
    #     word_data[k] = v + str(topic_total_probs[k])


    with open('frontend/app/word_data.pkl', 'w') as f:
        pickle.dump(word_data, f)


    di = pd.DataFrame(doc_ids)
    di['topic'] = di.index
    di = pd.melt(di, id_vars='topic')
    di.columns = ['topic','rank','key']
    dp = pd.DataFrame(doc_probs)
    dp['topic'] = dp.index
    dp = pd.melt(dp, id_vars='topic')
    dp.columns = ['topic','rank','prob']

    dd = pd.merge(di, dp)

    ## merge in document data for the most probable documents.
    df['topic'] = np.argmax(model.doc_topic_, axis=1).T
    df['topic_prob'] = np.max(model.doc_topic_, axis=1).T
    df['key'] = df.index
    most_probable_docs = pd.merge(df, dd)
    ## TODO: do the decoding here.

    most_probable_docs['ingredient_txt'] = [w for w in most_probable_docs['ingredient_txt'].str.split('\n') if w != []]
    doc_data = collections.defaultdict(list)
    for topic, g in most_probable_docs.groupby('topic'):
        row = g.sort('prob')[['ingredient_txt','image','url','title', 'key']].values
        doc_data[topic] = map(lambda x: dict(zip(['ingredient','image','url','title','key'], x)), row)
    with open('frontend/app/doc_data.pkl', 'w') as f:
        pickle.dump(doc_data, f)

    engine = p.make_engine()
    df.to_sql('clean_recipes', engine, if_exists='replace')
Code Example #11
File: test_multi.py  Project: dougc333/TestCode
def test_melt():
    pdf = pd.DataFrame({"A": list("abcd") * 5, "B": list("XY") * 10, "C": np.random.randn(20)})
    ddf = dd.from_pandas(pdf, 4)

    list_eq(dd.melt(ddf), pd.melt(pdf))

    list_eq(dd.melt(ddf, id_vars="C"), pd.melt(pdf, id_vars="C"))
    list_eq(dd.melt(ddf, value_vars="C"), pd.melt(pdf, value_vars="C"))
    list_eq(
        dd.melt(ddf, value_vars=["A", "C"], var_name="myvar"), pd.melt(pdf, value_vars=["A", "C"], var_name="myvar")
    )
    list_eq(
        dd.melt(ddf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
        pd.melt(pdf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
    )
Code Example #12
def additional_rows(table_career, var_value):

    def _fill_variables(row, var_value):
        years, values, starts, ends = yearly_value_converter(row[var_value], row.time_unit,
                                                             row.start_date, row.end_date)
        col_y = ['year_{}'.format(i) for i in range(len(years))]
        col_v = ['value_{}'.format(i) for i in range(len(years))]
        col_s = ['start_{}'.format(i) for i in range(len(years))]
        col_e = ['end_{}'.format(i) for i in range(len(years))]
        row[col_y] = years
        row[col_v] = values
        row[col_s] = starts
        row[col_e] = ends
        return row
    table = table_career.copy()
    year_vars = ['year_{}'.format(i) for i in range(20)]
    value_vars = ['value_{}'.format(i) for i in range(20)]
    start_vars = ['start_{}'.format(i) for i in range(20)]
    end_vars = ['end_{}'.format(i) for i in range(20)]
    for year, value, start, end in zip(year_vars, value_vars, start_vars, end_vars):
        table[year] = np.nan
        table[value] = np.nan
        table[start] = np.nan
        table[end] = np.nan
    table = table.apply(lambda x: _fill_variables(x, var_value), axis = 1)

    id_vars = [var_name for var_name in table.columns if var_name not in year_vars + value_vars + start_vars + end_vars]
    df_years = pd.melt(table, id_vars = id_vars, value_vars = year_vars,
                       var_name = 'var_year', value_name = 'year_from_melt')

    to_concat = [df_years]
    for to_add in ['value', 'start', 'end']:
        df_type = pd.melt(table, id_vars = ['noind', 'start_date'], value_vars = eval(to_add + '_vars'),
                          var_name = 'var_' + to_add, value_name = to_add + '_from_melt')
        assert (df_years['noind'] == df_type['noind']).all()
        df_type.drop(['noind', 'start_date'], inplace = True, axis=1)
        assert df_years.shape[0] == df_type.shape[0]
        to_concat += [df_type]
    df = pd.concat(to_concat, axis=1, join_axes=[df_years.index])
    del to_concat, table
    gc.collect()
    df = df.loc[df.value_from_melt.notnull(), :]
    df.drop([var_value, 'year', 'start_date', 'end_date', 'var_value', 'var_year', 'var_start', 'var_end'],
            inplace = True, axis=1)
    df.rename(columns={'value_from_melt': var_value, 'year_from_melt': 'year',
                       'end_from_melt': 'end_date', 'start_from_melt': 'start_date'}, inplace=True)
    df['time_unit'] = 'year'
    return df.sort(['noind', 'year', 'start_date'])
Code Example #13
def parse_sub(sub, office, district):
    sub = sub.reset_index(drop=True)

    # Special case these. Needs to be cleaned up and generalized.
    if (office, district) == ('U.S. House', '33'):
        sub = pd.concat([sub.iloc[0:4,   0:-1].reset_index(drop=True),
                         sub.iloc[5:9,   1:-1].reset_index(drop=True),
                         sub.iloc[10:14, 1:].reset_index(drop=True)], axis=1).dropna(how='all')
    elif (office, district) == ('State Assembly', '33'):
        sub = pd.concat([sub.iloc[0:4, 0:-1].reset_index(drop=True),
                         sub.iloc[5:9, 1:].reset_index(drop=True)], axis=1).dropna(how='all')
    elif (office, district) == ('U.S. House', '24'):
        sub = pd.concat([sub.iloc[0:6,  0:-1].reset_index(drop=True),
                         sub.iloc[7:13, 1:].reset_index(drop=True)], axis=1).dropna(how='all')

    sub.columns = ['county'] + \
        sub.iloc[:, 1:-1].iloc[0].fillna('').tolist() + ['office']
    sub = sub.dropna(axis=1, how='all')
    sub = sub.rename(columns=parse_candidate)
    parties = sub.iloc[:, 1:-1].iloc[1].to_dict()
    sub = sub[sub.county.isin(COUNTIES)]
    sub = pd.melt(sub, id_vars=['county', 'office'], value_vars=sub.columns.tolist()[
        1:-1], var_name='candidate', value_name='votes')
    sub['party'] = sub.candidate.apply(lambda x: parties[x])
    sub = sub.assign(office=office, district=district)
    return sub[fieldnames]
Code Example #14
def timePlotLine(data):
    normalize = input("Would you like to normalize the y-axis? (y/n): ")
    geneNamesDict = {}
    for _, row in data.iterrows():
        geneNamesDict[row['Gene']] = 1

    data = data.pivot_table('Values', ['Sample'], ['Gene', 'Time'])
    geneList = geneNamesDict.keys()
    ylabel = input("What should the y-axis label be?: ")

    counter = 1

    for key in geneList:
        
        plt.figure(counter)
        tempTable = data[key]
        tempTable = tempTable.T
        tempTable = tempTable.dropna(axis=1, how='any')
        if normalize == 'y':
            tempTable = tempTable / np.amax(tempTable.values)
            

        tempTable['Time'] = tempTable.index
        tempTable = pd.melt(tempTable, id_vars='Time')[['Time','value']]
        sns.regplot(x='Time',y='value',data=tempTable,scatter=True)
        plt.title(key)
        plt.ylabel(ylabel)
        plt.xlabel('Time(min)')
        counter += 1
    plt.show()
Code Example #15
def FormatToPrevise(df2_1, df2_2):
    # Dropping DATETime index to merge df1 and df2
    df2_1 = df2_1.reset_index(drop=False)
    df2_2 = df2_2.reset_index(drop=False)

    # Converting Historian files to VTQ format (DATETime, TAGNAME, DESCRIPTION, VALUE)
    mdf = pd.merge(pd.melt(df2_1, id_vars=['DATETIME'], var_name='TAGNAME',
                           value_name='DESCRIPTION')[['TAGNAME', 'DESCRIPTION']],
                   pd.melt(df2_2, id_vars=['DATETIME'], var_name='TAGNAME',
                           value_name='VALUE'),
                   on=['TAGNAME'])

    # Sort columns by VTQ format
    mdf = mdf[['DATETIME', 'TAGNAME', 'DESCRIPTION', 'VALUE']]

    return (mdf)
Code Example #16
File: check.py  Project: yujiex/GSA
def check_interval(filename):
    df = pd.read_csv(inputdir + filename)
    df.rename(columns=lambda x: x[:8] if x != 'Timestamp' else x,
              inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df.set_index(pd.DatetimeIndex(df['Timestamp']), inplace=True)
    # df.info()
    df_re = df.resample('M', how='sum')
    cols = list(df_re)
    df_re.reset_index(inplace=True)
    df_long = pd.melt(df_re, id_vars='index', value_vars=cols)
    # print
    # print df_long.head()
    df_long.rename(columns={'index':'Timestamp', 'variable': 'Building_Number', 'value': 'Electricity_(KWH)'}, inplace=True)
    df_long['month'] = df_long['Timestamp'].map(lambda x: x.month)
    df_long['year'] = df_long['Timestamp'].map(lambda x: x.year)
    col_str = ','.join(['\'{0}\''.format(x) for x in cols])
    conn = uo.connect('all')
    with conn:
        df = pd.read_sql('SELECT Building_Number, year, month, [Electricity_(KWH)] FROM EUAS_monthly WHERE Building_Number IN ({0}) AND year = \'2015\''.format(col_str), conn)
    # print df.head()
    df_long.drop('Timestamp', axis=1, inplace=True)
    df_all = pd.merge(df, df_long, how='left', on=['Building_Number', 'year', 'month'], suffixes=['_EUAS', '_ION'])
    df_all['ratio'] = df_all['Electricity_(KWH)_ION']/df_all['Electricity_(KWH)_EUAS'].map(lambda x: round(x, 3))
    df_all['percent_diff'] = df_all['ratio'].map(lambda x: abs(1 - x) * 100.0)
    # print df_all.head()
    return df_all
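Note: `df.resample('M', how='sum')` above relies on the old string-based `how` argument, which has since been removed from pandas; the same monthly aggregation is now written as a method call on the resampler. A small sketch with made-up data (the column name is an assumption, not from check.py):

import pandas as pd

idx = pd.date_range('2015-01-01', periods=90, freq='D')
demo = pd.DataFrame({'kwh': 1.0}, index=idx)
# Modern equivalent of demo.resample('M', how='sum')
monthly = demo.resample('M').sum()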
Code Example #17
File: dplython.py  Project: dodger487/dplython
 def __call__(self, df):
   df_cols = df.columns.values.tolist()
   id_vals = [col._name for col in self.args[2]]
   id_vars = [col for col in df_cols if col not in id_vals]
   key = self.args[0]
   value = self.args[1]
   return pandas.melt(df, id_vars, id_vals, key, value)
Code Example #18
File: test_future_wealth.py  Project: ganong123/HARK
def gg_funcs(functions,bottom,top,N=1000,labels = ["Baseline"],
             title = "Consumption and Cash-on-Hand", ylab = "y", xlab="x", 
             loc = loc, ltitle = 'Variable',
             file_name = None):
    if type(functions)==list:
        function_list = functions
    else:
        function_list = [functions]
       
    step = (top-bottom)/N
    x = np.arange(bottom,top,step)
    fig = pd.DataFrame({'x': x})
    #xx there's got to be a better way to scroll through this list
    i = 0
    for function in function_list:
        fig[labels[i]] = function(x)
        #print labels[i]
        i=i+1
    fig = pd.melt(fig, id_vars=['x'])  
    #print(fig)
    g = gg.ggplot(fig) + \
        mp.base_plot + mp.line + mp.point +  \
        mp.theme_bw(base_size=9) + mp.fte_theme +mp.colors +  \
        gg.labs(title=title,y=ylab,x=xlab) + mp.legend_f(loc) + mp.legend_t_c(ltitle) + mp.legend_t_s(ltitle) #+ \
        #
        #gg.geom_text(data=pd.DataFrame(data={'l':"test"},index=np.arange(1)), x = "1", y = "1",group="1",colour="1", label = "plot mpg vs. wt")
        #gg.geom_text(data=pd.DataFrame(data={'l':"test"},index=np.arange(1)), mapping=gg.aes_string(x="1", y="1",group="1",colour="1",shape="1", mapping="l")) 
    if file_name is not None:
        mp.ggsave(file_name,g)
    return(g)
Code Example #19
def get_rep_data(C_prog_arg, rep) :
  strain_1_file = root_path + "/" + str(C_prog_arg) \
                + "/J0." + str(rep) + "." + str(C_prog_arg)
  strain_2_file = root_path + "/" + str(C_prog_arg) \
                + "/J1." + str(rep) + "." + str(C_prog_arg)
  
  newind = np.arange(260) + 1
  strain_1 = pd.read_csv(strain_1_file, 
                         sep="\t", 
                         header=None,
                         names=newind)
  strain_2 = pd.read_csv(strain_2_file, 
                         sep="\t",
                         header=None,
                         names=newind)
  n = np.shape(strain_1)[0]
  strain_1['strain'] = 1
  strain_2['strain'] = 2
  strain_1['t'] = np.linspace(0, n * dt * prntime / 365, n)
  strain_2['t'] = np.linspace(0, n * dt * prntime / 365, n)
  
  out = pd.merge(strain_1, strain_2, 
                 #on=('t', 'strain'),
                 how='outer')
  #out = pd.concat([strain_1, strain_2], axis=1)
  out['rep'] = rep

  out = pd.melt(out, 
                id_vars=['t', 'strain', 'rep'], 
                value_vars=list(newind),
                var_name="city_newind",
                value_name='inc')
  out.head() 
 
  return out
Code Example #20
File: output.py  Project: NREL/OpenStudio-ResStock
def expand(predicted, tsv_file):
  tsv = pd.read_csv(tsv_file, sep='\t')
  on = []
  for col in tsv.columns:
    if 'Dependency=' in col:
      tsv = tsv.rename(columns={col: col.replace('Dependency=', 'building_characteristics_report.').lower().replace(' ', '_')})
      on.append(col.replace('Dependency=', 'building_characteristics_report.').lower().replace(' ', '_'))

  try:
    predicted = predicted.reset_index()
    predicted = predicted.merge(tsv, on=on, how='left')
  except KeyError as ke:
    sys.exit('Column {} does not exist.'.format(ke))
    
  id_vars = []
  value_vars = []
  for col in predicted.columns:
    if 'Option=' in col:
      value_vars.append(col)
    else:
      id_vars.append(col)
    
  melted = pd.melt(predicted, id_vars=id_vars, value_vars=value_vars, var_name='building_characteristics_report.{}'.format(os.path.basename(tsv_file).replace('.tsv', '').lower().replace(' ', '_')), value_name='frac')
  melted = melted.set_index('_id')
  melted['building_characteristics_report.{}'.format(os.path.basename(tsv_file).replace('.tsv', '').lower().replace(' ', '_'))] = melted['building_characteristics_report.{}'.format(os.path.basename(tsv_file).replace('.tsv', '').lower().replace(' ', '_'))].str.replace('Option=', '')
    
  return melted
Code Example #21
File: plotting.py  Project: csddzh/NS_Classify
def plot_clf_polar(clf, cmap=None, key='nickname', n_topics=60, n_top=3, labels=None, topics = None, mask=None, selection='top', metric='correlation', max_val=None):
    import pandas as pd
    import seaborn as sns

    ## Set up topic nicknames
    word_keys = pd.read_csv("../data/unprocessed/abstract_topics_filtered/topic_sets/topic_keys" + str(n_topics) + "-july_cognitive.csv")
    word_keys['topic_name'] = "topic" + word_keys['topic'].astype('str')

    o_fi = pd.DataFrame(clf.odds_ratio)

    # Melt feature importances, and add top_words for each feeature
    o_fi['region'] = range(1, o_fi.shape[0] + 1)
    o_fis_melt = pd.melt(o_fi, var_name='topic_order', value_name='importance', id_vars=['region'])

    word_keys = pd.merge(pd.DataFrame(np.array([range(0, clf.feature_importances.shape[1]), clf.feature_names]).T, columns=['topic_order', 'topic_name']), word_keys)
    word_keys.topic_order = word_keys.topic_order.astype('int')

    o_fis_melt= pd.merge(o_fis_melt, word_keys)
    o_fis_melt['abs_imp'] = np.abs(o_fis_melt['importance'])
    
    if mask is not None:
        o_fis_melt = o_fis_melt[o_fis_melt.region.isin(mask)]
        
    if topics is not None:
        o_fis_melt = o_fis_melt[o_fis_melt[key].isin(topics)]
    
    pplot = pd.pivot_table(o_fis_melt, values='importance', index=[key], columns=['region'])
    
    if cmap is None:
        cmap = sns.color_palette('Set1', clf.feature_importances.shape[0])
    if mask is not None:
        cmap = [n[0] for n in sorted(zip(np.array(cmap)[np.array(mask)-1], mask), key=lambda tup: tup[1])]
    return plot_polar(pplot, overplot=True, palette=cmap, n_top=n_top, metric=metric, selection=selection, 
        label_size=30, labels=labels, max_val=max_val)
Code Example #22
def tx_modes_plot(consensus_data, ordered_genomes, tx_mode_plot_tgt):
    ordered_groups = ['transMap', 'transMap+TM', 'transMap+TMR', 'transMap+TM+TMR', 'TM', 'TMR', 'TM+TMR', 'CGP', 'PB',
                      'Other']
    ordered_groups = OrderedDict([[frozenset(x.split('+')), x] for x in ordered_groups])

    def split_fn(s):
        return ordered_groups.get(frozenset(s['Transcript Modes'].replace('aug', '').split(',')), 'Other')

    modes_df = json_biotype_counter_to_df(consensus_data, 'Transcript Modes')
    df = modes_df.pivot(index='genome', columns='Transcript Modes').transpose().reset_index()
    df['Modes'] = df.apply(split_fn, axis=1)
    df = df[['Modes'] + ordered_genomes]
    ordered_values = [x for x in ordered_groups.itervalues() if x in set(df['Modes'])]
    with tx_mode_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        title_string = 'Transcript modes in protein coding consensus gene set'
        ylabel = 'Number of transcripts'
        if len(ordered_genomes) > 1:
            df['Ordered Modes'] = pd.Categorical(df['Modes'], ordered_values, ordered=True)
            df = df.sort_values('Ordered Modes')
            df = df[['Ordered Modes'] + ordered_genomes].set_index('Ordered Modes')
            df = df.fillna(0)
            generic_stacked_barplot(df, pdf, title_string, df.index, ylabel, ordered_genomes, 'Transcript mode(s)',
                                    bbox_to_anchor=(1.25, 0.7))

        else:
            generic_barplot(pd.melt(df, id_vars='Modes'), pdf, 'Transcript mode(s)', ylabel, title_string, x='Modes',
                            y='value', order=ordered_values)
Code Example #23
File: test_legend.py  Project: 2dpodcast/ggplot
def test_linetype():
    meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']], id_vars='date')
    p = ggplot(aes(x='date', y='value', colour='variable',
               linetype='variable', shape='variable'), data=meat_lng) + \
        geom_line() + geom_point() +\
        ylim(0, 3000)
    assert_same_ggplot(p, "legend_linetype")
Code Example #24
File: pvc_raw_mappings.py  Project: catfishy/jagust
def generateBathroomTilePlot(bl_vs_change_json):
    df = pd.read_json(bl_vs_change_json)
    summary_regions = ['ctx-lh-parsorbitalis','ctx-rh-parsorbitalis','ctx-rh-lateralorbitofrontal',
                       'ctx-lh-lateralorbitofrontal','ctx-rh-frontalpole','ctx-rh-parstriangularis',
                       'ctx-lh-frontalpole','ctx-lh-parstriangularis','ctx-lh-caudalanteriorcingulate',
                       'ctx-rh-rostralmiddlefrontal','ctx-lh-caudalmiddlefrontal',
                       'ctx-rh-caudalanteriorcingulate','ctx-rh-rostralanteriorcingulate',
                       'ctx-lh-rostralmiddlefrontal','ctx-rh-caudalmiddlefrontal',
                       'ctx-lh-superiorparietal','ctx-rh-isthmuscingulate',
                       'ctx-lh-rostralanteriorcingulate','ctx-rh-parsopercularis',
                       'ctx-rh-superiorparietal','ctx-lh-parsopercularis',
                       'ctx-rh-medialorbitofrontal','ctx-lh-isthmuscingulate',
                       'ctx-lh-supramarginal','ctx-lh-inferiorparietal','ctx-rh-supramarginal',
                       'ctx-lh-superiorfrontal','ctx-rh-superiorfrontal','ctx-rh-middletemporal',
                       'ctx-lh-middletemporal','ctx-rh-inferiorparietal','ctx-rh-superiortemporal',
                       'ctx-lh-posteriorcingulate','ctx-lh-precuneus','ctx-lh-medialorbitofrontal',
                       'ctx-lh-superiortemporal','ctx-rh-posteriorcingulate','ctx-rh-precuneus']
    ordering = {x:i for i,x in enumerate(summary_regions)}
    rank_by = summary_regions # could take subset of cortical summary regions
    subjects = GROUPS['increasing_low']['N']
    df = df[df['rid'].isin(subjects)]

    baseline_keys = ["%s_bl" % _ for _ in rank_by]
    change_keys = ["%s_change" % _ for _ in summary_regions]
    df['rank'] = df[baseline_keys].mean(axis=1)

    keep_keys = ['rid', 'rank'] + change_keys
    df = df[keep_keys]
    df_long = pd.melt(df,id_vars=['rank'],value_vars=change_keys)

    # sort change
    df_long['variable'] = [_.replace('_change','') for _ in df_long['variable']]
    df_long['variable'] = ['%s_%s' % (str(ordering[_]).zfill(2),_) for _ in df_long['variable']]

    print ggplot(aes(x='variable',y='rank'),data=df_long)+geom_tile(aes(fill='value'))+theme(axis_text_x=element_text(angle=270,size=8), axis_text_y=element_text(size=6))
Code Example #25
def parse_dates2(df):

    for k in range(len(df.columns)):
        vtype = df.convert_objects(convert_numeric=True).dtypes[k]
        if vtype == 'int64':
            if k != 0:
                df[df.columns[k]] = df[df.columns[k]].astype(float)

    df.drop(df.columns[-1], axis=1, inplace=True)        

    df.rename(columns={'HORA UTC': 'date'}, inplace=True)

    df = pd.melt( df, id_vars=["date"] ).rename(columns={'variable': 'hour'} )

    df['hour'] = df['hour'].astype(str)

    df.hour = df.hour.apply(lambda x: '%04i' %int(x) )
    df.hour = df.hour.apply(lambda x: x[:2] ) 

    df.date = df.apply(lambda x: pd.to_datetime(x.date, format="%Y-%m-%d")\
                        + timedelta(hours=int(x.hour)), axis=1)
    
    df.rename(columns={'value': var}, inplace=True)
    df.drop('hour', 1, inplace=True)
    
    df.set_index('date', inplace=True) 
    df.sort(inplace=True)

    return df        
def parse_los_angeles():
    output_columns = ['county', 'precinct', 'office',
                      'district', 'party', 'candidate', 'votes']

    sovc_zip_url = 'https://www.lavote.net/documents/SVC/3744_SVC_Excel.zip'
    sovc_zip = requests.get(sovc_zip_url)
    if sovc_zip.status_code != 200:
        return
    f = tempfile.NamedTemporaryFile()
    f.write(sovc_zip.content)
    sovc_zf = zipfile.ZipFile(f.name)
    df = pd.read_excel(sovc_zf.open(
        '34TH_CONGRESS_DIST_U-T_06-06-17_Voter_Nominated_by_Precinct_3744-5055.xls'))

    df.columns = df.loc[1]
    df = df[df.TYPE == 'TOTAL']
    table = pd.melt(df, id_vars=['PRECINCT'], value_vars=df.columns.tolist()[
        8:-1], var_name='candidate', value_name='votes').assign(county='Los Angeles', office='U.S. House', district='34').rename(columns={'PRECINCT': 'precinct'}).replace({'candidate': candidates})
    parties = {k: 'DEM' for k in candidates.values()}

    table['party'] = table.candidate.apply(lambda x: parties[x])
    for x in ['candidate', 'district', 'office', 'precinct', 'county']:
        table = table.sort_values(by=x, kind='mergesort')
    table[output_columns].to_csv(
        '2017/20170606__ca__special__general__los_angeles__precinct.csv', index=False)
Code Example #27
File: attention.py  Project: gitHubyan/LearnPython
def main():

    df = pd.read_csv("./attention.csv")
    df = pd.melt(df, ["subidr", "attnr"], var_name="solutions", value_name="score")
    df.solutions = df.solutions.str[-1].astype(int)
    df.columns = ["subject", "attention", "solutions", "score"]
    df.to_csv("attention.csv")
Code Example #28
File: chart_utils.py  Project: JaySpell/storage_stats
def donutchart(*args, **kwargs):
    #get info from submitted data
    data = kwargs.get('data', 'None')
    ids = kwargs.get('ids', 'None')
    vals = kwargs.get('vals', 'None')
    val_name = kwargs.get('val_name', 'None')
    v_name = kwargs.get('v_name', 'None')
    out_file = kwargs.get('out_file', 'None')

    if 'None' in (data, ids, vals, val_name):
        return "Data must be submitted"

    df = df_from_json(data)
    df = df.sort("total", ascending=False)
    df = pd.melt(df, id_vars=[ids],
                value_vars=[vals],
                value_name=val_name,
                var_name=v_name)
    d = Donut(df, label=[ids, v_name],
            values=v_name,
            text_font_size='8pt',
            hover_text='vals')

    output_file(out_file)
    save(d)
Code Example #29
File: clean.py  Project: codefordurham/Durham-Data
def reshape(school_attendance):
    # reshape the data into a more normal form
    x = pd.melt(school_attendance, id_vars=['school_year', 'lea_name',
                                            'lea_number', 'school_number',
                                            'school_name', 'grade_level'])
    
    def get_sex(v):
        if v.endswith('_Male'):
            return 'Male'
        if v.endswith('_Female'):
            return 'Female'
        assert False, 'can not get here'

    def get_race(v):
        if v.endswith('_Male'):
            return v[:-5].replace('_', ' ')
        if v.endswith('_Female'):
            return v[:-7].replace('_', ' ')
        assert False, 'can not get here'
        # did not work
        #return v.rstrip('_Male').rstrip('_Female')
    
    x['sex'] = x.variable.map(get_sex)
    x['race'] = x.variable.map(get_race)
    x['attendance'] = x.value
    del(x['variable'])
    del(x['value'])  
    
    return x
Code Example #30
def process_load_data(filename):

    # import data
    df = pd.read_csv(filename, parse_dates=[[1, 2, 3]], thousands=',')

    # unpivot
    df = pd.melt(df, id_vars=['year_month_day', 'zone_id'], var_name='hour')

    # drop rows where value is NaN
    df.dropna(inplace=True)

    # drop where zoneid = 21 [this is just a total row, that occurs in solution data only]
    df = df[df.zone_id != 21]

    # create datetime col
    df.hour = df.hour.str.replace('h', '')
    df.hour = pd.to_timedelta(df.hour.astype(int) - 1, unit='h')
    df['datetime'] = df.year_month_day + df.hour

    # drop and reorder columns
    df = df[['datetime', 'zone_id', 'value']].copy()

    # add weights
    df['weight'] = 1
    # increase weight on future predictions - where datetime > 2008-06-30 05:30
    predictions_start_datetime = datetime(2008, 6, 30, 5, 30, 0)
    df.loc[df['datetime'] > predictions_start_datetime, 'weight'] = 8

    # add trend variable [incremental number of hours]
    trend_start_datetime = datetime(2004, 1, 1, 0, 0, 0)
    df['trend'] = (df.datetime - trend_start_datetime) / np.timedelta64(1, 'h') + 1

    df = df.sort_values(by=['zone_id', 'datetime'], ascending=[True, True])

    return df
Code Example #31
def run_t_test_app():
    st.header('■t-test')
    st.write(
        'Compares the results of two tests, e.g., to examine the difference in performance by teaching method.'
    )

    st.sidebar.subheader('Data Upload')

    df_edu = pd.read_csv("data/eng_sample_data_t_test.csv")

    def download_link(object_to_download, download_filename,
                      download_link_text):
        if isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(
                index=False, encoding='utf_8_sig')
            b64 = base64.b64encode(object_to_download.encode()).decode()
            return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

    tmp_download_link = download_link(df_edu, 'sample_ttest.csv',
                                      'Download sample csv file.')
    st.sidebar.markdown(tmp_download_link, unsafe_allow_html=True)

    #     st.sidebar.info("""
    #     [Download the sample csv file](https://github.com/59er/eng_learning_analytics_web/blob/master/sample_data/eng_sample_data_t_test_for_WEB.csv)
    #         """)

    uploaded_file = st.sidebar.file_uploader(
        "File upload (Drag and drop or use [Browse files] button to import csv file. Only utf-8 format is available.)",
        type=["csv"])
    # uploaded_file = st.file_uploader(
    #     label = 'File Upload(Drag and drop csv/Excel)',
    #     type = ['csv', 'xlsx']
    # )

    try:

        if uploaded_file is not None:
            df_edu = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            display_data = st.sidebar.checkbox(label='Show uploaded data')

            if display_data:
                st.dataframe(df_edu)

        else:
            df_edu = pd.read_csv('data/eng_sample_data_t_test.csv')

            show_df = st.sidebar.checkbox('Show DataFrame')

            if show_df == True:
                st.write(df_edu)

        A_var = np.var(df_edu.iloc[:, 0], ddof=1)
        B_var = np.var(df_edu.iloc[:, 1], ddof=1)
        A_df = len(df_edu) - 1
        B_df = len(df_edu) - 1
        f = A_var / B_var
        one_sided_pval1 = stats.f.cdf(f, A_df, B_df)
        one_sided_pval2 = stats.f.sf(f, A_df, B_df)
        two_sided_pval = min(one_sided_pval1, one_sided_pval2)

        st.subheader(
            "Test for equality of variances between the two groups "
            "(p-value < 0.05: unequal variances, Welch's t-test applied; "
            "p-value > 0.05: equal variances, Student's t-test applied).")
        dist = round(two_sided_pval, 3)
        st.write('F      ', round(f, 3))
        st.write('p-value:', round(two_sided_pval, 3))

        if dist < 0.05:

            result_w = stats.ttest_ind(df_edu.iloc[:, 0], df_edu.iloc[:, 1],
                                       equal_var=False)
            st.subheader('t-test results (Welch)')
            st.write(result_w)

        else:
            result_s = stats.ttest_ind(df_edu.iloc[:, 0], df_edu.iloc[:, 1])
            st.subheader('t-test results (Student)')
            st.write(result_s)

        st.set_option('deprecation.showPyplotGlobalUse', False)

        st.write(
            sns.catplot(x='variable',
                        y='value',
                        kind='box',
                        data=pd.melt(df_edu)))
        plt.title('Comparison between the two groups', fontsize=15)
        plt.show()
        st.pyplot()

    except Exception as e:
        st.header(
            'ERROR: Data inconsistency. Check data format to be uploaded.')
        print('Data inconsistency error')
Code Example #32
# The goal is to pivot long-format data with columns (client, propcode, propfp, pctile, adj), e.g.:
#     test        127306    2B1B-A4     0.9    69.23
#     test        127306    2B1B-A4     1      0
#
# to something like this:
#     client      propcode  propfp        0    0.1    0.2    0.3    0.4    0.5    0.6    0.7    0.8          0.9    1
#     venterra    127306    1B1B-A123*  100  99.11  98.32  97.05  95.53  93.99  92.16  89.9   85.5         65.05    0
#     venterra    127306    2B1B-A4     100  98.91  97.82  96.73  95.64  94.55  93.46  90.4   88.65333333  69.23    0
# Python code for it would be:
df_fix = df.pivot_table(index = ['client','propcode','propfp'], columns='pctile', values='adj')
df_fix.columns = df_fix.columns.get_level_values('pctile')
df_fix.reset_index(inplace=True)

# From pivot table format back to original format, use melt
v = ['client','propcode','propfp']
df = pd.melt(df_fix, id_vars = v, var_name = 'pctile', value_name = 'adj')


"""
h2o.ai in python
"""
import h2o
# Start the h2o clusters / shut down clusters
h2o.init()
h2o.cluster().shutdown()
h2o.cluster().show_status() # check cluster status

# Data exchange between pandas and h2o
df_h2o = h2o.H2OFrame(df) # import pandas dataframe to h2o dataframe
df = df_h2o.as_data_frame() # export h2o dataframe to pandas dataframe
Code Example #33
import pandas as pd

filename = 'Inj_Prodbywell.xls'
df = pd.read_table(filename)
df_inj = pd.melt(
    df,
    id_vars=['Apino', 'Company', 'Inj_type', 'Field', 'Formation', 'Year'],
    value_vars=[
        'Jan_Inj', 'Feb_Inj', 'Mar_Inj', 'Apr_Inj', 'May_Inj', 'Jun_Inj',
        'Jul_Inj', 'Aug_Inj', 'Sep_Inj', 'Oct_Inj', 'Nov_Inj', 'Dec_Inj'
    ],
    var_name="Month_val",
    value_name="Inj")
df_inj['Month_val'] = df_inj['Month_val'].replace(to_replace='_Inj',
                                                  value='',
                                                  regex=True)
month_transform = {
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12
}
df_inj['Month_val'] = df_inj['Month_val'].map(month_transform)
Code Example #34
# model8-3 uses this approach to avoid the warning, but the sampling does not go well...

# Predictive distribution
probs = (2.5, 50, 97.5)
qua = np.transpose(np.percentile(mcmc_sample['y_new'], (2.5, 50, 97.5),
                                 axis=0),
                   axes=(1, 2, 0))
d_est = pandas.DataFrame(qua.reshape((-1, 3)),
                         columns=['p{}'.format(p) for p in probs])
d_est['PersonID'] = np.repeat(np.arange(N) + 1, T_new)
d_est['Time'] = np.tile(Time_new, N)

print(d_est)
Time_tbl = pandas.Series(Time, index=['Time{}'.format(t) for t in Time])
d = pandas.melt(data_conc2,
                id_vars='PersonID',
                var_name='Time',
                value_name='Y')
d['Time'] = Time_tbl[d['Time']].values

_, axes = plt.subplots(4, 4, figsize=figaspect(7 / 8) * 1.5)
for (row, col), ax in np.ndenumerate(axes):
    person = row * 4 + col + 1
    ax.fill_between('Time',
                    'p2.5',
                    'p97.5',
                    data=d_est.query('PersonID==@person'),
                    color='k',
                    alpha=1 / 5)
    ax.plot('Time', 'p50', data=d_est.query('PersonID==@person'), color='k')
    ax.scatter('Time', 'Y', data=d.query('PersonID==@person'), color='k')
    if row < 3:
Code Example #35
File: melter.py  Project: SamGomes/message-across
gives_df = data[[
    'playerId', 'meanNumberOfGives_A', 'meanNumberOfGives_B',
    'meanNumberOfGives_C', 'meanNumberOfGives_D'
]]

gives_df.rename(columns={
    'meanNumberOfGives_A': 'A',
    'meanNumberOfGives_B': 'B',
    'meanNumberOfGives_C': 'C',
    'meanNumberOfGives_D': 'D'
},
                inplace=True)

gives_melted_df = pd.melt(gives_df,
                          id_vars=['playerId'],
                          value_vars=['A', 'B', 'C', 'D'])

gives_melted_df.rename(columns={
    'variable': 'ScoreSystem',
    'value': 'gives'
},
                       inplace=True)

#print(gives_melted_df)

#################################### Takes

takes_df = data[[
    'playerId', 'meanNumberOfTakes_A', 'meanNumberOfTakes_B',
    'meanNumberOfTakes_C', 'meanNumberOfTakes_D'
Code Example #36
File: __init__.py  Project: gyd1990/scanpy
def rank_genes_groups_violin(adata,
                             groups=None,
                             n_genes=20,
                             gene_names=None,
                             gene_symbols=None,
                             use_raw=None,
                             key=None,
                             split=True,
                             scale='width',
                             strip=True,
                             jitter=True,
                             size=1,
                             ax=None,
                             show=None,
                             save=None):
    """\
    Plot ranking of genes for all tested comparisons.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    groups : list of `str`, optional (default: `None`)
        List of group names.
    n_genes : `int`, optional (default: 20)
        Number of genes to show. Is ignored if `gene_names` is passed.
    gene_names : `None` or list of `str` (default: `None`)
        List of genes to plot. Is only useful if interested in a custom gene list,
        which is not the result of :func:`scanpy.api.tl.rank_genes_groups`.
    gene_symbols : `str`, optional (default: `None`)
        Key for field in `.var` that stores gene symbols if you do not want to
        use `.var_names` displayed in the plot.
    use_raw : `bool`, optional (default: `None`)
        Use `raw` attribute of `adata` if present. Defaults to the value that
        was used in :func:`~scanpy.api.tl.rank_genes_groups`.
    split : `bool`, optional (default: `True`)
        Whether to split the violins or not.
    scale : `str`, optional (default: 'width')
        See `seaborn.violinplot`.
    strip : `bool`, optional (default: `True`)
        Show a strip plot on top of the violin plot.
    jitter : `int`, `float`, `bool`, optional (default: `True`)
        If set to 0, no points are drawn. See `seaborn.stripplot`.
    size : `int`, optional (default: 1)
        Size of the jitter points.
    {show_save_ax}
    """
    if key is None:
        key = 'rank_genes_groups'
    groups_key = str(adata.uns[key]['params']['groupby'])
    if use_raw is None:
        use_raw = bool(adata.uns[key]['params']['use_raw'])
    reference = str(adata.uns[key]['params']['reference'])
    groups_names = (adata.uns[key]['names'].dtype.names
                    if groups is None else groups)
    if isinstance(groups_names, str): groups_names = [groups_names]
    axs = []
    for group_name in groups_names:
        if gene_names is None:
            gene_names = adata.uns[key]['names'][group_name][:n_genes]
        df = pd.DataFrame()
        new_gene_names = []
        for g in gene_names:
            if adata.raw is not None and use_raw:
                X_col = adata.raw[:, g].X
            else:
                X_col = adata[:, g].X
            if issparse(X_col): X_col = X_col.toarray().flatten()
            new_gene_names.append(
                g if gene_symbols is None else adata.var[gene_symbols][g])
            df[g] = X_col
        df['hue'] = adata.obs[groups_key].astype(str).values
        if reference == 'rest':
            df.loc[df['hue'] != group_name, 'hue'] = 'rest'
        else:
            df.loc[~df['hue'].isin([group_name, reference]), 'hue'] = np.nan
        df['hue'] = df['hue'].astype('category')
        df_tidy = pd.melt(df, id_vars='hue', value_vars=new_gene_names)
        x = 'variable'
        y = 'value'
        hue_order = [group_name, reference]
        import seaborn as sns
        _ax = sns.violinplot(x=x,
                             y=y,
                             data=df_tidy,
                             inner=None,
                             hue_order=hue_order,
                             hue='hue',
                             split=split,
                             scale=scale,
                             orient='vertical',
                             ax=ax)
        if strip:
            _ax = sns.stripplot(x=x,
                                y=y,
                                data=df_tidy,
                                hue='hue',
                                dodge=True,
                                hue_order=hue_order,
                                jitter=jitter,
                                color='black',
                                size=size,
                                ax=_ax)
        _ax.set_xlabel('genes')
        _ax.set_title('{} vs. {}'.format(group_name, reference))
        _ax.legend_.remove()
        _ax.set_ylabel('expression')
        _ax.set_xticklabels(new_gene_names, rotation='vertical')
        writekey = ('rank_genes_groups_' +
                    str(adata.uns[key]['params']['groupby']) + '_' +
                    group_name)
        utils.savefig_or_show(writekey, show=show, save=save)
        axs.append(_ax)
    if show == False: return axs
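
# A minimal sketch, not part of the original example: the wide-to-long step above,
# reproduced on made-up data. Each gene column becomes rows of a tidy frame with a
# 'hue' column, which is the layout the split violin plot consumes.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'geneA': np.random.rand(6),
    'geneB': np.random.rand(6),
    'hue': ['groupX', 'rest'] * 3,
})
toy_tidy = pd.melt(toy, id_vars='hue', value_vars=['geneA', 'geneB'])
print(toy_tidy.head())  # columns: hue, variable (the gene), value (its expression)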
コード例 #37
0
def index():

    # extract data needed for visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)

    category_melt = pd.melt(df,
                            id_vars=['id', 'message', 'original', 'genre'],
                            var_name='category')
    category_counts = category_melt.groupby('category').sum()['value']
    category_names = list(category_counts.index)

    df['message_len'] = df.message.str.len()
    message_lens = df['message_len']

    # create visuals
    graphs = [
        {
            'data': [Bar(x=genre_names, y=genre_counts)],
            'layout': {
                'title': 'Distribution of Message Genres',
                'yaxis': {
                    'title': "Count"
                },
                'xaxis': {
                    'title': "Genre"
                },
            }
        },
        {
            'data': [Bar(x=category_names, y=category_counts)],
            'layout': {
                'title': 'Distribution of Message Categories',
                'yaxis': {
                    'title': "Count"
                },
                'xaxis': {
                    'title': "Category",
                    'tickangle': -45,
                },
            }
        },
        {
            'data': [{
                'type': 'histogram',
                'x': message_lens,
            }],
            'layout': {
                'title': 'Histogram of Message Lengths',
                'yaxis': {
                    'title': "Count"
                },
            }
        },
    ]

    # encode plotly graphs in JSON
    ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
    graphJSON = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)

    # render web page with plotly graphs
    return render_template('master.html', ids=ids, graphJSON=graphJSON)
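
# A minimal sketch with made-up data, not the original df: how the category counts
# above are derived. Melting the one-hot category columns and summing 'value' per
# category yields one count per category.
import pandas as pd

toy = pd.DataFrame({
    'id': [1, 2, 3],
    'message': ['a', 'b', 'c'],
    'original': ['a', 'b', 'c'],
    'genre': ['news', 'direct', 'news'],
    'water': [1, 0, 1],
    'food': [0, 1, 1],
})
toy_melt = pd.melt(toy, id_vars=['id', 'message', 'original', 'genre'],
                   var_name='category')
print(toy_melt.groupby('category')['value'].sum())  # food 2, water 2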
コード例 #38
0

# Check that index column was added

# In[46]:


common_dict['index_col'].head()


# Unpivot columns other than the index

# In[47]:


common_dict_melt = pd.melt(common_dict, id_vars=['index_col'])


# In[48]:


common_dict_melt.head()


# Remove the surrounding braces from the melted values

# In[49]:


common_dict_melt['value'] = common_dict_melt['value'].map(lambda x: x.lstrip('{').rstrip('}'))
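
# A minimal sketch on made-up data, not the original common_dict: the same cleanup
# can be done with the vectorised .str.strip, which removes leading and trailing
# braces just like the map/lambda above.
import pandas as pd

toy = pd.DataFrame({'index_col': [0, 1], 'a': ['{x}', '{y}'], 'b': ['{1}', '{2}']})
toy_melt = pd.melt(toy, id_vars=['index_col'])
toy_melt['value'] = toy_melt['value'].str.strip('{}')
print(toy_melt)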
コード例 #39
0
zip_ref.close()
'''
Process data
'''
# read in csv file as Dataframe
df = pd.read_csv(raw_data_file_unzipped + '/SE4ALLData.csv')

# subset for renewable energy consumption data
df_subset = df[df['Indicator Name'].str.contains(
    'Renewable energy consumption')]

# convert table from wide form (each year is a column) to long form (a single column of years and a single column of values)
year_list = [str(year) for year in range(1990, 2017)]  # year columns to unpivot (1990-2016)
df_long = pd.melt(df_subset,
                  id_vars=['Country Name', 'Country Code'],
                  value_vars=year_list,
                  var_name='year',
                  value_name='renewable energy consumption')

#convert year column from object to integer
df_long.year = df_long.year.astype('int64')

#save processed dataset to csv
processed_data_file = data_dir + dataset_name + '_edit.csv'
df_long.to_csv(processed_data_file, index=False)
'''
Upload processed data to Carto
'''
print('Uploading processed data to Carto.')
#set up carto authentication using local variables for username (CARTO_WRI_RW_USER) and API key (CARTO_WRI_RW_KEY)
auth_client = APIKeyAuthClient(api_key=os.getenv('CARTO_WRI_RW_KEY'),
コード例 #40
0
               aggfunc='sum')

# In[27]:

table1 = pd.pivot_table(table,
                        index='cust_id',
                        columns='type',
                        values='Monetary',
                        fill_value=0,
                        aggfunc=np.sum).reset_index()  # reset the index so cust_id becomes a regular column again

# In[28]:

pd.melt(table1,
        id_vars='cust_id',
        value_vars=['Normal', 'Special_offer'],
        value_name='Monetary',
        var_name='TYPE')
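
# A minimal sketch with made-up data, not the original `table`: pivot_table spreads
# 'type' into columns and pd.melt folds those columns back into rows, which is the
# round trip performed above.
import pandas as pd

toy = pd.DataFrame({
    'cust_id': [1, 1, 2, 2],
    'type': ['Normal', 'Special_offer', 'Normal', 'Special_offer'],
    'Monetary': [10, 5, 20, 0],
})
wide = pd.pivot_table(toy, index='cust_id', columns='type',
                      values='Monetary', fill_value=0,
                      aggfunc='sum').reset_index()
long_again = pd.melt(wide, id_vars='cust_id',
                     value_vars=['Normal', 'Special_offer'],
                     value_name='Monetary', var_name='TYPE')
print(long_again)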

# ### 5.1.8 Assignment and conditional assignment

# #### 1. Assignment

# In[29]:

sample = pd.DataFrame({
    'name': ['Bob', 'Lindy', 'Mark', 'Miki', 'Sully', 'Rose'],
    'score': [99, 78, 999, 77, 77, np.nan],
    'group': [1, 1, 1, 2, 1, 2],
})
コード例 #41
0
            min_permuted_scores[(tissue, subset, 'permuted', 'min')].append(
                lasso_perm.score(X_final_test_sub, y_test_sub))

real = pd.DataFrame.from_dict(scores)
real = real.T.reset_index()

permuted = pd.DataFrame.from_dict(permuted_scores)
permuted = permuted.T.reset_index()

min_real = pd.DataFrame.from_dict(min_scores)
min_real = min_real.T.reset_index()

min_permuted = pd.DataFrame.from_dict(min_permuted_scores)
min_permuted = min_permuted.T.reset_index()

everything = pd.concat([real, permuted, min_real, min_permuted])
everything = everything.rename(
    columns={
        'level_0': 'tissue',
        'level_1': 'training_set_size',
        'level_2': 'type',
        'level_3': 'test_set_size'
    })

everything = pd.melt(
    everything,
    id_vars=['tissue', 'training_set_size', 'type', 'test_set_size'],
    value_vars=[0, 1, 2])

with open('test_results', 'wb') as f:
    pickle.dump(everything, f)
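
# A minimal sketch with made-up scores and hypothetical key names: why the rename
# and melt above work. A dict keyed by tuples becomes MultiIndex columns, .T moves
# that MultiIndex onto the rows, reset_index() exposes the key parts as
# level_0..level_3, and melting over the replicate columns 0..2 gives one row per score.
import pandas as pd

toy_scores = {
    ('liver', 50, 'real', 'test'): [0.71, 0.69, 0.73],
    ('liver', 50, 'permuted', 'test'): [0.02, 0.01, 0.03],
}
toy = pd.DataFrame.from_dict(toy_scores).T.reset_index()
toy = toy.rename(columns={'level_0': 'tissue', 'level_1': 'training_set_size',
                          'level_2': 'type', 'level_3': 'test_set_size'})
toy_long = pd.melt(toy,
                   id_vars=['tissue', 'training_set_size', 'type', 'test_set_size'],
                   value_vars=[0, 1, 2])
print(toy_long)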
コード例 #42
0
ファイル: EIA_MECS.py プロジェクト: modelearth/flowsa
def eia_mecs_energy_call(**kwargs):
    """
    Convert response for calling url to pandas dataframe, begin parsing df into FBA format
    :param kwargs: potential arguments include:
                   url: string, url
                   response_load: df, response from url call
                   args: dictionary, arguments specified when running
                   flowbyactivity.py ('year' and 'source')
    :return: pandas dataframe of original source data
    """
    # load arguments necessary for function
    response_load = kwargs['r']
    args = kwargs['args']

    ## load .yaml file containing information about each energy table
    ## (the .yaml includes information such as column names, units, and which rows to grab)
    filename = 'EIA_MECS_energy tables'
    sourcefile = datapath + filename + '.yaml'
    with open(sourcefile, 'r') as f:
        table_dict = yaml.safe_load(f)

    ## read raw data into dataframe
    ## (include both Sheet 1 (data) and Sheet 2 (relative standard errors))
    df_raw_data = pd.read_excel(io.BytesIO(response_load.content),
                                sheet_name=0,
                                header=None)
    df_raw_rse = pd.read_excel(io.BytesIO(response_load.content),
                               sheet_name=1,
                               header=None)

    ## retrieve table name from cell A3 of Excel file
    table = df_raw_data.iloc[2][0]
    # drop the table description (retain only table name)
    table = table.split('    ')[0]

    ## for each of the census regions...
    ## - grab the appropriate rows and columns
    ## - add column names
    ## - "unpivot" dataframe from wide format to long format
    ## - add columns denoting census region, relative standard error, units
    ## - concatenate census region data into master dataframe
    df_data = pd.DataFrame()
    for region in table_dict[args['year']][table]['regions']:

        ## grab relevant columns
        ## (this is a necessary step because code was retaining some seemingly blank columns)
        # determine number of columns in table, based on number of column names
        num_cols = len(table_dict[args['year']][table]['col_names'])
        # keep only relevant columns
        df_raw_data = df_raw_data.iloc[:, 0:num_cols]
        df_raw_rse = df_raw_rse.iloc[:, 0:num_cols]

        ## grab relevant rows
        # get indices for relevant rows
        grab_rows = table_dict[args['year']][table]['regions'][region]
        grab_rows_rse = table_dict[args['year']][table]['rse_regions'][region]
        # keep only relevant rows
        df_data_region = pd.DataFrame(
            df_raw_data.loc[grab_rows[0] - 1:grab_rows[1] - 1]).reindex()
        df_rse_region = pd.DataFrame(
            df_raw_rse.loc[grab_rows_rse[0] - 1:grab_rows_rse[1] -
                           1]).reindex()

        # assign column names
        df_data_region.columns = table_dict[args['year']][table]['col_names']
        df_rse_region.columns = table_dict[args['year']][table]['col_names']

        # "unpivot" dataframe from wide format to long format
        # ('NAICS code' and 'Subsector and Industry' are identifier variables)
        # (all other columns are value variables)
        df_data_region = pd.melt(
            df_data_region,
            id_vars=table_dict[args['year']][table]['col_names'][0:2],
            value_vars=table_dict[args['year']][table]['col_names'][2:],
            var_name='FlowName',
            value_name='FlowAmount')
        df_rse_region = pd.melt(
            df_rse_region,
            id_vars=table_dict[args['year']][table]['col_names'][0:2],
            value_vars=table_dict[args['year']][table]['col_names'][2:],
            var_name='FlowName',
            value_name='Spread')

        # add census region
        df_data_region['Location'] = region

        # add relative standard error data
        df_data_region = pd.merge(df_data_region, df_rse_region)

        ## add units
        # if table name ends in 1, units must be extracted from flow names
        if table[-1] == '1':
            flow_name_array = df_data_region['FlowName'].str.split(r'\s+\|+\s')
            df_data_region['FlowName'] = flow_name_array.str[0]
            df_data_region['Unit'] = flow_name_array.str[1]
        # if table name ends in 2, units are 'trillion Btu'
        elif table[-1] == '2':
            df_data_region['Unit'] = 'Trillion Btu'
            df_data_region['FlowName'] = df_data_region['FlowName']

        data_type = table_dict[args['year']][table]['data_type']
        if data_type == 'nonfuel consumption':
            df_data_region['Class'] = 'Other'
        elif data_type == 'fuel consumption':
            df_data_region['Class'] = 'Energy'
        # remove extra spaces before 'Subsector and Industry' descriptions
        df_data_region['Subsector and Industry'] = \
            df_data_region['Subsector and Industry'].str.lstrip(' ')

        # concatenate census region data with master dataframe
        df_data = pd.concat([df_data, df_data_region])

    return df_data
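
# A minimal sketch with made-up numbers, not an actual MECS table: melting the data
# and RSE tables with identical id_vars/var_name and then merging on the shared
# columns lines each FlowAmount up with its Spread, which is the pattern used above.
import pandas as pd

toy_data = pd.DataFrame({'NAICS code': ['311', '312'],
                         'Subsector and Industry': ['Food', 'Beverage'],
                         'Electricity': [10.0, 20.0],
                         'Natural Gas': [5.0, 8.0]})
toy_rse = pd.DataFrame({'NAICS code': ['311', '312'],
                        'Subsector and Industry': ['Food', 'Beverage'],
                        'Electricity': [1.2, 0.9],
                        'Natural Gas': [2.1, 1.5]})
data_long = pd.melt(toy_data, id_vars=['NAICS code', 'Subsector and Industry'],
                    var_name='FlowName', value_name='FlowAmount')
rse_long = pd.melt(toy_rse, id_vars=['NAICS code', 'Subsector and Industry'],
                   var_name='FlowName', value_name='Spread')
print(pd.merge(data_long, rse_long))  # joins on NAICS code, Subsector and Industry, FlowName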
コード例 #43
0
#continent/regional aggregates
agg_list = ['ARB', 'CSS', 'EAS', 'EMU', 'LCN', 'MEA', 'PSS', 'SAS', 'SSF']
agg = fs[fs.country_code.isin(agg_list)]

#individual countries - remove country names that represent aggregates
agg_country_code_list = [
    'ARB', 'CSS', 'EAS', 'EAP', 'CEA', 'EMU', 'ECS', 'ECA', 'CEU', 'EUU',
    'HPC', 'HIC', 'NOC', 'OEC', 'LCN', 'LAC', 'CLA', 'LDC', 'LMY', 'LIC',
    'LMC', 'MEA', 'MNA', 'CME', 'MIC', 'NAC', 'OED', 'OSS', 'PSS', 'SST',
    'SAS', 'CSA', 'SSF', 'SSA', 'CAA', 'UMC', 'WLD'
]
fs = fs[~fs.country_code.isin(agg_country_code_list)]

#reshape to put years in rows instead of columns
fs = pd.melt(fs,
             id_vars=['country', 'country_code', 'indicator'],
             var_name='year')
agg = pd.melt(agg,
              id_vars=['country', 'country_code', 'indicator'],
              var_name='year')
world = pd.melt(world,
                id_vars=['country', 'country_code', 'indicator'],
                var_name='year')

#reshape again to put indicators in columns instead of rows & save results
fs = pd.pivot_table(fs,
                    values='value',
                    index=['country', 'country_code', 'year'],
                    columns=['indicator'])
agg = pd.pivot_table(agg,
                     values='value',
コード例 #44
0
 cints = ions["CALIBRATED_INTENSITY"][full_anchor_ions]
 cints_swim = cints[:, :9]
 cints_udmse = cints[:, 9:]
 cints_swim_cv = scipy.stats.variation(cints_swim, axis=1)
 cints_udmse_cv = scipy.stats.variation(cints_udmse, axis=1)
 results = scipy.stats.ttest_rel(cints_swim_cv, cints_udmse_cv)
 log.printMessage(
     "SWIM/UDMSE median cvs (ttest: {}, pval: {}): {} {}".format(
         results[0],
         results[1],
         np.median(cints_swim_cv),
         np.median(cints_udmse_cv),
     ))
 d = pd.melt(
     pd.DataFrame(np.stack([
         cints_swim_cv,
         cints_udmse_cv,
     ]).T,
                  columns=["SWIM-DIA", "HDMSE"]), )
 d["Y"] = 1
 d["Acquisition"] = d["variable"]
 tmp = sns.violinplot(x='value',
                      y='Y',
                      hue='Acquisition',
                      split=True,
                      data=d,
                      inner="quartile",
                      gridsize=1000,
                      orient="h")
 tmp = plt.ylabel("Relative Frequency")
 tmp = plt.xlabel("CV Of Fully Reproducible Aggregates")
 tmp = plt.yticks([])
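
# A minimal sketch with random numbers, not the real CV values: stacking two
# equal-length vectors column-wise, naming the columns, and melting yields the
# 'variable'/'value' layout that the split violin above consumes.
import numpy as np
import pandas as pd

toy_a = np.random.rand(100)
toy_b = np.random.rand(100)
toy = pd.melt(pd.DataFrame(np.stack([toy_a, toy_b]).T,
                           columns=["SWIM-DIA", "HDMSE"]))
toy["Y"] = 1  # constant dummy y so both distributions share a single violin row
print(toy.groupby("variable")["value"].median())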
コード例 #45
0
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Plot stat vs legendary
    data = pd.read_csv("Pokemon.csv")
    stat = data.drop(['#', 'Type 1', 'Type 2', 'Total', 'Generation', 'Name'],
                     axis=1)
    stat = pd.melt(stat, id_vars=['Legendary'], var_name="stat")

    plt.figure()
    sns.swarmplot(
        x="stat", y="value", data=stat,
        hue="Legendary").get_figure().savefig("Results//Stat_vs_Le.png")

    # Plot type vs legendary
    stat1 = data[['Type 1', 'Legendary']]
    stat2 = data[['Type 2', 'Legendary']]
    typ = data['Type 2'].unique()
    dic_type = {typ[i]: i for i in range(19)}
    val = [[typ[i], 0, 0, 0] for i in range(19)]

    for i in range(800):
        val[dic_type[stat1.values[i][0]]][3] += 1
        if stat1.values[i][1] == True:
            val[dic_type[stat1.values[i][0]]][1] += 1
            val[dic_type[stat2.values[i][0]]][2] += 1

    df = pd.DataFrame(val, columns=['Type Name', 'Type 1', 'Type 2', 'Total'])
コード例 #46
0
# add any new lines to wmata.rail_lines

# wmata.rail_stations #########################################################
# wmata.rail_lines_served #####################################################
rail_stations = requests.get('http://api.wmata.com/Rail.svc/json/jStations',
                             headers=headers)
rail_stations = return_data(rail_stations, 'Stations')

rail_stations = rail_stations.loc[:, [
    'Code', 'Name', 'Lat', 'Lon', 'LineCode1', 'LineCode2', 'LineCode3',
    'LineCode4'
]]

rail_lines_served = pandas.melt(
    rail_stations,
    id_vars=['Code', 'Name', 'Lat', 'Lon'],
    value_vars=['LineCode1', 'LineCode2', 'LineCode3', 'LineCode4'],
    var_name='Split',
    value_name='LineCode').loc[:, ['Code', 'LineCode']]
rail_lines_served = rail_lines_served[rail_lines_served['LineCode'].notnull()]

rail_stations = rail_stations.loc[:, ['Code', 'Name', 'Lat', 'Lon']]

# rail stations
# uppercase the station name
# map the MAR ID
# for unknown records, add to unknown locations table
# for records where no match on wmata_station_code, station, and mar_id, add
# map back the station ids
# update rail_stations_operational
# expire stations where no record in new table, record in operational
# extend expiration date where record in both
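
# A minimal sketch with made-up stations, not live WMATA data: melting the four
# LineCode columns produces one row per (station, line) pair, and filtering out
# null LineCode values drops the unused slots, as done above.
import numpy as np
import pandas

toy = pandas.DataFrame({
    'Code': ['A01', 'B02'],
    'Name': ['Station One', 'Station Two'],
    'Lat': [38.90, 38.89],
    'Lon': [-77.03, -77.02],
    'LineCode1': ['RD', 'RD'],
    'LineCode2': ['BL', np.nan],
    'LineCode3': [np.nan, np.nan],
    'LineCode4': [np.nan, np.nan],
})
toy_lines = pandas.melt(toy, id_vars=['Code', 'Name', 'Lat', 'Lon'],
                        value_vars=['LineCode1', 'LineCode2', 'LineCode3', 'LineCode4'],
                        var_name='Split', value_name='LineCode').loc[:, ['Code', 'LineCode']]
toy_lines = toy_lines[toy_lines['LineCode'].notnull()]
print(toy_lines)  # 'A01' appears twice (RD, BL), 'B02' once (RD)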
コード例 #47
0
        diff_gene_expression_df.append(sig_dict)
        p_val_list.append(p_val)

_, p_val_corrected = fdrcorrection(p_val_list)


diff_gene_expression_df = pd.DataFrame(diff_gene_expression_df)
diff_gene_expression_df['p_val'] = p_val_corrected
diff_gene_expression_df['sig_level'] = diff_gene_expression_df['p_val'].apply(
    lambda x: man_utils.pval_to_sig(x))
diff_gene_expression_df = diff_gene_expression_df.loc[diff_gene_expression_df.sig_level != 'n.s.', ]
gene_sig_grouped = diff_gene_expression_df.groupby('gene')


diff_param_data_inh = pd.melt(hof_param_data_inh_lines, id_vars=['Cell_id', 'Cre_line'],
                              value_vars=significant_parameters, var_name='conductance',
                              value_name='value')


inh_expr_df = pd.melt(inh_expression_data, id_vars=['sample_id', 'Cre_line'],
                      value_vars=gene_types, var_name='gene', value_name='cpm')
hue_levels = inh_lines

tick_fontsize = 16
axis_fontsize = 16
sns.set(style='whitegrid')

for channel_, genes in channel_correlate_dict.items():
    cond_ = 'gbar_%s.somatic' % channel_
    if cond_ not in significant_parameters:
        continue
コード例 #48
0
def load_jhu_us_time_series(branch="master"):
    """
    Load the JHU US time series data and transform it into the shape we need.
    """
    cases = pd.read_csv(CASES_URL.format(branch))
    deaths = pd.read_csv(DEATHS_URL.format(branch))
    lookup_table = pd.read_csv(LOOKUP_TABLE_URL.format(branch))

    keep_lookup_cols = ["UID", "Population"]
    lookup_table = lookup_table[keep_lookup_cols]

    # melt cases
    id_vars, dates = parse_columns(cases)
    cases_df = pd.melt(
        cases, id_vars=id_vars, value_vars=dates, value_name="cases", var_name="date",
    )

    # melt deaths
    id_vars, dates = parse_columns(deaths)
    deaths_df = pd.melt(
        deaths, id_vars=id_vars, value_vars=dates, value_name="deaths", var_name="date",
    )

    # join
    merge_cols = [
        "UID",
        "iso2",
        "iso3",
        "code3",
        "FIPS",
        "Admin2",
        "Province_State",
        "Country_Region",
        "Lat",
        "Long_",
        "date",
    ]
    m1 = pd.merge(cases_df, deaths_df, on=merge_cols, how="left")

    df = pd.merge(m1.drop(columns="Population"), lookup_table, on="UID", how="left")

    keep_cols = [
        "Province_State",
        "Admin2",
        "FIPS",
        "Lat",
        "Long_",
        "date",
        "cases",
        "deaths",
        "Population",
    ]

    df = (
        df[keep_cols]
        .assign(
            date=pd.to_datetime(df.date)
            .dt.tz_localize("US/Pacific")
            .dt.normalize()
            .dt.tz_convert("UTC"),
        )
        .rename(
            columns={
                "FIPS": "fips",
                "Long_": "Lon",
                "Province_State": "state",
                "Admin2": "county",
            }
        )
    )

    # Fix fips
    df = df.pipe(coerce_fips_integer)
    df["fips"] = df.fips.astype(str)
    df["fips"] = df.apply(correct_county_fips, axis=1)
    for col in ["state", "county", "fips"]:
        df[col] = df[col].fillna("")

    return df.sort_values(sort_cols).reset_index(drop=True)
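
# A minimal sketch with a made-up frame; toy_parse_columns is a hypothetical stand-in
# for the parse_columns helper assumed above. It shows the wide-to-long step applied
# to the cases (and, identically, the deaths) table before the two are merged.
import pandas as pd

def toy_parse_columns(df):
    """Hypothetical helper: treat columns containing '/' as date columns."""
    dates = [c for c in df.columns if "/" in c]
    id_vars = [c for c in df.columns if c not in dates]
    return id_vars, dates

toy_cases = pd.DataFrame({
    "UID": [84006037, 84006059],
    "Province_State": ["California", "California"],
    "1/22/20": [0, 0],
    "1/23/20": [1, 0],
})
id_vars, dates = toy_parse_columns(toy_cases)
toy_long = pd.melt(
    toy_cases, id_vars=id_vars, value_vars=dates, value_name="cases", var_name="date",
)
print(toy_long)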
コード例 #49
0
# Montreal geojson
with open(DATA_PATH.joinpath('montreal_shapefile.geojson'),
          encoding='utf-8') as shapefile:
    mtl_geojson = json.load(shapefile)

# Montreal cases per borough
# cases = pd.read_csv(DATA_PATH.joinpath('cases.csv'), encoding='utf-8', na_values='na').dropna(axis=1, how='all')
# borough_tbc = cases[-1:]  # Nb. of cases with borough TBC
# cases_df = cases[:-1]  # Nb. of cases with known borough
# cases_long = pd.melt(cases_df, id_vars='borough',
#                     var_name='date', value_name='cases')
cases_per1000_df = pd.read_csv(DATA_PATH.joinpath('cases_per1000.csv'),
                               encoding='utf-8',
                               na_values='na').dropna(axis=1, how='all')
cases_per1000_long = pd.melt(reduce_cols(cases_per1000_df, 10),
                             id_vars='borough',
                             var_name='date',
                             value_name='cases_per_1000')

# Montreal data
data_mtl = pd.read_csv(DATA_PATH.joinpath('data_mtl.csv'),
                       encoding='utf-8',
                       na_values='na')

# QC data
data_qc = pd.read_csv(DATA_PATH.joinpath('data_qc.csv'),
                      encoding='utf-8',
                      na_values='na')

# Last update date
# Display 1 day after the latest data as data from the previous day are posted
latest_mtl_date = datetime.date.fromisoformat(
コード例 #50
0
ファイル: features_2010_2018.py プロジェクト: stelagirt/ffp
 big_frame = pd.concat(dfs, ignore_index=True)
 big_frame_1 = pd.DataFrame(
     big_frame[fields[0]].loc[big_frame['fire'] == 1],
     columns=[fields[0]
              ]).assign(year=key).rename(columns={feature: "fire"})
 big_frame_0 = pd.DataFrame(
     big_frame[fields[0]].loc[big_frame['fire'] == 0],
     columns=[fields[0]
              ]).assign(year=key).rename(columns={feature: "non_fire"})
 del big_frame
 new_list.append(big_frame_1)
 new_list.append(big_frame_0)
 del dfs
 cdf = pd.concat(new_list, ignore_index=True)
 mdf = pd.melt(cdf, id_vars=['year'], var_name=['fire']).dropna()
 #	bxpstats.extend(cbook.boxplot_stats(np.ravel(mdf), labels=key))
 #	del cdf
 #	fig, axes = pyplot.subplots()
 if i == 0:
     min_lim = mdf.value.min()
     max_lim = mdf.value.max()
     sp = (max_lim - min_lim) / 10
     ax = sns.boxplot(ax=axes[i],
                      x="year",
                      y="value",
                      hue="fire",
                      palette=['red', 'green'],
                      data=mdf)
     #ax.set_ylim([int(min_lim), int(max_lim)])
コード例 #51
0
import pandas as pd
data1 = {'Student':['Ice Bear','Panda','Grizzly'],
        'Math':[80,95,79]}
grades1=pd.DataFrame(data1,columns=['Student','Math'])
data2 = {'Student':['Ice Bear','Panda','Grizzly'],
        'Electronics':[85,81,83]}
grades2=pd.DataFrame(data2,columns=['Student','Electronics'])
data3 = {'Student':['Ice Bear','Panda','Grizzly'],
        'GEAS':[90,79,93]}
grades3=pd.DataFrame(data3,columns=['Student','GEAS'])
data4 = {'Student':['Ice Bear','Panda','Grizzly'],
        'ESAT':[93,89,88]}
grades4=pd.DataFrame(data4,columns=['Student','ESAT'])

merge=pd.merge(grades1,grades2,how='right',on='Student')
merge1=pd.merge(merge,grades3,how='right',on='Student')
mergefinal=pd.merge(merge1,grades4,how='right',on='Student')
mergelong=pd.melt(mergefinal,id_vars = 'Student', 
                  var_name = 'Subject', 
                  value_name='Grades')
コード例 #52
0
from bokeh.io import output_file, show
import microtubule_pkg as mt

output_file("interactive_fig1.html")

rg = np.random.default_rng(1284)

lbl_df = pd.read_csv('gardner_time_to_catastrophe_dic_tidy.csv')

labeled = lbl_df.loc[lbl_df["labeled"] == True, "time to catastrophe (s)"].values
unlabeled = lbl_df.loc[lbl_df["labeled"] == False, "time to catastrophe (s)"].values

# Make plots for tubulin concentration data
# taken from HW9.1
df = pd.read_csv('gardner_mt_catastrophe_only_tubulin.csv',comment='#')
df = pd.melt(df, value_vars = ['12 uM', '7 uM', '9 uM', '10 uM', '14 uM'], var_name = 'tubulin concentrations',
        value_name = 'time to catastrophe (s)')
df = df.dropna()
concen = ['12 uM', '7 uM', '9 uM', '10 uM', '14 uM']

def tub_stripbox(conc):
    return iqplot.stripbox(
        title = 'Microtubule Time to Catastrophe against Tubulin Concentration',
        data = df.loc[df['tubulin concentrations'] == conc],
        q = 'time to catastrophe (s)',
        #color_column='year',
        q_axis='x',
        jitter=True,
        whisker_caps=True,
        display_points=False,
        marker_kwargs=dict(alpha=0.5, size=1),
        box_kwargs=dict(fill_color=None, line_color='grey'),
コード例 #53
0
    size_scale = 500
    ax.scatter(
        x=x.map(x_to_num),  # Use mapping for x
        y=y.map(y_to_num),  # Use mapping for y
        s=size *
        size_scale,  # Vector of square sizes, proportional to size parameter
        marker='s'  # Use square as scatterplot marker
    )

    # Show column labels on the axes
    ax.set_xticks([x_to_num[v] for v in x_labels])
    ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
    ax.set_yticks([y_to_num[v] for v in y_labels])
    ax.set_yticklabels(y_labels)

    fig.show()


data = pd.read_csv(
    'https://raw.githubusercontent.com/drazenz/heatmap/master/autos.clean.csv')
columns = [
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'city-mpg', 'price'
]
corr = data[columns].corr()
corr = pd.melt(
    corr.reset_index(), id_vars='index'
)  # Unpivot the dataframe, so we can get pair of arrays for x and y
corr.columns = ['x', 'y', 'value']
heatmap(x=corr['x'], y=corr['y'], size=corr['value'].abs())
コード例 #54
0
    steps = 10
    adj = [(i + 1) * 0.01 / steps for i in range(steps)]

    myDF = myModel.makeSensAnalDF(
        {col: adj
         for col in myDF.columns if col != "RSS"}, myDF)

    myRSS = myModel.getRSSforParamiters(myDF, RS["indep_cond"], myPaths)

    myDF['RSS'] = myRSS

    myDF2 = myDF.copy()
    tempDS = myDF.iloc[0]
    myDF2 = (myDF2 - tempDS) / tempDS
    myDF2 = pd.melt(myDF2, id_vars=['RSS'])
    myDF2 = myDF2[myDF2["value"] != 0.0]
    myDF2 = myDF2[pd.isnull(myDF2["value"]) == False]
    myDF2 = myDF2.sort_values(by=['RSS'])
    myDF2 = myDF2.rename(columns={
        "value": "paramiter change%",
        "RSS": "RSS change%"
    })
    myDF["index"] = index
    myDF2["index"] = index
    myList1.append(myDF)
    myList2.append(myDF2)

myDF = pd.concat(myList1)
myDF2 = pd.concat(myList2)
myDF.to_csv(os.path.join(data_dir, "sensativity.csv"))
コード例 #55
0
ファイル: climograph.py プロジェクト: peter-shoes/PHEV_DB
def newcsv(set1, data_dict, set2=None):
    df = pd.DataFrame(data=data_dict)
    melt = pd.melt(df, id_vars=list(data_dict.keys())[0], var_name='Period', value_name='Average Temperature')
    print(melt)
    return(melt)
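
# A minimal usage sketch with a made-up dictionary: newcsv treats the first key of
# data_dict as the identifier column and melts every other key into a
# 'Period' / 'Average Temperature' pair. The same step, written out directly:
import pandas as pd

toy_dict = {
    'Month': ['Jan', 'Feb'],
    '1981-2010': [-2.0, -1.0],
    '1991-2020': [-1.5, -0.5],
}
toy_df = pd.DataFrame(data=toy_dict)
toy_melt = pd.melt(toy_df, id_vars=list(toy_dict.keys())[0],
                   var_name='Period', value_name='Average Temperature')
print(toy_melt)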
コード例 #56
0
        list(hidr.set_index('UH_nome').index),
    )

    data = hidr.set_index('UH_nome').loc[powerplant]

    left_column, right_column = st.beta_columns([1, 3])

    table_vis = data.filter(
        like='vol/volutilmax_itr',
        axis=1).T.reset_index(drop=True).reset_index().set_index('index')

    left_column.write(table_vis)

    data = data.filter(like='vol/volutilmax_itr',
                       axis=1).T.reset_index(drop=True).reset_index()
    data = pd.melt(data, id_vars=["index"])
    data.columns = ['iteration', 'UH_nome', 'vol/volutilmax']
    st.write('')

    chart = (
        alt.Chart(data).mark_area(opacity=0.2)
        # .mark_line()
        .encode(
            x="iteration:N",
            y=alt.Y("vol/volutilmax:Q", stack=None),
            color="UH_nome:N",
        ))
    right_column.altair_chart(chart, use_container_width=True)

    st.subheader('After seek goal')
    st.write('\n')
コード例 #57
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 30 16:55:56 2019

@author: root
"""


import pandas as pedo
#setting libraries
dmath={'Student':['Ice bear','Panda','Grizzly'], 'Math':[80,95,79]}
delectronics={'Student':['Ice bear','Panda','Grizzly'], 'Electronics':[85,81,83]}
dgeas={'Student':['Ice bear','Panda','Grizzly'], 'GEAS':[90,79,93]}
desat={'Student':['Ice bear','Panda','Grizzly'], 'ESAT':[93,89,88]}
# build a DataFrame for each subject
math=pedo.DataFrame(dmath)
elecs=pedo.DataFrame(delectronics)
geas=pedo.DataFrame(dgeas)
esat=pedo.DataFrame(desat)
#merge
grades = pedo.merge(pedo.merge(pedo.merge(math,elecs),geas),esat)
# reshape from wide to long
messy = pedo.melt(grades, id_vars=['Student'], value_vars=['Math','Electronics','GEAS','ESAT']).rename(columns={'variable' : 'Subject', 'value' : 'Grades'})
コード例 #58
0
def analyze_color(rgb_img, mask, hist_plot_type=None, label="default"):
    """Analyze the color properties of an image object
    Inputs:
    rgb_img          = RGB image data
    mask             = Binary mask made from selected contours
    hist_plot_type   = None, 'all', 'rgb','lab' or 'hsv'
    label            = optional label parameter, modifies the variable name of observations recorded

    Returns:
    analysis_image   = histogram output

    :param rgb_img: numpy.ndarray
    :param mask: numpy.ndarray
    :param hist_plot_type: str
    :param label: str
    :return analysis_images: list
    """
    if len(np.shape(rgb_img)) < 3:
        fatal_error("rgb_img must be an RGB image")

    # Mask the input image
    masked = cv2.bitwise_and(rgb_img, rgb_img, mask=mask)
    # Extract the blue, green, and red channels
    b, g, r = cv2.split(masked)
    # Convert the BGR image to LAB
    lab = cv2.cvtColor(masked, cv2.COLOR_BGR2LAB)
    # Extract the lightness, green-magenta, and blue-yellow channels
    l, m, y = cv2.split(lab)
    # Convert the BGR image to HSV
    hsv = cv2.cvtColor(masked, cv2.COLOR_BGR2HSV)
    # Extract the hue, saturation, and value channels
    h, s, v = cv2.split(hsv)

    # Color channel dictionary
    channels = {
        "b": b,
        "g": g,
        "r": r,
        "l": l,
        "m": m,
        "y": y,
        "h": h,
        "s": s,
        "v": v
    }

    # Histogram plot types
    hist_types = {
        "ALL": ("b", "g", "r", "l", "m", "y", "h", "s", "v"),
        "RGB": ("b", "g", "r"),
        "LAB": ("l", "m", "y"),
        "HSV": ("h", "s", "v")
    }

    if hist_plot_type is not None and hist_plot_type.upper() not in hist_types:
        fatal_error(
            "The histogram plot type was " + str(hist_plot_type) +
            ', but can only be one of the following: None, "all", "rgb", "lab", or "hsv"!'
        )
    # Store histograms, plotting colors, and plotting labels
    histograms = {
        "b": {
            "label":
            "blue",
            "graph_color":
            "blue",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["b"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "g": {
            "label":
            "green",
            "graph_color":
            "forestgreen",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["g"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "r": {
            "label":
            "red",
            "graph_color":
            "red",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["r"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "l": {
            "label":
            "lightness",
            "graph_color":
            "dimgray",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["l"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "m": {
            "label":
            "green-magenta",
            "graph_color":
            "magenta",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["m"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "y": {
            "label":
            "blue-yellow",
            "graph_color":
            "yellow",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["y"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "h": {
            "label":
            "hue",
            "graph_color":
            "blueviolet",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["h"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "s": {
            "label":
            "saturation",
            "graph_color":
            "cyan",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["s"]], [0], mask,
                                                  [256], [0, 255])
            ]
        },
        "v": {
            "label":
            "value",
            "graph_color":
            "orange",
            "hist": [
                float(i[0]) for i in cv2.calcHist([channels["v"]], [0], mask,
                                                  [256], [0, 255])
            ]
        }
    }

    # Create list of bin labels for 8-bit data
    binval = np.arange(0, 256)

    analysis_image = None
    # Create a dataframe of bin labels and histogram data
    dataset = pd.DataFrame({
        'bins': binval,
        'blue': histograms["b"]["hist"],
        'green': histograms["g"]["hist"],
        'red': histograms["r"]["hist"],
        'lightness': histograms["l"]["hist"],
        'green-magenta': histograms["m"]["hist"],
        'blue-yellow': histograms["y"]["hist"],
        'hue': histograms["h"]["hist"],
        'saturation': histograms["s"]["hist"],
        'value': histograms["v"]["hist"]
    })

    # Make the histogram figure using plotnine
    if hist_plot_type is not None:
        if hist_plot_type.upper() == 'RGB':
            df_rgb = pd.melt(dataset,
                             id_vars=['bins'],
                             value_vars=['blue', 'green', 'red'],
                             var_name='Color Channel',
                             value_name='Pixels')
            hist_fig = (ggplot(
                df_rgb, aes(x='bins', y='Pixels', color='Color Channel')) +
                        geom_line() +
                        scale_x_continuous(breaks=list(range(0, 256, 25))) +
                        scale_color_manual(['blue', 'green', 'red']))

        elif hist_plot_type.upper() == 'LAB':
            df_lab = pd.melt(
                dataset,
                id_vars=['bins'],
                value_vars=['lightness', 'green-magenta', 'blue-yellow'],
                var_name='Color Channel',
                value_name='Pixels')
            hist_fig = (ggplot(
                df_lab, aes(x='bins', y='Pixels', color='Color Channel')) +
                        geom_line() +
                        scale_x_continuous(breaks=list(range(0, 256, 25))) +
                        scale_color_manual(['yellow', 'magenta', 'dimgray']))

        elif hist_plot_type.upper() == 'HSV':
            df_hsv = pd.melt(dataset,
                             id_vars=['bins'],
                             value_vars=['hue', 'saturation', 'value'],
                             var_name='Color Channel',
                             value_name='Pixels')
            hist_fig = (ggplot(
                df_hsv, aes(x='bins', y='Pixels', color='Color Channel')) +
                        geom_line() +
                        scale_x_continuous(breaks=list(range(0, 256, 25))) +
                        scale_color_manual(['blueviolet', 'cyan', 'orange']))

        elif hist_plot_type.upper() == 'ALL':
            s = pd.Series([
                'blue', 'green', 'red', 'lightness', 'green-magenta',
                'blue-yellow', 'hue', 'saturation', 'value'
            ],
                          dtype="category")
            color_channels = [
                'blue', 'yellow', 'green', 'magenta', 'blueviolet', 'dimgray',
                'red', 'cyan', 'orange'
            ]
            df_all = pd.melt(dataset,
                             id_vars=['bins'],
                             value_vars=s,
                             var_name='Color Channel',
                             value_name='Pixels')
            hist_fig = (ggplot(
                df_all, aes(x='bins', y='Pixels', color='Color Channel')) +
                        geom_line() +
                        scale_x_continuous(breaks=list(range(0, 256, 25))) +
                        scale_color_manual(color_channels))
        analysis_image = hist_fig
    # Hue values of zero are red, but zero is also the value assigned to pixels whose hue is
    # undefined (hue is undefined when the color values are saturated), so hue values of 0 are
    # excluded from the calculations below.

    # Calculate the median hue value (median is rescaled from the encoded 0-179 range to the 0-359 degree range)
    hue_median = np.median(h[np.where(h > 0)]) * 2

    # Calculate the circular mean and standard deviation of the encoded hue values
    # The mean and standard-deviation are rescaled from the encoded 0-179 range to the 0-359 degree range
    hue_circular_mean = stats.circmean(h[np.where(h > 0)], high=179, low=0) * 2
    hue_circular_std = stats.circstd(h[np.where(h > 0)], high=179, low=0) * 2

    # Plot or print the histogram
    if hist_plot_type is not None:
        params.device += 1
        if params.debug == 'print':
            hist_fig.save(os.path.join(
                params.debug_outdir,
                str(params.device) + '_analyze_color_hist.png'),
                          verbose=False)
        elif params.debug == 'plot':
            print(hist_fig)

    # Store into global measurements
    # RGB signal values are in an unsigned 8-bit scale of 0-255
    rgb_values = [i for i in range(0, 256)]
    # Hue values are in a 0-359 degree scale, every 2 degrees at the midpoint of the interval
    hue_values = [i * 2 + 1 for i in range(0, 180)]
    # Percentage values on a 0-100 scale (lightness, saturation, and value)
    percent_values = [round((i / 255) * 100, 2) for i in range(0, 256)]
    # Diverging values on a -128 to 127 scale (green-magenta and blue-yellow)
    diverging_values = [i for i in range(-128, 128)]

    if hist_plot_type is not None:
        if hist_plot_type.upper() == 'RGB' or hist_plot_type.upper() == 'ALL':
            outputs.add_observation(sample=label,
                                    variable='blue_frequencies',
                                    trait='blue frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["b"]["hist"],
                                    label=rgb_values)
            outputs.add_observation(sample=label,
                                    variable='green_frequencies',
                                    trait='green frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["g"]["hist"],
                                    label=rgb_values)
            outputs.add_observation(sample=label,
                                    variable='red_frequencies',
                                    trait='red frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["r"]["hist"],
                                    label=rgb_values)

        if hist_plot_type.upper() == 'LAB' or hist_plot_type.upper() == 'ALL':
            outputs.add_observation(sample=label,
                                    variable='lightness_frequencies',
                                    trait='lightness frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["l"]["hist"],
                                    label=percent_values)
            outputs.add_observation(sample=label,
                                    variable='green-magenta_frequencies',
                                    trait='green-magenta frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["m"]["hist"],
                                    label=diverging_values)
            outputs.add_observation(sample=label,
                                    variable='blue-yellow_frequencies',
                                    trait='blue-yellow frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["y"]["hist"],
                                    label=diverging_values)

        if hist_plot_type.upper() == 'HSV' or hist_plot_type.upper() == 'ALL':
            outputs.add_observation(sample=label,
                                    variable='hue_frequencies',
                                    trait='hue frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["h"]["hist"][0:180],
                                    label=hue_values)
            outputs.add_observation(sample=label,
                                    variable='saturation_frequencies',
                                    trait='saturation frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["s"]["hist"],
                                    label=percent_values)
            outputs.add_observation(sample=label,
                                    variable='value_frequencies',
                                    trait='value frequencies',
                                    method='plantcv.plantcv.analyze_color',
                                    scale='frequency',
                                    datatype=list,
                                    value=histograms["v"]["hist"],
                                    label=percent_values)

    # Always save hue stats
    outputs.add_observation(sample=label,
                            variable='hue_circular_mean',
                            trait='hue circular mean',
                            method='plantcv.plantcv.analyze_color',
                            scale='degrees',
                            datatype=float,
                            value=hue_circular_mean,
                            label='degrees')
    outputs.add_observation(sample=label,
                            variable='hue_circular_std',
                            trait='hue circular standard deviation',
                            method='plantcv.plantcv.analyze_color',
                            scale='degrees',
                            datatype=float,
                            value=hue_circular_std,
                            label='degrees')
    outputs.add_observation(sample=label,
                            variable='hue_median',
                            trait='hue median',
                            method='plantcv.plantcv.analyze_color',
                            scale='degrees',
                            datatype=float,
                            value=hue_median,
                            label='degrees')

    # Store images
    outputs.images.append(analysis_image)

    return analysis_image
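
# A minimal sketch with a tiny fake histogram table, not real image data: every
# plotting branch above follows this shape: melt the per-channel columns against
# 'bins' to get one (bin, channel, pixel count) row per point for plotnine.
import pandas as pd

toy_hist = pd.DataFrame({
    'bins': [0, 1, 2],
    'blue': [10, 5, 0],
    'green': [3, 7, 2],
    'red': [1, 1, 9],
})
toy_long = pd.melt(toy_hist, id_vars=['bins'],
                   value_vars=['blue', 'green', 'red'],
                   var_name='Color Channel', value_name='Pixels')
print(toy_long)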
コード例 #59
0
def load_data(data_path):
    # Subtitles info
    subs_df = pd.read_csv(data_path / 'prep/all_subtitles.csv')

    # IMDB data, only to top 150 movies
    imdb_df = pd.read_csv(data_path / 'prep/imdb_top250_movies.csv')
    imdb_df = imdb_df.loc[imdb_df['top_250_rank'] <= 150]
    movie_info_df = imdb_df.loc[:, [
        'imdb_id', 'title', 'year', 'rating', 'genres', 'top_250_rank',
        'color_info'
    ]]

    # Movie duration dict
    with open(data_path / 'prep/movie_duration_dict.pk', 'rb') as r:
        movie_duration_dict = pickle.load(r)

    # Silences info
    silences_df = pd.read_csv(data_path / 'prep/silences_info.csv')

    silences_df = pd.merge(left=silences_df,
                           right=movie_info_df,
                           on='imdb_id',
                           how='inner')

    silences_df.loc[:, 'total_duration'] = silences_df['imdb_id'].apply(
        lambda x: movie_duration_dict[x])
    silences_df.loc[:, 'pos_rel'] = 100 * silences_df['start'] / silences_df[
        'total_duration']
    silences_df.loc[:, 'dur_rel'] = 100 * silences_df[
        'duration'] / silences_df['total_duration']

    # Movie summary info
    movies_df = pd.read_csv(data_path / 'prep/movies_infos.csv')

    aux_dict = {
        'silence_dur': 'Silence',
        'dialogue_dur': 'Dialogue',
        'other_dur': 'Other sounds'
    }
    cols = movies_df.columns.tolist()

    for k, v in aux_dict.items():
        cols.remove(k)

    movies_melt = pd.melt(movies_df,
                          id_vars=cols,
                          value_vars=list(aux_dict.keys()))

    movies_melt.loc[:, 'var_name'] = movies_melt['variable'].apply(
        lambda x: aux_dict[x])

    # Positions info
    positions_df = data_sound_type_share_by__position(subs_df, silences_df,
                                                      movie_info_df, 1)

    # Umap data
    umap_df = pd.read_csv(data_path / 'prep/umap_df.csv')

    return {
        'subs_df': subs_df,
        'silences_df': silences_df,
        'movies_df': movies_df,
        'movies_melt_df': movies_melt,
        'positions_df': positions_df,
        'umap_df': umap_df
    }
コード例 #60
0
# Import data

data = pd.read_excel(
    r"D:\OneDrive\Documentos OK\Python Scripts\WIOD_SEA_Nov16 (2).xlsx",
    sheet_name='DATA')
df = pd.DataFrame(data)

# Filter to Brazil

df_bra = df[df['country'] == 'BRA']

# Melt the year columns into a single 'year' column

df_bra_melt = pd.melt(df_bra,
                      id_vars=["country", "variable", "description", "code"],
                      var_name='year')

# Pivot the 'variable' values into columns (pivot_table)

df_bra_pivot = pd.pivot_table(df_bra_melt,
                              index=["country", "description", "code", "year"],
                              columns="variable",
                              values="value")

df_bra_pivot = df_bra_pivot.reset_index()

# Drop columns that are not needed

df_Bra = df_bra_pivot.drop(
    ['country', 'COMP', 'EMP', 'EMPE', 'GO_PI', 'II_PI', 'VA_PI', 'VA_QI'],