Beispiel #1
0
def create_task():
    """Summarise the posted order data per customer and return it as JSON.

    The request's JSON payload is passed through ``clean_df``. Billing names
    are upper-cased into a ``name`` column, rows are grouped by that column,
    and every group is reduced to its first phone number, e-mail address and
    address plus the number of non-null names in the group.

    Returns:
        The summary frame serialised with ``to_json(orient='records')``.
    """
    orders = clean_df(request.json)

    # Upper-casing the name first means grouping ignores letter case.
    named_orders = orders >> mutate(name=X.FullNameBilling.str.upper())

    per_customer = named_orders >> group_by(X.name) >> summarize(
        contact=X.PhoneBilling.head(1),
        email=X.EmailBilling.head(1),
        address=X.Address2Billing.head(1),
        num_items_purchased=X.name.count(),
    )
    return per_customer.to_json(orient='records')
Beispiel #2
0
def create_task2():
    """Return, as JSON, one ``max1`` value per supplier.

    The request's JSON payload is cleaned with ``clean_df``, every row gets a
    ``supplier`` derived from its ``Category`` via ``supp``, and each supplier
    group is reduced by applying ``most_common`` to its ``Name`` column.
    The category column and the summary are printed along the way.

    Returns:
        The per-supplier summary serialised with ``to_json(orient='records')``.
    """
    frame = clean_df(request.json)
    print(frame["Category"])

    # Map each category to its supplier, one value at a time.
    frame['supplier'] = frame['Category'].apply(
        lambda category: supp(category))

    by_supplier = DplyFrame(frame) >> group_by(X.supplier)
    summary = by_supplier >> summarize(max1=most_common(X.Name))
    print(summary)

    return summary.to_json(orient='records')
Beispiel #3
0
def create_task2():
    """Summarise the posted data per supplier and return the result as JSON.

    Steps: clean the request's JSON payload with ``clean_df``, print its
    ``Category`` column, derive a ``supplier`` column by calling ``supp`` on
    every category, group by supplier, and reduce each group's ``Name``
    column with ``most_common`` into a ``max1`` column. The summary is
    printed before it is serialised.

    Returns:
        The summary serialised with ``to_json(orient='records')``.
    """
    cleaned = clean_df(request.json)
    print(cleaned["Category"])

    categories = cleaned['Category']
    cleaned['supplier'] = categories.apply(lambda cat: supp(cat))

    per_supplier = (
        DplyFrame(cleaned)
        >> group_by(X.supplier)
        >> summarize(max1=most_common(X.Name))
    )
    print(per_supplier)

    return per_supplier.to_json(orient='records')
Beispiel #4
0
def create_task():
    """Return a JSON document with one record per billing customer.

    The request's JSON payload is cleaned with ``clean_df``. A ``name``
    column holding the upper-cased ``FullNameBilling`` is added, and the rows
    are grouped on it. Each group contributes the first ``PhoneBilling``,
    ``EmailBilling`` and ``Address2Billing`` values and the count of non-null
    names.

    Returns:
        The per-customer frame serialised with ``to_json(orient='records')``.
    """
    cleaned = clean_df(request.json)

    with_name = cleaned >> mutate(name=X.FullNameBilling.str.upper())
    grouped = with_name >> group_by(X.name)

    df_customer = grouped >> summarize(
        contact=X.PhoneBilling.head(1),
        email=X.EmailBilling.head(1),
        address=X.Address2Billing.head(1),
        num_items_purchased=X.name.count(),
    )

    return df_customer.to_json(orient='records')
# R equivalent:
#   data = iris %>%
#     select(Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species)
# dplython: keep only the Species column, then show the leading rows.
(iris
 >> dp.select(X.Species)
 >> dp.head())

# Plain pandas: pick columns by label, or drop a column / a row instead.
iris[['Species', 'PetalLength']]
iris.drop(columns='SepalLength')  # remove that column
iris.drop(index=5)  # remove the row labelled 5 (the sixth row on a default index)

# R equivalent:
#   data = iris %>%
#     filter(Petal.Length>1 & Petal.Length<100)
# dplython: keep the rows whose PetalLength is above 5.
(iris
 >> dp.sift(X.PetalLength > 5))

# Plain pandas: boolean mask for PetalLength strictly between 5 and 6.
iris[iris['PetalLength'].gt(5) & iris['PetalLength'].lt(6)]

# R equivalent:
#   data = iris %>%
#     dplyr::group_by(Species) %>%
#     summarise(media=mean(Petal.Length))
# dplython: mean PetalLength per species, stored in a column named "media".
(iris
 >> dp.group_by(X.Species)
 >> dp.summarize(media=X.PetalLength.mean()))

import math

# Several aggregations of one column at once; the result columns are named
# after the aggregation functions.
iris.groupby(['Species'])['PetalLength'].agg(['mean', 'sum', 'count'])

# Custom result names use named aggregation (keyword = function). Passing a
# renaming dict to a single-column groupby raises SpecificationError
# ("nested renamer is not supported") on pandas 1.0 and later.
iris.groupby(['Species'])['PetalLength'].agg(var1='mean', var2='sum', var3='count')

# Two aggregations under a common top-level label "var1": aggregate the
# one-column frame, then rename the outer level of the resulting column
# MultiIndex.
(iris.groupby(['Species'])[['PetalLength']]
     .agg(['mean', 'sum'])
     .rename(columns={'PetalLength': 'var1'}, level=0))

aggregations = {
    'dsuma': 'sum',
}

# Named aggregation also accepts arbitrary callables.
iris.groupby(['Species'])['PetalLength'].agg(
    dsuma='sum',
    otro=lambda x: math.sqrt(x.mean()) - 1,
)
# R equivalent:
#   data = iris %>%
#     mutate(total=Sepal.Length+Petal.Length,
#            otro=ifelse(Petal.Length>2, "grande", "pequeño"))
# dplython: add rounded copies of two measurement columns.
iris >> dp.mutate(
    redondeado=X.PetalLength.round(),
    redondeado2=X.SepalLength.round(),
)

# Plain pandas: assign() evaluates each callable against the frame.
iris.assign(
    redondeado=lambda frame: frame.PetalLength.round(),
    redondeado2=lambda frame: frame.SepalLength.round(),
)

# bind_rows() - joining DataFrames "vertically"
#   signature:  bind_rows(other, join='outer', ignore_index=False)
#   equivalent: pandas.concat([df, other], join=join,
#                             ignore_index=ignore_index, axis=0)
a >> bind_rows(b, join='inner')
a >> bind_rows(b, join='outer')

# bind_cols() - joining DataFrames "horizontally"
#   signature:  bind_cols(other, join='outer', ignore_index=False)
#   equivalent: pandas.concat([df, other], join=join,
#                             ignore_index=ignore_index, axis=1)
a >> bind_cols(b)


# Summarization
# summarize(**kwargs) takes an arbitrary number of keyword arguments. Each key
# becomes a new column whose value is a summary function of columns in the
# original DataFrame.
diamonds >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
diamonds >> group_by('cut') >> summarize(price_mean=X.price.mean(), price_std=X.price.std())

# summarize_each(function_list, *columns) is a more general summarization
# function. Its first argument is a list of summary functions; the remaining
# arguments are the columns to apply them to. For convenience, columns can be
# given symbolically, by string label, or by integer position, as in the
# selection functions.
diamonds >> summarize_each([np.mean, np.var], X.price, 'depth')
diamonds >> group_by(X.cut) >> summarize_each([np.mean, np.var], X.price, 4)

# Summary functions
# NOTE: the grouping verb is `group_by`, as in the examples above; the two
# pipelines below previously called `groupby`, which is not the verb used
# anywhere else in this file.
# mean(series)
diamonds >> group_by(X.cut) >> summarize(price_mean=mean(X.price))
# first(series, order_by=None)
diamonds >> group_by(X.cut) >> summarize(price_first=first(X.price))
Beispiel #7
0
"""to filter the bigrams only"""
# Keep only the rows whose word contains an underscore.
bigr = output[output['word'].str.contains("_")]

# From this part on there are two strategies: save the output and continue
# with R, or go ahead with Python.

# 5. Plotting
# 5.1 Aggregating for plotting
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
    sample_frac, head, arrange, mutate, group_by, summarize, DelayFunction)
dfr = DplyFrame(output)
# The column is read with item access on purpose: attribute access
# (`X.count`) resolves to the DataFrame.count method instead of the column,
# and calling `.sum()` on a method fails.
dfr = (dfr >>
  group_by(X.word, X.source) >>
  summarize(tot=X["count"].sum()))
dff = dfr >> select(X.word, X.tot)

# 5.2 Word cloud
# Turn the (word, total) rows into a {word: total} dict.
d = {word: total for word, total in dff.values}
wordcloud = WordCloud(
    width=1000,
    height=1000,
    background_color='white',
    min_font_size=15,
    max_font_size=120,
).generate_from_frequencies(frequencies=d)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
Beispiel #8
0
def czMatchmaker(data, Q, precursor_fasta):
    """Estimate reaction and fragmentation probabilities from intensity estimates.

    Args:
        data: Table with the columns ``tag``, ``active``, ``neutral``,
            ``estimates`` and ``broken_bond``; anything ``DplyFrame`` accepts.
            When ``None``, the example CSV that this function used to load
            unconditionally is read instead (kept for backward compatibility).
        Q: Value compared against the ``active`` column and used in the
            ``PTR`` computation (presumably the precursor charge - TODO
            confirm).
        precursor_fasta: Sequence whose length fixes how many entries the
            ``'fragmentation_amino_acids'`` tuple has.

    Returns:
        dict with three entries:
            ``'reaction'``: ``(prob_reaction, prob_no_reaction)``
            ``'fragmentation'``: ``(prob_fragmentation, prob_no_fragmentation)``
            ``'fragmentation_amino_acids'``: one probability for each index in
            ``range(len(precursor_fasta))``; indices with no fragments get 0.
    """
    if data is None:
        # Fallback to the developer's example file; previously this read
        # silently replaced whatever the caller passed in.
        data = pd.read_csv(
            "/Users/matteo/Documents/czMatchmaker/data/examplaryData.csv")
    data = DplyFrame(data)

    precursors = (data
                  >> sift(X.tag == 'precursor')
                  >> select(X.active, X.neutral, X.estimates))

    fragments = (data
                 >> sift(X.tag != 'precursor')
                 >> group_by(X.tag, X.active, X.broken_bond)
                 >> summarize(estimates=X.estimates.sum()))

    # Pair up the fragments separately for every broken bond. The loop
    # variable deliberately does not reuse the name `data`.
    I_on_fragments = {}
    for break_point, bond_fragments in fragments.groupby('broken_bond'):
        pairing, _optiminfo = collect_fragments(bond_fragments, Q)
        I_on_fragments[break_point] = pairing

    # Intensity of precursors with active == Q and neutral == 0.
    I_no_reactions = (precursors
                      >> sift(X.active == Q, X.neutral == 0)
                      >> select(X.estimates))
    I_no_reactions = I_no_reactions.values.flatten()[0]

    prec_ETnoD_PTR_I = (precursors
                        >> sift(X.active != Q)
                        >> rename(ETnoD=X.neutral, I=X.estimates)
                        >> mutate(PTR=Q - X.ETnoD - X.active)
                        >> select(X.ETnoD, X.PTR, X.I))

    I_prec_no_frag = prec_ETnoD_PTR_I >> summarize(I=X.I.sum())
    I_prec_no_frag = I_prec_no_frag.values.flatten()[0]

    prec_ETnoD_PTR_I = (prec_ETnoD_PTR_I
                        >> mutate(I_PTR=crossprod(X.PTR, X.I),
                                  I_ETnoD=crossprod(X.ETnoD, X.I))
                        >> summarize(I_PTR=X.I_PTR.sum(),
                                     I_ETnoD=X.I_ETnoD.sum()))
    I_PTR_no_frag, I_ETnoD_no_frag = prec_ETnoD_PTR_I.values.flatten()

    # NOTE(review): these two probabilities are computed but are not part of
    # the returned result - confirm whether they should be.
    prob_PTR = I_PTR_no_frag / (I_PTR_no_frag + I_ETnoD_no_frag)
    prob_ETnoD = 1. - prob_PTR

    # Total intensity per broken bond, summed over all of its pairings.
    I_frags = {
        bP: sum(pairings[pairing] for pairing in pairings)
        for bP, pairings in I_on_fragments.items()
    }
    I_frag_total = sum(I_frags.values())

    # A Counter yields 0 for positions at which no fragment was observed.
    prob_frag = Counter(
        {int(bP): intensity / I_frag_total
         for bP, intensity in I_frags.items()})
    prob_frag = [prob_frag[i] for i in range(len(precursor_fasta))]

    I_frags_PTRETnoD_total = sum(
        (Q - 1 - sum(q for cz, q in pairing)) * pairings[pairing]
        for pairings in I_on_fragments.values()
        for pairing in pairings)

    anion_meets_cation = (I_frags_PTRETnoD_total + I_PTR_no_frag +
                          I_ETnoD_no_frag)
    prob_fragmentation = I_frags_PTRETnoD_total / anion_meets_cation
    prob_no_fragmentation = 1 - prob_fragmentation

    prob_no_reaction = I_no_reactions / (I_no_reactions + I_frag_total +
                                         I_prec_no_frag)
    prob_reaction = 1. - prob_no_reaction

    return {
        'reaction': (prob_reaction, prob_no_reaction),
        'fragmentation': (prob_fragmentation, prob_no_fragmentation),
        'fragmentation_amino_acids': tuple(prob_frag),
    }
# Download the "firsts" dataset and keep a local CSV copy of it.
firsts = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-09/firsts.csv'
)
firsts.to_csv('/Users/vivekparashar/Downloads/firsts.csv')

# Wrap the pandas frame so that dplython's >> pipelines can be used on it.
firsts = DplyFrame(firsts)

# Quick interactive look at the columns and the distinct gender/category values.
firsts.columns
firsts.gender.unique()
firsts.category.unique()

# Summary by category: number of accomplishments per (year_grp, category),
# where year_grp is the year divided by 10, rounded, and multiplied by 10.
t1 = (
    firsts
    >> mutate(year_grp=(X.year / 10).round() * 10)
    >> group_by(X.year_grp, X.category)
    >> summarize(nrows=X.accomplishment.count())
)

# Summary by gender: same grouping, with gender in place of category.
t2 = (
    firsts
    >> mutate(year_grp=(X.year / 10).round() * 10)
    >> group_by(X.year_grp, X.gender)
    >> summarize(nrows=X.accomplishment.count())
)

# Bubble chart and stacked bars for the category summary.
c1 = alt.Chart(t1).mark_circle().encode(
    x='year_grp:O',
    y='category:O',
    size='nrows:Q',
)
c3 = alt.Chart(t1).mark_bar().encode(
    x='year_grp',
    y='nrows',
    color='category',
)

# Bubble chart for the gender summary.
c2 = alt.Chart(t2).mark_circle().encode(
    x='year_grp:O',
    y='gender:O',
    size='nrows:Q',
)

# Stack the three charts vertically: gender on top, then the category charts.
chart = alt.vconcat(c2, c1, c3)

chart.save(
    '/Users/vivekparashar/OneDrive/OneDrive-GitHub/Challenges-and-Competitions/TidyTuesday/Data/2020-11-17/chart.png',
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by, summarize,
                      DelayFunction)

# First five rows of the diamonds data.
diamonds >> head(5)

# Same, restricted to three columns.
(diamonds
 >> select(X.carat, X.cut, X.price)
 >> head(5))

# First two diamonds heavier than 4 carat, with a handful of columns.
d = (
    diamonds
    >> sift(X.carat > 4)
    >> select(X.carat, X.cut, X.depth, X.price)
    >> head(2)
)

# Average price per (cut, rounded carat) combination.
(
    diamonds
    >> mutate(carat_bin=X.carat.round())
    >> group_by(X.cut, X.carat_bin)
    >> summarize(avg_price=X.price.mean())
)

# Rows of df whose 'deaths' value is negative.
test = df['deaths'].lt(0)
less_than_zero = df.loc[test]
print(less_than_zero.shape)
print(less_than_zero.head())

test

# Disabled experiment, kept for reference:
#df['deaths_fixed'] = df['deaths_new'].apply(lambda x: 'True' if x <= 0 else 'False')
Beispiel #11
0
def load_data(input_dir, crsrd_id):
    """Load CCTV traffic counts for one crossroad, pivoted to one row per time slot.

    Reads ``ORT_CCTV_5MIN_LOG.csv`` and ``ORT_CCTV_MST.csv`` from
    ``input_dir``, sums the log into 30-minute slots per camera, keeps the
    cameras that the master file assigns to ``crsrd_id``, and spreads the
    per-camera totals into columns.

    Args:
        input_dir: Directory that contains the two CSV files.
        crsrd_id: Value matched against the master file's ``CRSRD_ID`` column.

    Returns:
        tuple ``(df1, cols)``:
            df1: Frame with ``DATE``, ``DAY`` (0 = Mon-Fri, 1 = Sat, 2 = Sun),
                ``HOUR``, ``MINUTE`` (0 or 30), a ``GO_TRF``/``LEFT_TRF``
                column pair per camera and ``TOTAL_TRF``. Only rows with
                ``TOTAL_TRF > 0`` are kept.
            cols: ``['TOD', <sorted '<id>_GO_RATE' / '<id>_LEFT_RATE' names>,
                'TOTAL_TRF']``. These are labels for a downstream consumer,
                not the column names of ``df1``.
    """
    key_cols = ['DATE', 'DAY', 'HOUR', 'MINUTE']

    cctv_log = pd.read_csv(input_dir + "/ORT_CCTV_5MIN_LOG.csv")
    cctv_mst = pd.read_csv(input_dir + "/ORT_CCTV_MST.csv")

    # Parse the timestamps once and derive every calendar field from them.
    reg_dt = pd.DatetimeIndex(cctv_log['REG_DT'])
    cctv_log['DATE'] = reg_dt.date
    cctv_log['HOUR'] = reg_dt.hour
    cctv_log['MINUTE'] = (reg_dt.minute // 30) * 30  # floor to the half hour
    cctv_log['temp_DAY'] = reg_dt.dayofweek
    cctv_log.loc[cctv_log['temp_DAY'] < 5, 'DAY'] = int(0)  # mon - fri
    cctv_log.loc[cctv_log['temp_DAY'] == 5, 'DAY'] = int(1)  # sat
    cctv_log.loc[cctv_log['temp_DAY'] == 6, 'DAY'] = int(2)  # sun

    # GO_TRF adds up the GO_* and RIGHT_* counts; LEFT_TRF the LEFT_* counts.
    df0 = DplyFrame(cctv_log) >> group_by(
        X.DATE, X.DAY, X.HOUR, X.MINUTE, X.CCTV_ID) >> summarize(
            GO_TRF=X.GO_BIKE.sum() + X.GO_CAR.sum() + X.GO_SUV.sum() +
            X.GO_VAN.sum() + X.GO_TRUCK.sum() + X.GO_BUS.sum() +
            X.RIGHT_BIKE.sum() + X.RIGHT_CAR.sum() + X.RIGHT_SUV.sum() +
            X.RIGHT_VAN.sum() + X.RIGHT_TRUCK.sum() + X.RIGHT_BUS.sum(),
            LEFT_TRF=X.LEFT_BIKE.sum() + X.LEFT_CAR.sum() + X.LEFT_SUV.sum() +
            X.LEFT_VAN.sum() + X.LEFT_TRUCK.sum() + X.LEFT_BUS.sum())

    # Extract records of selected crossroad
    cctv_mst = DplyFrame(cctv_mst) >> sift(X.CRSRD_ID == crsrd_id) >> select(
        X.CRSRD_ID, X.CCTV_ID)
    df0 = pd.merge(df0, cctv_mst, how="inner", on="CCTV_ID")
    df0 = df0.sort_values(['DATE', 'HOUR', 'MINUTE', 'CCTV_ID'])

    # Time frame from existing dataset: one row per observed time slot.
    tf = DplyFrame(df0.drop_duplicates(key_cols, keep='last')) >> select(
        X.DATE, X.DAY, X.HOUR, X.MINUTE)

    # Process the datastructure into pivot: one GO_TRF/LEFT_TRF pair per
    # camera. The first camera keeps the plain names; later ones get an
    # '_<id>' suffix because their names collide with the existing columns.
    cctv_list = sorted(cctv_mst['CCTV_ID'].unique())
    df1 = tf
    for cctv in cctv_list:
        a = df0 >> sift(X.CCTV_ID == cctv) >> select(
            X.DATE, X.DAY, X.HOUR, X.MINUTE, X.GO_TRF, X.LEFT_TRF)
        df1 = pd.merge(df1,
                       a,
                       how='left',
                       on=key_cols,
                       suffixes=('', '_' + str(cctv)))

    # Fill gaps with a centred 24-slot rolling mean; whatever is left gets 0.
    df1 = df1.set_index(key_cols)
    df1 = df1.fillna(df1.rolling(window=24, min_periods=1, center=True).mean())
    df1 = df1.fillna(0)
    df1 = df1.reset_index()

    # Every column after the key columns is a traffic column. The previous
    # slice (4:3 + 2 * n) stopped one column short and left the last
    # camera's LEFT_TRF out of the total.
    df1['TOTAL_TRF'] = df1.iloc[:, len(key_cols):].sum(axis=1, skipna=True)
    df1 = df1 >> sift(X.TOTAL_TRF > 0)
    print(df1)

    # Name the cctv id and direction - for tod_traffic_analyzer.
    # str() keeps this working when the ids are not strings, matching the
    # merge suffixes above.
    cols = [str(cctv) + '_GO_RATE' for cctv in cctv_list]
    cols.extend(str(cctv) + '_LEFT_RATE' for cctv in cctv_list)
    cols = ['TOD'] + sorted(cols) + ['TOTAL_TRF']

    return df1, cols