Beispiel #1
0
    def __call__(self, df):
        """Melt ``df`` into long format using the stored key/value arguments.

        ``self.args`` must start with two ``Later`` objects naming the new key
        and value columns. Any further ``Later`` arguments name the columns to
        melt; when ``self.kwargs['exclude']`` equals ``True`` they instead name
        the columns kept as identifiers while every other column is melted.
        With only two arguments every column of ``df`` is melted.

        :param df: frame to reshape; its ``columns`` and ``melt`` are used.
        :return: a ``DplyFrame`` whose last two columns are renamed to the
            key and value names.
        :raises ValueError: if fewer than two arguments were given, if an
            argument is not a ``Later``, or if a named column is not among
            ``df.columns``.
        """
        bad_form = "Arguments must be of the form \"X.column1, X.column2, ...\""

        if len(self.args) < 2:
            raise ValueError(
                "You must provide at least two arguments, the key and the value."
            )

        key_arg, value_arg = self.args[0], self.args[1]
        if not (isinstance(key_arg, dplython.later.Later)
                and isinstance(value_arg, dplython.later.Later)):
            raise ValueError(bad_form)

        sp_key = key_arg._name
        sp_value = value_arg._name

        # Default: no identifier columns, melt everything.
        id_columns = []
        value_columns = list(df.columns)

        if len(self.args) > 2:
            if 'exclude' in self.kwargs and self.kwargs['exclude'] == True:
                # Named columns move from the melted set to the identifiers.
                receives, gives_up = id_columns, value_columns
            else:
                # Named columns are the melted set; the rest are identifiers.
                id_columns, value_columns = value_columns, id_columns
                receives, gives_up = value_columns, id_columns

            for arg in self.args[2:]:
                if not isinstance(arg, dplython.later.Later):
                    raise ValueError(bad_form)
                receives.append(arg._name)
                gives_up.remove(arg._name)

        melted = df.melt(id_vars=id_columns, value_vars=value_columns)
        outdf = DplyFrame(melted)

        # melt appends its two generated columns last; give them the
        # requested names.
        renamed = list(outdf.columns)
        renamed[-2:] = sp_key, sp_value
        outdf.columns = renamed

        return outdf
Beispiel #2
0
def create_task2():
    """Summarize the posted records per supplier and return them as JSON.

    Cleans ``request.json`` with ``clean_df``, derives a ``supplier`` column
    from ``Category`` through ``supp``, groups by supplier and reduces
    ``Name`` with ``most_common`` into a ``max1`` column. The ``Category``
    column and the summarized frame are printed along the way.

    :return: the summary serialized with ``to_json(orient='records')``.
    """
    records = clean_df(request.json)
    print(records["Category"])

    records['supplier'] = records['Category'].apply(lambda x: supp(x))

    by_supplier = DplyFrame(records) >> group_by(X.supplier)
    summary = by_supplier >> summarize(max1=most_common(X.Name))
    print(summary)

    return summary.to_json(orient='records')
Beispiel #3
0
def create_task2():
    """Return a JSON string with one ``max1`` entry per supplier.

    The request body is cleaned by ``clean_df``; each row's ``Category`` is
    mapped to a supplier with ``supp``; rows are then grouped by supplier and
    ``most_common`` is applied to ``Name``. Both the ``Category`` column and
    the grouped result are printed.

    :return: ``to_json(orient='records')`` output of the grouped result.
    """
    df = clean_df(request.json)
    print(df["Category"])

    supplier_of = lambda category: supp(category)
    df['supplier'] = df['Category'].apply(supplier_of)

    pipeline_input = DplyFrame(df)
    df = (pipeline_input
          >> group_by(X.supplier)
          >> summarize(max1=most_common(X.Name)))
    print(df)

    jsondf = df.to_json(orient='records')
    return jsondf
Beispiel #4
0
    def __call__(self, df):
        """Pivot ``df`` from long to wide format using the stored arguments.

        ``self.args[0]`` names the column whose values become new column
        headers and ``self.args[1]`` names the column supplying the cell
        values; both must be ``Later`` objects. All remaining columns of
        ``df`` are used as the index during the pivot and restored afterwards.

        :param df: long-format frame to reshape.
        :return: a ``DplyFrame`` holding the remaining columns followed by one
            column per key value, in order of first appearance in ``df``.
        :raises ValueError: if fewer than two arguments were given or either
            of the first two is not a ``Later``.
        """
        if len(self.args) < 2:
            raise ValueError(
                "You must provide at least two arguments, the key and the value."
            )

        key_arg, value_arg = self.args[0], self.args[1]
        if not (isinstance(key_arg, dplython.later.Later)
                and isinstance(value_arg, dplython.later.Later)):
            raise ValueError(
                "Arguments must be of the form \"X.column1, X.column2, ...\""
            )

        sp_key = key_arg._name
        sp_value = value_arg._name

        # Every column other than key and value identifies a row.
        id_columns = [
            name for name in df.columns
            if not (name == sp_key or name == sp_value)
        ]

        indexed = df.set_index(id_columns)
        wide = DplyFrame(indexed.pivot(columns=sp_key, values=sp_value))
        outdf = wide.reset_index()
        outdf.columns.name = None

        # dict.fromkeys de-duplicates the key values while keeping their
        # first-seen order, which fixes the order of the new columns.
        key_order = list(dict.fromkeys(df[sp_key]))
        outdf = outdf[id_columns + key_order]

        return outdf
Beispiel #5
0
def read_delim(f, delim, col_names=True):
    """Read a delimited text file into a ``DplyFrame``.

    :param f: path of the file to read; must be a ``str``.
    :param delim: field separator, passed to ``pandas.read_csv`` as ``sep``.
    :param col_names: ``True`` to take the column names from the first row;
        ``False`` to treat every row as data and name the columns ``X1``,
        ``X2``, ... in order.
    :return: the parsed table wrapped in a ``DplyFrame``.
    :raises AssertionError: if an argument has the wrong type. The checks are
        plain ``assert`` statements, so they are skipped under ``python -O``.
    """
    assert isinstance(f, str)
    assert isinstance(delim, str)
    assert isinstance(col_names, bool)

    # pandas expects the header row index, or None for "no header row".
    header_row = 0 if col_names else None

    df = DplyFrame(pd.read_csv(filepath_or_buffer=f, header=header_row, sep=delim))

    if header_row is None:
        df.columns = ['X' + str(position)
                      for position in range(1, df.shape[1] + 1)]

    return df
Beispiel #6
0
def clean_df(requestjson):
    """Build a ``DplyFrame`` from the ``"file2"`` entry of a request payload.

    Every run of non-word characters is removed from the column names before
    the frame is wrapped.

    :param requestjson: mapping holding the table data under ``"file2"``.
    :return: the table as a ``DplyFrame`` with sanitized column names.
    """
    frame = pd.DataFrame(requestjson["file2"])
    frame.columns = frame.columns.map(
        lambda name: re.sub(r'\W+', '', name))
    return DplyFrame(frame)
Beispiel #7
0
def add_day(df, countriez):
    """Number each country's rows in a ``day`` column and stack the results.

    For every entry of ``countriez`` the rows of ``df`` whose ``country``
    equals it are selected and given ``day`` values 1..n in row order. The
    per-country frames are concatenated in the order of ``countriez``.

    The accumulator starts as a column-less frame with 100000 index entries.
    Its all-NaN rows are removed by ``dropna(how='all')`` after each
    concatenation, which also removes any other row that is entirely NaN.
    If ``countriez`` is empty the untouched placeholder frame is returned.

    :param df: source frame; it is copied, not modified.
    :param countriez: iterable of values to match against ``country``.
    :return: the concatenated per-country frames.
    """
    source = df.copy()
    combined = pd.DataFrame(index=range(0, 100000))

    for country in countriez:
        country_rows = DplyFrame(source) >> sift(X.country == country)
        day_numbers = range(1, len(country_rows) + 1)
        numbered = country_rows >> mutate(day=day_numbers)

        stacked = pd.concat([combined, numbered], sort=False)
        combined = stacked.dropna(how='all')

    return combined
Beispiel #8
0
    def dply(self):
        """
        Wrap the read table in a dplython ``DplyFrame``.

        dplython is a dplyr-inspired (R) layer over Pandas that lets tables be
        processed in a flow-like manner. See
        https://github.com/dodger487/dplython and
        https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html
        for background.

        dplython and nuts-ml share the ``>>`` chaining syntax, so the
        returned frame can be used in the same style of pipeline.

        :return: ``self.dataframe`` wrapped as a DplyFrame.
        :rtype: DplyFrame
        """
        table = self.dataframe
        return DplyFrame(table)
# cummin(series)
diamonds >> select(X.price) >> mutate(price_cummin=cummin(X.price)) >> head(6)
# cumprod(series)
diamonds >> select(X.price) >> mutate(price_cumprod=cumprod(X.price)) >> head(6)


# Extending dfply with custom functions
# https://github.com/kieferk/dfply/blob/master/examples/basics-extending-functionality.ipynb


############### dplython #################

from dplython import (DplyFrame, X, diamonds, select, sift,
  sample_n, sample_frac, head, arrange, mutate, group_by,
  summarize, DelayFunction)
# Wrap the existing frame so it can be piped through dplython verbs with >>.
df = DplyFrame(df)

df >> head(5)
df >> sample_n(5)
df >> select(X.name, X.category, X.country, X.role, X.description)
# As in pandas, combine conditions with the bitwise operators | and &.
# Per the original author's note, separating conditions with a comma
# behaves like &.
df >> sift(X.category == 'Leadership')
# NOTE: the original author found no way to sort in descending order with
# arrange and switched to the dfply library for that.
df >> arrange(X.country)
df >> mutate(carat_bin=X.carat.round())
df >> group_by(X.category) >> summarize(num_of_people = X.name.count())

# It's possible to pass the entire dataframe using X._
# The special Later name, "_" will refer to the entire DataFrame. 

# Combine multiple
(df >> sift(X.name != 'Unsung hero')
    >> group_by(X.category)
Beispiel #10
0
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)

    return numerator / (denominator + 1e-7)


# train Parameters
seq_length = 60
data_dim = 8
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# last, diff_24h, diff_per_24h, bid, ask, low, high, volume
data = DplyFrame(pd.read_csv('./bitcoin_ticker.csv', delimiter=','))
data = data >> sift(X.rpt_key == 'btc_krw') >> select(
    X.last, X.diff_24h, X.diff_per_24h, X.bid, X.ask, X.low, X.high, X.volume)
data = np.asarray(data)
#data = MinMaxScaler(data)
data = tf.layers.batch_normalization(data)
x = data
y = data[:, [0]]  # last as label

# build a dataset
dataX = []
dataY = []
for i in range(0, len(y) - seq_length):
    _x = x[i:i + seq_length]
    _y = y[i + seq_length]  # Next close price
    print(_x, "->", _y)
Beispiel #11
0
"""Plotting distribution of feature coefficients for ARIMAX models."""

import json
import pandas as pd
from dplython import DplyFrame
import matplotlib
matplotlib.use('TkAgg')
from ggplot import (ggplot, scale_color_brewer, geom_histogram,
                    scale_x_continuous, facet_wrap, labs, ggtitle,
                    scale_y_continuous, theme_gray, aes,
                    geom_boxplot)  # noqa: E402

# data frame processing; creating a master df for faceting plots
json_filename = "arima_1.json"
# Use a context manager so the file handle is closed as soon as the JSON
# has been parsed instead of being left to the garbage collector.
with open(json_filename) as json_file:
    coefficient_dict = json.load(json_file)
# Top-level JSON keys become rows first; the transpose then turns them into
# columns so each feature can be looked up with coefficients[feature].
coefficients = DplyFrame(
    pd.DataFrame.from_dict(coefficient_dict, orient='index'))
coefficients = coefficients.transpose()
# Base name of the JSON file (text before the first dot).
folder_name = json_filename.split('.')[0]

dfs_to_concat = []
for feature in [
        "home_goal", "away_goal", "home_yellow", "away_yellow", "home_red",
        "away_red"
]:

    # create a new long-form dataframe for clean plotting purposes
    values_dict = {
        "significant": coefficients[feature]["significant"],
        "insignificant": coefficients[feature]["unsignificant"]
    }
    df = pd.DataFrame.from_dict(values_dict, orient='index')
Beispiel #12
0
output = output.drop(columns ='content')


"""to filter the bigrams only"""
bigr = output[output['word'].str.contains("_")]

"""FROM THIS PART, 2 STRATEGIES, SAVE THE OUTPUT AND CONTINUE W R OR GO AHEAD W PYTHON"""




"""5 plotting"""
"""5 1 aggregating for plotting"""
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
    sample_frac, head, arrange, mutate, group_by, summarize, DelayFunction) 
# Aggregate for plotting: total count per (word, source) pair, then keep
# only the word and its total.
dfr = DplyFrame(output)
# NOTE(review): X.count is attribute access on the frame; pandas frames also
# expose a `count` method, so this may not resolve to the "count" column.
# Confirm it yields the column before relying on `tot`.
dfr = (dfr >> 
  group_by(X.word, X.source) >> 
  summarize(tot=X.count.sum()))
dff = (dfr >>select(X.word, X.tot ))

"""5.2 wordcloud"""
"""turns the word freq to dict"""
d = {}
for a, x in dff.values:
    d[a] = x
wordcloud = WordCloud(width = 1000, height = 1000,
                background_color ='white',
                min_font_size =15, max_font_size=120).generate_from_frequencies(frequencies=d)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
Beispiel #13
0
def load_data(input_dir, crsrd_id):
    """Load CCTV traffic logs and pivot them to one row per time slot.

    Reads ``ORT_CCTV_5MIN_LOG.csv`` and ``ORT_CCTV_MST.csv`` from
    ``input_dir``, sums the GO/RIGHT and the LEFT vehicle counts per camera
    and half-hour slot, keeps the cameras belonging to ``crsrd_id`` and
    places each camera's ``GO_TRF`` / ``LEFT_TRF`` columns side by side.
    Missing values are filled with a centred 24-row rolling mean and then
    with 0; rows whose total traffic is 0 are dropped. The resulting frame
    is printed.

    :param input_dir: directory containing the two CSV files.
    :param crsrd_id: value matched against ``CRSRD_ID`` in the master table.
    :return: ``(df1, cols)`` where ``df1`` is the pivoted frame including a
        ``TOTAL_TRF`` column and ``cols`` is ``['TOD']`` + the sorted
        per-camera ``*_GO_RATE`` / ``*_LEFT_RATE`` names + ``['TOTAL_TRF']``.
    """
    cctv_log = pd.read_csv(input_dir + "/ORT_CCTV_5MIN_LOG.csv")
    cctv_mst = pd.read_csv(input_dir + "/ORT_CCTV_MST.csv")

    # Calendar fields derived from the registration timestamp; MINUTE is
    # floored to 0 or 30 so rows bucket into half-hour slots.
    cctv_log['DATE'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).date)
    cctv_log['HOUR'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).hour)
    cctv_log['MINUTE'] = (
        pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).minute) // 30) * 30
    cctv_log['temp_DAY'] = pd.to_datetime(cctv_log['DATE']).dt.dayofweek
    cctv_log.loc[cctv_log['temp_DAY'] < 5, 'DAY'] = int(0)  # mon - fri
    cctv_log.loc[cctv_log['temp_DAY'] == 5, 'DAY'] = int(1)  # sat
    cctv_log.loc[cctv_log['temp_DAY'] == 6, 'DAY'] = int(2)  # sun

    # Per camera and slot: GO_TRF combines the GO_* and RIGHT_* counts,
    # LEFT_TRF the LEFT_* counts.
    df0 = DplyFrame(cctv_log) >> group_by(
        X.DATE, X.DAY, X.HOUR, X.MINUTE, X.CCTV_ID) >> summarize(
            GO_TRF=X.GO_BIKE.sum() + X.GO_CAR.sum() + X.GO_SUV.sum() +
            X.GO_VAN.sum() + X.GO_TRUCK.sum() + X.GO_BUS.sum() +
            X.RIGHT_BIKE.sum() + X.RIGHT_CAR.sum() + X.RIGHT_SUV.sum() +
            X.RIGHT_VAN.sum() + X.RIGHT_TRUCK.sum() + X.RIGHT_BUS.sum(),
            LEFT_TRF=X.LEFT_BIKE.sum() + X.LEFT_CAR.sum() + X.LEFT_SUV.sum() +
            X.LEFT_VAN.sum() + X.LEFT_TRUCK.sum() + X.LEFT_BUS.sum())

    # Extract records of the selected crossroad.
    cctv_mst = DplyFrame(cctv_mst) >> sift(X.CRSRD_ID == crsrd_id) >> select(
        X.CRSRD_ID, X.CCTV_ID)
    df0 = pd.merge(df0, cctv_mst, how="inner", on="CCTV_ID")
    df0 = df0.sort_values(['DATE', 'HOUR', 'MINUTE', 'CCTV_ID'])

    # Time frame (distinct slots) present in the dataset.
    time_frame = DplyFrame(
        df0.drop_duplicates(
            ['DATE', 'DAY', 'HOUR', 'MINUTE'], keep='last')) >> select(
                X.DATE, X.DAY, X.HOUR, X.MINUTE)

    # Pivot: left-join each camera's two traffic columns onto the slots.
    # Name clashes from the second camera onwards get a "_<cctv id>" suffix.
    cctv_list = sorted(cctv_mst['CCTV_ID'].unique())
    df1 = time_frame

    for cctv in cctv_list:
        camera_rows = df0 >> sift(X.CCTV_ID == cctv) >> select(
            X.DATE, X.DAY, X.HOUR, X.MINUTE, X.GO_TRF, X.LEFT_TRF)
        df1 = pd.merge(df1,
                       camera_rows,
                       how='left',
                       on=['DATE', 'DAY', 'HOUR', 'MINUTE'],
                       suffixes=('', '_' + str(cctv)))

    df1 = df1.set_index(['DATE', 'DAY', 'HOUR', 'MINUTE'])
    df1 = df1.fillna(df1.rolling(window=24, min_periods=1, center=True).mean())
    df1 = df1.fillna(0)
    df1 = df1.reset_index()

    # Columns 0-3 are the slot keys; the next 2 * len(cctv_list) columns are
    # the traffic counts. The slice end is exclusive, so it must be
    # 4 + 2n; the previous 3 + 2n dropped the last camera's LEFT_TRF from
    # the total.
    traffic_columns = df1.iloc[:, 4:4 + len(cctv_list) * 2]
    df1['TOTAL_TRF'] = DplyFrame(traffic_columns.sum(axis=1, skipna=True))
    df1 = df1 >> sift(X.TOTAL_TRF > 0)
    print(df1)

    # Name the cctv id and direction - for tod_traffic_analyzer.
    # str() keeps this working for non-string camera ids, matching the
    # suffix construction in the merge loop.
    cols = [str(cctv) + '_GO_RATE' for cctv in cctv_list]
    cols.extend([str(cctv) + '_LEFT_RATE' for cctv in cctv_list])
    cols = sorted(cols)
    cols = ['TOD'] + cols + ['TOTAL_TRF']

    return df1, cols
Beispiel #14
0
# Read in temp file to match
import pandas as pd
co2_file_to_match = "annual_avg_co2_GFDL-ESM2M_rcp45_r1i1p1.csv"
esm_co2_data = pd.read_csv(co2_file_to_match).rename(columns={
    "value.1...": "co2_value"
}).set_index('year')
esm_co2 = esm_co2_data.co2_value * 1000000

#print esm_co2_data.head(5)
#print esm_co2.head(5)
from numpy import mean
from dplython import (DplyFrame, X, mutate)

CONCENTRATION_CO2 = "simpleNbox.Ca"
hector_co2 = pyhector.run(pyhector.rcp45)[CONCENTRATION_CO2].loc[esm_co2.index]
comp = DplyFrame({"hector": hector_co2, "esm": esm_co2})


def difference_quantifier(esm_series, hector_run_series):
    """Return the mean absolute relative difference of Hector versus ESM.

    Computes ``(hector - esm) / esm`` element-wise and averages the absolute
    values. The result is a fraction; it is not multiplied by 100.

    :param esm_series: reference values (the denominator).
    :param hector_run_series: Hector output compared against the reference.
    :return: mean of the absolute relative differences.
    """
    paired = DplyFrame({"hector": hector_run_series, "esm": esm_series})
    gap = X.hector - X.esm
    paired = paired >> mutate(percentdiff=gap / X.esm)
    return mean(abs(paired.percentdiff))


#print difference_quantifier(esm_co2,hector_co2)


def hector_runner(params, comp_data, var):
    hector_output = pyhector.run(
        pyhector.rcp45, {
Beispiel #15
0
def difference_quantifier(esm_series, hector_run_series):
    """Score how far a Hector run deviates from the ESM series.

    Both series are placed in one frame, the relative deviation
    ``(hector - esm) / esm`` is added as ``percentdiff`` and the mean of its
    absolute value is returned (as a fraction, without scaling to percent).

    :param esm_series: ESM values used as the baseline.
    :param hector_run_series: values produced by the Hector run.
    :return: mean absolute relative deviation.
    """
    columns = {"hector": hector_run_series, "esm": esm_series}
    with_deviation = DplyFrame(columns) >> mutate(
        percentdiff=(X.hector - X.esm) / X.esm)
    deviation = with_deviation.percentdiff
    return mean(abs(deviation))
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 6.4, 4.8

import numpy as np
import seaborn as sns
sns.set_theme(style="ticks", palette="pastel")
import altair as alt

firsts = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-09/firsts.csv'
)
firsts.to_csv('/Users/vivekparashar/Downloads/firsts.csv')

# Create/Convert a pandas dataframe to dplython df
firsts = DplyFrame(firsts)

firsts.columns
firsts.gender.unique()
firsts.category.unique()

# firsts df summary by category
t1 = (firsts >> mutate(year_grp=((X.year / 10).round()) * 10) >> group_by(
    X.year_grp, X.category) >> summarize(nrows=X.accomplishment.count()))
c1 = alt.Chart(t1).mark_circle().encode(x='year_grp:O',
                                        y='category:O',
                                        size='nrows:Q')
c3 = alt.Chart(t1).mark_bar().encode(x='year_grp', y='nrows', color='category')
# firsts df summary by gender
t2 = (firsts >> mutate(year_grp=((X.year / 10).round()) * 10) >> group_by(
    X.year_grp, X.gender) >> summarize(nrows=X.accomplishment.count()))
Beispiel #17
0
# Django is only used here to fetch the data from djangodb
if __name__ == "__main__":
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    application = DjangoWhiteNoise(get_wsgi_application())

from dplython import DplyFrame

from article_fetcher import ArticleFetcher
from tagger.models import Article
import pandas as pd

# Trained data
label_df = DplyFrame(
    pd.read_csv(
        "/Users/danchecketts/PycharmProjects/taggernews/supervised_topics.csv")
)

RETRY = True
DEBUG = False
DEBUG_FETCH_MAX = 100
FETCH_NOT_CACHED = True

loops = 0
skipped_due_state = 0
retry_articles = 0
loaded_from_db = 0
uncached_articles = []

for article_id in label_df.id:
Beispiel #18
0
    temp_df = temp_df.set_index(match_data.index)
    temp_df["time"] = match_data["time"]

    # append to long form df and plot
    preds_df = pd.concat([longform_df, temp_df], axis=0, ignore_index=True)
    sifted_df = preds_df >> sift((X.match_id == match_id)
                                 | (X.match_id == opponent_match_id)
                                 | (X.match_id == model_no))

    plot_matches(sifted_df, date, filename_out)


# read & dissect df
longform_df_og = pd.read_csv("../../LongForm/longform.csv",
                             dtype={'shorthand_search_vol': int})
longform_df = DplyFrame(longform_df_og)
cols = longform_df.columns
y_var = longform_df["shorthand_search_vol"]  # search volume
longform_df['ones'] = 1

# MLR MODEL #1: USING ONLY MATCH STAGES TO PREDICT SEARCH VOLUME
# note the difference in argument order; y_var is dependent, x_vars independent
# using Stage 0 as "reference level"; only vars are stage_1-4_indicators
x_var_list = [
    "ones", "stage_1_ind", "stage_2_ind", "stage_3_ind", "stage_4_ind"
]
x_vars = longform_df[x_var_list]
lm = sm.OLS(y_var, x_vars).fit()
with open('model1.txt', 'w') as f:
    print >> f, lm.summary()
predict_on_match_id(lm=lm,
Beispiel #19
0
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)

    return numerator / (denominator + 1e-7)


# train Parameters
seq_length = 60
data_dim = 8
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# last, diff_24h, diff_per_24h, bid, ask, low, high, volume
data = DplyFrame(
    pd.read_csv('/home/yeolpyeong/bitcoin_ticker.csv', delimiter=','))
data = data >> sift(X.rpt_key == 'btc_krw') >> select(
    X.last, X.diff_24h, X.diff_per_24h, X.bid, X.ask, X.low, X.high, X.volume)
data = np.asarray(data)
data = MinMaxScaler(data)
x = data
y = data[:, [0]]  # last as label

# build a dataset
dataX = []
dataY = []
for i in range(0, len(y) - seq_length):
    _x = x[i:i + seq_length]
    _y = y[i + seq_length]  # Next close price
    print(_x, "->", _y)
    dataX.append(_x)
def main(argv):
    """Interactively save the top-scoring face frame for each emotion.

    Prompts for a YouTube URL, an output directory and a frame limit,
    collects frames with ``getNewInstances``, posts each frame to the emotion
    service through ``processRequest`` (pausing 2 seconds between frames) and
    gathers one row per (frame, face, emotion score). For every emotion the
    best frame is chosen, and both the cropped face
    (``Cropped_<emotion>.jpg``) and the full frame (``<emotion>.jpg``) are
    written to the output directory.

    :param argv: unused; all inputs are read with ``input()``.
    """
    yURL = input("Enter the youtube url:")
    outdir = input("Enter the output directory:")
    maxFrames = int(input("Enter the maximum number of frames to check:"))

    faceDet = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_default.xml")
    faceDet2 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt2.xml")
    faceDet3 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt.xml")
    faceDet4 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt_tree.xml")

    pdata, pframes, pfacedims = getNewInstances(yURL,
                                                faceDet,
                                                faceDet2,
                                                faceDet3,
                                                faceDet4,
                                                maxCount=maxFrames)

    headers = dict()
    headers['Ocp-Apim-Subscription-Key'] = ms_key1
    headers['Content-Type'] = 'application/octet-stream'

    # Collect plain tuples and build the frame once. The previous version
    # rebuilt and re-appended a DataFrame inside the per-score loop, which
    # added every face's rows repeatedly and relied on drop_duplicates to
    # hide it; DataFrame.append is also gone in pandas 2.0.
    score_rows = []
    for frameId, image in enumerate(pframes):
        print("posting frame %d of %d" % (frameId, len(pframes)))
        # Send the frame image to the emotion service.
        resultMS = processRequest(image, headers)
        if isinstance(resultMS, list):
            for result in resultMS:
                if isinstance(result, dict):
                    for label, conf in result['scores'].items():
                        rect = result["faceRectangle"]
                        score_rows.append(
                            (frameId, label, conf, rect['left'], rect['top'],
                             rect['width'], rect['height']))
        time.sleep(2)

    resultsDf = pd.DataFrame(score_rows,
                             columns=[
                                 "frameId", "emotionLabel", "conf",
                                 "faceleft", "facetop", "faceW", "faceH"
                             ])

    # Convert to a DplyFrame so the selection can be written as a verb
    # pipeline: per emotion keep the highest-confidence rows and of those
    # the earliest frame; then per frame keep only its highest-confidence
    # row, and order the result by emotion.
    dfFaces = DplyFrame(resultsDf)
    topFaces = (
        dfFaces >> group_by(X.emotionLabel) >> sift(X.conf == X.conf.max()) >>
        sift(X.frameId == X.frameId.min()) >> ungroup() >> group_by(
            X.frameId) >> sift(X.conf == X.conf.max()) >> ungroup() >> arrange(
                X.emotionLabel))

    topFaces = topFaces.drop_duplicates()

    for i, (index, row) in enumerate(topFaces.iterrows()):
        print("saving emotion frame %d of %d" % (i, len(topFaces.index)))

        emotion = row["emotionLabel"]
        image = pframes[int(row["frameId"])]
        faceL = row["faceleft"]
        faceT = row["facetop"]
        faceW = row["faceW"]
        faceH = row["faceH"]

        # Save the cropped face.
        imageW = image[faceT:faceT + faceH, faceL:faceL + faceW]
        cv2.imwrite(
            os.path.expanduser("%s/Cropped_%s.jpg" % (outdir, emotion)),
            imageW)

        # To draw a labelled rectangle on the full frame, uncomment below.
        #
        # cv2.rectangle( image,(faceL,faceT),
        #               (faceL+faceW, faceT + faceH),
        #                color = (255,0,0), thickness = 5 )
        # cv2.putText( image, emotion, (faceL,faceT-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1 )
        #
        cv2.imwrite(os.path.expanduser("%s/%s.jpg" % (outdir, emotion)), image)
Beispiel #21
0
 def __rrshift__(self, other):
     """Handle ``other >> self`` by calling this object on a copy of ``other``.

     ``other`` is deep-copied and wrapped in a ``DplyFrame`` first, so the
     call receives the copy rather than ``other`` itself.
     """
     isolated = DplyFrame(other.copy(deep=True))
     return self.__call__(isolated)
Beispiel #22
0
    home_yellows = df.home_yellows.apply(lambda x: len(literal_eval(x)))
    away_yellows = df.away_yellows.apply(lambda x: len(literal_eval(x)))
    home_reds = df.home_reds.apply(lambda x: len(literal_eval(x)))
    away_reds = df.away_reds.apply(lambda x: len(literal_eval(x)))

    data = pd.DataFrame({'home_goals': home_goals,
                         'away_goals': away_goals,
                         'home_yellows': home_yellows,
                         'away_yellows': away_yellows,
                         'home_reds': home_reds,
                         'away_reds': away_reds})

    df_container.append(data)

# Concatenate all per-match frames into one master frame, then print the
# descriptive statistics and the number of rows with a non-zero count for
# each event type.
master_df = DplyFrame(pd.concat(df_container))

event_labels = ("home goals", "away goals", "home yellows", "away yellows",
                "home reds", "away reds")

for event_label in event_labels:
    # The column name is the label with the space replaced by "_".
    event_column = event_label.replace(" ", "_")
    print(event_label, "\n", master_df[event_column].describe())

for event_label in event_labels:
    event_column = event_label.replace(" ", "_")
    print("frequency of " + event_label,
          len(master_df >> sift(getattr(X, event_column) > 0)))

goals = master_df.apply(lambda row: row.home_goals + row.away_goals, axis=1)
Beispiel #23
0
    temp_df = pd.DataFrame.from_records(preds, columns=labels)
    temp_df = temp_df.set_index(match_data.index)
    temp_df["time"] = match_data["time"]

    # append to long form df and plot
    preds_df = pd.concat([longform_df, temp_df], axis=0, ignore_index=True)
    sifted_df = preds_df >> sift((X.match_id == match_id)
                                 | (X.match_id == opponent_match_id)
                                 | (X.match_id == model_no))

    plot_matches(sifted_df, date, filename_out)


# read initial data
longform_df = DplyFrame(
    pd.read_csv("../../LongForm/longform.csv",
                dtype={'shorthand_search_vol': float}))

# process data
longform_df["date"] = longform_df.match_id.apply(
    lambda x: "20" + x.split("20")[-1])
longform_df['date_time'] = longform_df['date'].astype(
    str) + " " + longform_df['time'].astype(str)
longform_df['date_time'] = pd.to_datetime(longform_df['date_time'],
                                          errors="coerce",
                                          infer_datetime_format=True)

# fit several ARIMA(2, 0, 2) models
# TODO: why did Schwartz use 2, 0, 2 as model parameters?
# TODO: uncomment all below once you figure out prediction / plotting for arima 3+ (smaller models)
start_time = time.clock()
dt = d0.ply_where( X.usertype == 'Subscriber' ).ply_select(
    slat = X.latitude_start * math.pi / 180,
    elat = X.latitude_end * math.pi / 180,
    slng = X.longitude_start * math.pi / 180,
    elng = X.longitude_end * math.pi / 180        
)
print( 'pandas_ply: ' + str( round( time.clock() - start_time, 2 ) ) + ' seconds.' )

# dplython
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
    sample_frac, head, arrange, mutate, group_by, summarize, DelayFunction) 

start_time = time.clock()
# Filter on 'Subscriber', the value the pandas_ply run uses. The previous
# spelling 'Subscirber' matched no rows, so the mutate ran on an empty frame
# and the timing was not comparable.
dt = DplyFrame(d0) >> sift( X.usertype == 'Subscriber' ) >> mutate(
    slat = X.latitude_start * math.pi / 180,
    elat = X.latitude_end * math.pi / 180,
    slng = X.longitude_start * math.pi / 180,
    elng = X.longitude_end * math.pi / 180
)
print( 'dplython: ' + str( round( time.clock() - start_time, 2 ) ) + ' seconds.' )

# dfply
from dfply import *
import pandas as pd

start_time = time.clock()
dt =d0 >> mask( X.usertype == 'Subscirber' ) >> mutate(
    slat = X.latitude_start * math.pi / 180,
    elat = X.latitude_end * math.pi / 180,
Beispiel #25
0
def czMatchmaker(data, Q, precursor_fasta):
    """Estimate reaction and fragmentation probabilities from intensity data.

    Rows tagged ``'precursor'`` are treated as unfragmented precursor ions;
    all other rows are fragments, which are grouped by ``broken_bond`` and
    paired by ``collect_fragments``.

    :param data: table with the columns ``tag``, ``active``, ``neutral``,
        ``estimates`` and ``broken_bond``. Previously this argument was
        overwritten by a hard-coded example CSV; it is now used as given.
        Passing ``None`` still loads that example file, for backward
        compatibility.
    :param Q: charge compared against ``active``; precursor rows with
        ``active == Q`` and ``neutral == 0`` count as unreacted.
    :param precursor_fasta: precursor sequence; only its length is used, to
        size the per-position fragmentation probabilities.
    :return: dict with ``'reaction'`` -> ``(prob_reaction, prob_no_reaction)``,
        ``'fragmentation'`` -> ``(prob_fragmentation, prob_no_fragmentation)``
        and ``'fragmentation_amino_acids'`` -> tuple of per-position
        probabilities (0 where no fragment was observed).
    """
    if data is None:
        # Legacy behavior: fall back to the bundled example data set.
        data = pd.read_csv(
            "/Users/matteo/Documents/czMatchmaker/data/examplaryData.csv")
    data = DplyFrame(data)

    precursors = data >> \
        sift( X.tag == 'precursor' ) >> \
        select( X.active, X.neutral, X.estimates)

    fragments = data >> sift( X.tag != 'precursor' ) >> \
        group_by( X.tag, X.active, X.broken_bond ) >> \
        summarize( estimates = X.estimates.sum() )

    # Pair up fragments per broken bond. The loop variable gets its own
    # name so it no longer shadows the `data` argument.
    I_on_fragments = {}
    optiminfos = {}
    for break_point, bond_fragments in fragments.groupby('broken_bond'):
        pairing, optiminfo = collect_fragments(bond_fragments, Q)
        I_on_fragments[break_point] = pairing
        optiminfos[break_point] = optiminfo

    # Intensity of precursors that did not react at all.
    I_no_reactions = precursors >> \
        sift( X.active==Q, X.neutral == 0) >> \
        select( X.estimates )
    I_no_reactions = I_no_reactions.values.flatten()[0]

    # Precursors that reacted without fragmenting: ETnoD is read from
    # `neutral`, and PTR is the remainder Q - ETnoD - active.
    prec_ETnoD_PTR_I = precursors >> \
        sift( X.active != Q ) >> \
        rename( ETnoD  = X.neutral, I = X.estimates ) >> \
        mutate( PTR    = Q - X.ETnoD - X.active ) >> \
        select( X.ETnoD, X.PTR, X.I )

    I_prec_no_frag = prec_ETnoD_PTR_I >> \
        summarize( I = X.I.sum() )
    I_prec_no_frag = I_prec_no_frag.values.flatten()[0]

    prec_ETnoD_PTR_I = prec_ETnoD_PTR_I >> mutate(
            I_PTR  = crossprod(X.PTR, X.I), \
            I_ETnoD = crossprod(X.ETnoD, X.I) ) >> \
        summarize( I_PTR = X.I_PTR.sum(), I_ETnoD = X.I_ETnoD.sum() )

    I_PTR_no_frag, I_ETnoD_no_frag = prec_ETnoD_PTR_I.values.flatten()

    prob_PTR = I_PTR_no_frag / (I_PTR_no_frag + I_ETnoD_no_frag)
    prob_ETnoD = 1. - prob_PTR

    # Total paired intensity per broken bond, and overall.
    I_frags = {
        bP: sum(I_on_fragments[bP][pairing]
                for pairing in I_on_fragments[bP])
        for bP in I_on_fragments
    }
    I_frag_total = sum(I_frags[bP] for bP in I_frags)

    # Counter yields 0 for positions without an observed fragment.
    prob_frag = Counter(
        {int(bP): I_frags[bP] / I_frag_total for bP in I_frags})
    prob_frag = [prob_frag[i] for i in range(len(precursor_fasta))]

    I_frags_PTRETnoD_total = sum(
        (Q - 1 - sum(q for cz, q in pairing)) * I_on_fragments[bP][pairing]
        for bP in I_on_fragments for pairing in I_on_fragments[bP])

    anion_meets_cation = I_frags_PTRETnoD_total + I_PTR_no_frag + I_ETnoD_no_frag
    prob_fragmentation = I_frags_PTRETnoD_total / anion_meets_cation
    prob_no_fragmentation = 1 - prob_fragmentation

    prob_no_reaction = I_no_reactions / (I_no_reactions + I_frag_total +
                                         I_prec_no_frag)
    prob_reaction = 1. - prob_no_reaction

    res = {}
    res['reaction'] = (prob_reaction, prob_no_reaction)
    res['fragmentation'] = (prob_fragmentation, prob_no_fragmentation)
    res['fragmentation_amino_acids'] = tuple(prob_frag)
    return res
def main(argv):
    """Save the top-scoring face frame per emotion for a YouTube video.

    Parses ``-y/--yturl``, ``-o/--odir`` and ``-m/--maxframes`` from
    ``argv``, collects frames with ``getNewInstances``, posts each frame to
    the emotion service through ``processRequest`` (pausing 2 seconds between
    frames) and gathers one row per (frame, face, emotion score). For every
    emotion the best frame is chosen; the cropped face
    (``Cropped_<emotion>.jpg``) and the full frame with a labelled rectangle
    (``box<emotion>.jpg``) are written to the output directory.

    Exits with status 2 on an option-parsing error, and with the default
    status after ``-h`` or when a required option is missing.

    :param argv: command-line arguments without the program name.
    """
    # Local import: numbers.Integral covers int (and long on Python 2)
    # without naming the Python-2-only `long` type.
    import numbers

    ytURL = None
    outdir = None
    maxFrames = 500
    try:
        opts, args = getopt.getopt(argv, "hy:o:m:",
                                   ["yturl=", "odir=", "maxframes="])
    except getopt.GetoptError:
        print('Error: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('help: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
            sys.exit()
        elif opt in ("-y", "--yturl"):
            print("--yturl={}".format(arg))
            ytURL = arg
        elif opt in ("-o", "--odir"):
            print("--odir={}".format(arg))
            outdir = arg
        elif opt in ("-m", "--maxframes"):
            print("--maxframes={}".format(arg))
            maxFrames = int(arg)

    if ytURL is None:
        print('bad yt: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit()

    if outdir is None:
        print('bad outdir: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit()

    if not isinstance(maxFrames, numbers.Integral):
        print('bad maxFrames: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit()

    faceDet = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_default.xml")
    faceDet2 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt2.xml")
    faceDet3 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt.xml")
    faceDet4 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt_tree.xml")

    pdata, pframes, pfacedims = getNewInstances(ytURL,
                                                faceDet,
                                                faceDet2,
                                                faceDet3,
                                                faceDet4,
                                                maxCount=maxFrames)

    headers = dict()
    headers['Ocp-Apim-Subscription-Key'] = ms_key1
    headers['Content-Type'] = 'application/octet-stream'

    # Collect plain tuples and build the frame once. The previous version
    # rebuilt and re-appended a DataFrame inside the per-score loop, which
    # added every face's rows repeatedly and relied on drop_duplicates to
    # hide it; DataFrame.append is also gone in pandas 2.0.
    score_rows = []
    for frameId, image in enumerate(pframes):
        print("posting frame %d of %d" % (frameId, len(pframes)))
        resultMS = processRequest(image, headers)
        if isinstance(resultMS, list):
            for result in resultMS:
                if isinstance(result, dict):
                    for label, conf in result['scores'].items():
                        rect = result["faceRectangle"]
                        score_rows.append(
                            (frameId, label, conf, rect['left'], rect['top'],
                             rect['width'], rect['height']))
        time.sleep(2)

    resultsDf = pd.DataFrame(score_rows,
                             columns=[
                                 "frameId", "emotionLabel", "conf",
                                 "faceleft", "facetop", "faceW", "faceH"
                             ])

    # Per emotion keep the highest-confidence rows and of those the earliest
    # frame; then per frame keep only its highest-confidence row, and order
    # the result by emotion.
    dfFaces = DplyFrame(resultsDf)
    topFaces = (
        dfFaces >> group_by(X.emotionLabel) >> sift(X.conf == X.conf.max()) >>
        sift(X.frameId == X.frameId.min()) >> ungroup() >> group_by(
            X.frameId) >> sift(X.conf == X.conf.max()) >> ungroup() >> arrange(
                X.emotionLabel))

    topFaces = topFaces.drop_duplicates()

    for i, (index, row) in enumerate(topFaces.iterrows()):
        print("saving emotion frame %d of %d" % (i, len(topFaces.index)))

        emotion = row["emotionLabel"]
        image = pframes[int(row["frameId"])]
        faceL = row["faceleft"]
        faceT = row["facetop"]
        faceW = row["faceW"]
        faceH = row["faceH"]

        # Save the cropped face before the frame is drawn on.
        imageW = image[faceT:faceT + faceH, faceL:faceL + faceW]
        cv2.imwrite(
            os.path.expanduser("%s/Cropped_%s.jpg" % (outdir, emotion)),
            imageW)

        # Mark the face and its emotion label on the full frame.
        cv2.rectangle(image, (faceL, faceT), (faceL + faceW, faceT + faceH),
                      color=(255, 0, 0),
                      thickness=5)
        cv2.putText(image, emotion, (faceL, faceT - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

        cv2.imwrite(os.path.expanduser("%s/box%s.jpg" % (outdir, emotion)),
                    image)