Example #1
0
 def test_parse_orbital(self):
     self.mam1.parse_orbital()
     self.assertEqual(self.mam1.orbital.shape[0], 28)
     self.assertTrue(np.all(pd.notnull(self.mam1.orbital)))
     self.mam2.parse_orbital()
     self.assertEqual(self.mam2.orbital.shape[0], 91)
     self.assertTrue(np.all(pd.notnull(self.mam2.orbital)))
def logistic_test_using_cosine(score_feature=False):
    logger.info('using cosine features in logistic regression')
    if score_feature:
        logger.info('also use score feature')
    Cs = [2**t for t in range(0, 10, 1)]
    Cs.extend([3**t for t in range(1, 10, 1)])
    snli2cosine = SNLI2Cosine('/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin')
    logger.info('loading snli data ...')
    train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t')
    train_df = train_df[pd.notnull(train_df.sentence2)]
    train_df = train_df[train_df.gold_label != '-']
    train_df = train_df[:(len(train_df) // 3)]  # integer division so the slice bound is an int
    train_df.reset_index(inplace=True)
    test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t')
    test_df = test_df[pd.notnull(test_df.sentence2)]
    test_df = test_df[test_df.gold_label != '-']
    test_df.reset_index(inplace=True)
    X_train, train_labels, X_test, test_labels = snli2cosine.calculate_cosine_features(train_df, test_df)
    if score_feature:
        y_train_proba, y_test_proba = joblib.load('./snli/logistic_score_snli.pkl')
        # y_train_proba = y_train_proba.flatten()
        # y_test_proba = y_test_proba.flatten()
        X_train = np.concatenate([X_train, y_train_proba.reshape((-1, 1))], axis=1)
        X_test = np.concatenate([X_test, y_test_proba.reshape((-1, 1))], axis=1)
    logger.info('X_train.shape: {0}'.format(X_train.shape))
    logger.info('X_test.shape: {0}'.format(X_test.shape))

    logreg = LogisticRegressionCV(Cs=Cs, cv=3, n_jobs=10, random_state=919)
    logreg.fit(X_train, train_labels)
    logger.info('best C is {0}'.format(logreg.C_))
    y_test_predicted = logreg.predict(X_test)
    acc = accuracy_score(test_labels, y_test_predicted)
    logger.info('test data predicted accuracy: {0}'.format(acc))
Example #3
0
def read_tcr(filename, organism, chains, epitope_col):
    tcr_col = ['cdr1.alpha', 'cdr2.alpha', 'cdr2.5.alpha', 'cdr3.alpha',
               'cdr1.beta', 'cdr2.beta', 'cdr2.5.beta', 'cdr3.beta',
               'v.alpha', 'v.beta', epitope_col, 'species']

    all_tcrs = pd.read_table(filename, usecols=tcr_col)
    all_tcrs = all_tcrs[all_tcrs['species'] == organism]

    for chain in chains:
        if chain == 'A':
            all_tcrs = all_tcrs[pd.notnull(all_tcrs['v.alpha'])]
            all_tcrs = all_tcrs[pd.notnull(all_tcrs['cdr3.alpha'])]
            all_tcrs = all_tcrs[all_tcrs['cdr3.alpha'].str.len() > 5]
        elif chain == 'B':
            all_tcrs = all_tcrs[pd.notnull(all_tcrs['v.beta'])]
            all_tcrs = all_tcrs[pd.notnull(all_tcrs['cdr3.beta'])]
            all_tcrs = all_tcrs[all_tcrs['cdr3.beta'].str.len() > 5]

    all_tcrs['v_alpha_rep'] = all_tcrs.loc[:, 'v.alpha'].map(
        ch_cdr3s_human.all_loopseq_representative[organism.lower()])
    all_tcrs['v_beta_rep'] = all_tcrs.loc[:, 'v.beta'].map(ch_cdr3s_human.all_loopseq_representative[organism.lower()])
    all_tcrs['tcr_info'] = list(zip(all_tcrs.v_alpha_rep.str.split(','), all_tcrs.v_beta_rep.str.split(','),
                                    all_tcrs['cdr3.alpha'], all_tcrs['cdr3.beta']))

    all_tcrs = all_tcrs.drop_duplicates(subset=['v_alpha_rep', 'v_beta_rep', 'cdr3.alpha', 'cdr3.beta', epitope_col],
                                        keep='first') #remove duplicates
    all_tcrs = all_tcrs.drop_duplicates(subset=['v_alpha_rep', 'v_beta_rep', 'cdr3.alpha', 'cdr3.beta'], keep=False) #remove crossreactivity

    all_tcrs = all_tcrs.reset_index(drop=True)

    return all_tcrs
Example #4
0
 def test_parse_momatrix(self):
     self.mam1.parse_momatrix()
     self.assertEqual(self.mam1.momatrix.shape[0], 784)
     self.assertTrue(np.all(pd.notnull(self.mam1.momatrix)))
     self.mam2.parse_momatrix()
     self.assertEqual(self.mam2.momatrix.shape[0], 8281)
     self.assertTrue(np.all(pd.notnull(self.mam2.momatrix)))
    def test_update_info_intercept_norm_value(self):
        calc = self.calculated_res['intercept_norm_value']
        calc = calc[pd.notnull(calc)]
        exp = self.expected_res['intercept_norm_value']
        exp = exp[pd.notnull(exp)]

        assert_equal(calc.to_dict(), exp.to_dict())
Example #6
0
def get_prediction_summary(data_pred_df, pred_cols=None, do_print=True, transpose=True, percentiles=None):
    data_pred_df = load_if_str(data_pred_df)

    if pred_cols is None:
        pred_cols = data_pred_df.columns

    if percentiles is None:
        percentiles = []

    pred_summary = data_pred_df.describe(percentiles=percentiles)

    data_all_out = load_data('data_all_out')
    data_all_out = data_all_out[pd.notnull(data_all_out[TARGET_COL])]
    data_pred_df_actual = pd.merge(left=data_all_out, right=data_pred_df, left_index=True, right_index=True)
    if len(data_pred_df_actual) > 0:
        score_ix = len(pred_summary)
        for pred_col in pred_cols:
            try:
                pred_sel = pd.notnull(data_pred_df_actual[pred_col])
                score = auc(actual=data_pred_df_actual.ix[pred_sel, TARGET_COL],
                            pred=data_pred_df_actual.ix[pred_sel, pred_col].round(decimals=ROUND_PRED))
            except ValueError:
                score = np.nan
            pred_summary.loc[score_ix, pred_col] = score
        pred_summary.index = list(pred_summary.index[:-1]) + ['auc']
    if transpose:
        pred_summary = pred_summary.transpose()
    if do_print:
        get_log().info('\nPrediction summary:\n%s' % pred_summary.to_string())
    else:
        return pred_summary
def put_rainfall_to_dataframe(df_10min, lrf_var_all, infd_lst):
     
    for dfidx in df_10min.index:
        # Put first site in site list
        for slidx in range(len(infd_lst)):
            if pd.Timestamp(dfidx) in lrf_var_all[slidx]:
                df_10min[infd_lst[slidx]][dfidx] = \
                    lrf_var_all[slidx][pd.Timestamp(dfidx)]
                                            
        # Fill AS1 if there is missing data from AS2.
        # With more sites, AD, ALG, the missing data number are similar.
        
        if pd.isnull(df_10min[infd_lst[0]][dfidx]):
            if pd.notnull(df_10min[infd_lst[1]][dfidx]):
                df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[1]][dfidx])
            elif pd.notnull(df_10min[infd_lst[2]][dfidx]):
                df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[2]][dfidx])
            elif pd.notnull(df_10min[infd_lst[3]][dfidx]):
                df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[3]][dfidx])                
            elif pd.notnull(df_10min[infd_lst[4]][dfidx]):
                df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[4]][dfidx])
            else:
                df_10min[infd_lst[0]][dfidx] = df_10min[infd_lst[0]][dfidx]
               
    return df_10min
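# A hedged alternative sketch (toy data, not from the original project): the same
# "fall back to the next site" fill as the elif chain above can be written
# column-wise with Series.combine_first, which keeps existing values and only
# fills the NaN positions. The column names here are hypothetical.
import numpy as np
import pandas as pd

rain = pd.DataFrame({
    'AS1': [1.2, np.nan, np.nan, np.nan],
    'AS2': [1.0, 2.0, np.nan, np.nan],
    'AD':  [0.9, 1.9, 3.1, np.nan],
})

# Fill AS1 from AS2 first, then from AD; extend with more .combine_first calls
# for the remaining sites in infd_lst.
rain['AS1'] = rain['AS1'].combine_first(rain['AS2']).combine_first(rain['AD'])
print(rain['AS1'].tolist())  # [1.2, 2.0, 3.1, nan]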
Example #8
0
def delete_empty_rows(df, column_list):
    """
    The data might contain some rows with empty (NaN) values.
    This function deletes those rows.

    Input:
        df: pandas DataFrame
        column_list: list of strings, each item a column name in df
    Output:
        df: pandas DataFrame
    """
    empty_rows=[]
    len_1 = len(df)
    if isinstance(column_list, str):
        df = df[pd.notnull(df[column_list])]
    elif isinstance(column_list, list):
        for column in column_list:
            # capture the rows about to be dropped, then drop them
            empty_rows.append(df[pd.isnull(df[column])])
            df = df[pd.notnull(df[column])]
    else:
        raise ValueError("Unsupported input!")
    if len_1 - len(df) > 0:
        note = "{0} rows deleted because containing empty value in column {1}."
        print(note.format(len_1 - len(df), str(column_list)))
        print(empty_rows)
    return df
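# A hedged usage sketch for delete_empty_rows (toy data, not from the original
# project): any row with a NaN in one of the listed columns is dropped.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'name': ['a', None, 'c'],
                    'score': [1.0, 2.0, np.nan]})
cleaned = delete_empty_rows(toy, ['name', 'score'])
print(len(cleaned))  # -> 1, only the fully populated first row survives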
Example #9
0
    def test_basic(self):

        # array or list of dates
        N = 50
        rng = date_range('1/1/1990', periods=N, freq='53s')
        ts = Series(np.random.randn(N), index=rng)
        ts[15:30] = np.nan
        dates = date_range('1/1/1990', periods=N * 3, freq='25s')

        result = ts.asof(dates)
        assert notnull(result).all()
        lb = ts.index[14]
        ub = ts.index[30]

        result = ts.asof(list(dates))
        assert notnull(result).all()
        lb = ts.index[14]
        ub = ts.index[30]

        mask = (result.index >= lb) & (result.index < ub)
        rs = result[mask]
        assert (rs == ts[lb]).all()

        val = result[result.index[result.index >= ub][0]]
        assert ts[ub] == val
Example #10
0
	def snap_largest_volume(self, product, sdate, nn=10, midcurves=True):
		"""Snap options grid and get the 10 most traded by volume for each month.
		"""
		self.reset(product, sdate)
		
		if self.livedata:
			ffields = ['BID', 'ASK', 'VOLUME']
			_columns = ['BID', 'BID_VOL', 'ASK', 'ASK_VOL', 'VOLUME']
			
		else:
			ffields = ['PX_SETTLE', 'VOLUME']
			_columns = ['SETTLE', 'SETTLE_VOL', 'VOLUME']
		
		fdata = pd.DataFrame.from_records(self.futures_chain(), columns=['mon', 'last_trade'], index='mon')
		fdata.insert(0, 'ticker', [self.product+mm+' '+self.suffix for mm in fdata.index])
		fdata = pd.merge(fdata, self.pb.fetch(fdata.ticker, ffields, self.date), left_on='ticker', \
			right_index=True, how='outer')
			
		if not 'VOLUME' in fdata:
			raise Exception("No futures volume on " + sdate.strftime("%Y-%m-%d"))
			
		optr = self.options_chain()
		
		# discard months with no volume
		optr = {k:v for k,v in optr.items() if pd.notnull(fdata.VOLUME[v.undl])}
		
		if self.livedata:
			self.calc_atm_vols(optr, 0.5*(fdata.BID+fdata.ASK))
		else:
			self.calc_atm_vols(optr, fdata.PX_SETTLE)
		
		self.get_options_by_volume(optr, nn)
		
		if pd.notnull(self.xs.midcurves) and midcurves:
			mcoptr = self.midcurves_chain()
			mcre = re.compile('^(?P<mm>\d' + self.xs.midcurves[1] + \
				'[FGHJKMNQUVXZ]\d{1})(?P<type>[PC])\s(?P<k>\d+(\.\d+)?) ' + self.suffix + '$')
			if self.livedata:
				mcoptr = {k:v for k,v in mcoptr.items() if pd.notnull(fdata.BID[v.undl])
					and pd.notnull(fdata.ASK[v.undl])}
				self.calc_atm_vols(mcoptr, 0.5*(fdata.BID+fdata.ASK), '', mcre)
			else:
				mcoptr = {k:v for k,v in mcoptr.items() if pd.notnull(fdata.VOLUME[v.undl])}
				self.calc_atm_vols(mcoptr, fdata.PX_SETTLE, '', mcre)
				
			self.get_options_by_volume(mcoptr, nn, 0.05, '', mcre)
		else:
			mcoptr = []
		
		idx = pd.MultiIndex.from_tuples( \
			[(mm,kk[0]) for mm in sorted(optr) for kk in optr[mm].data] \
				+ [(mc,kk[0]) for mc in sorted(mcoptr) for kk in mcoptr[mc].data], \
			names=['month', 'strike'])	
		odata = pd.DataFrame( \
			[list(x[2:]) for mm in sorted(optr) for x in optr[mm].data]
				+ [list(x[2:]) for mc in sorted(mcoptr) for x in mcoptr[mc].data], \
			columns = _columns, \
			index = idx)
			
		return fdata, odata
Example #11
0
    def cleanupforanalysis(self, df_orig, col_sample_an, col_background_an, col_background_int):
        '''
        remove NaNs, remove duplicates, split protein groups, remove splice variant appendix
        create 2 DataFrames
        self.df_all: columns = [sample_ans, background_ans]
        --> contains all AccessionNumbers regardless if intensity values present or not
        self.df_int: columns = [sample_ans, background_ans, intensity]
        --> only if intensity value given
        :return: None
        '''
        self.sample_ser = df_orig[col_sample_an]
        self.background_df = df_orig[[col_background_an, col_background_int]]

        # remove duplicate AccessionNumbers and NaNs from samplefrequency and backgroundfrequency AN-cols
        cond = pd.notnull(self.sample_ser)
        self.sample_ser = self.sample_ser.loc[cond, ].drop_duplicates()
        cond = pd.notnull(self.background_df[col_background_an])
        self.background_df = self.background_df.loc[cond, [col_background_an, col_background_int]].drop_duplicates(subset=col_background_an)

        # split AccessionNumber column into multiple rows P63261;I3L4N8;I3L1U9;I3L3I0 --> 4 rows of values
        # remove splice variant appendix from AccessionNumbers (if present) P04406-2 --> P04406
        self.sample_ser = self.removeSpliceVariants_takeFirstEntryProteinGroups_Series(self.sample_ser)
        self.background_df = self.removeSpliceVariants_takeFirstEntryProteinGrous_DataFrame(self.background_df, col_background_an, col_background_int)

        # remove duplicate AccessionNumbers and NaNs from samplefrequency and backgroundfrequency AN-cols
        cond = pd.notnull(self.sample_ser)
        self.sample_ser = self.sample_ser.loc[cond, ].drop_duplicates()
        cond = pd.notnull(self.background_df[col_background_an])
        self.background_df = self.background_df.loc[cond, [col_background_an, col_background_int]].drop_duplicates(subset=col_background_an)

        # concatenate data
        self.df_all = self.concat_and_align_sample_and_background(self.sample_ser, self.background_df)

        # remove AccessionNumbers from sample and background-frequency without intensity values
        self.df_int  = self.df_all.loc[pd.notnull(self.df_all[col_background_int]), ]
Example #12
0
    def line(self, x, y, label='', alpha=1.0, add_legend=True, color_from=None, color=None, dashed=False):
        """
        Add a line to the chart object.
        Input:
            x: pandas.Series or list, containing datetime.date objects or strings of the form: 'YYYY-mm_dd'.
            y: pandas.Series or list, containing numerical values.
            label: string, to be used in the legend and tooltip.
            alpha: float, opacity: [0.0, 1.0].
            add_legend: boolean, either adds or removes this line from the legend.
            color_from: string, using the label from another line you can copy its color onto this line.
        """
        # pandas.Series to list
        if isinstance(x, s.Series):
            x = x.where((notnull(x)), None)
            x = x.tolist()
        # datetime.date to str
        if isinstance(x[0], date):
            x = [str(dt) for dt in x]
        # pandas.Series to list
        if isinstance(y, s.Series):
            y = y.where((notnull(y)), None)
            y = y.tolist()


        if not label:
            add_legend = False

        kwargs = {'alpha':alpha, 'add_legend':add_legend, 'color':color, 'dashed':dashed}
        if color_from:
            kwargs['color_from'] = color_from

        curr_line = """ch.line({x}, {y}, '{label}', {kwargs});""".format(x=json.dumps(x), y=json.dumps(y), label=label, kwargs=json.dumps(kwargs))
        self.lines.append(curr_line)
        return self.render_js()
Example #13
0
    def process_missing_data(self, missing='drop'):
        """ Process rows in item array that contain missing values.
        Args:
            missing (string): Method for dealing with missing values. Options:
                'drop': Drop any subjects with at least one missing item
                'impute': Impute the mean for that item across all subjects
        """
        if missing == 'drop':
            inds = pd.notnull(self.X).all(1).nonzero()[0]
            if self.y is not None:
                inds = np.intersect1d(inds, pd.notnull(self.y).all(1).nonzero()[0])
            n_missing = len(self.X) - len(inds)

            if n_missing:
                # Slice and reindex X and y
                self.X = self.X.ix[inds]
                if self.y is not None:
                    self.y = self.y.ix[inds]
                logger.info('Found and deleted %d subjects with missing data.' % n_missing)

        # Imputation. Note that we don't impute the y values, because these should really be 
        # inspected and validated by the user before abbreviating.
        elif missing == 'impute':
            self.X = self.X.apply(lambda x: x.fillna(x.mean()), axis=0)
            # self.y = self.y.apply(lambda x: x.fillna(x.mean()), axis=0)

        self.n_subjects = len(self.X)
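# A minimal sketch of the 'impute' branch above on toy data: each column's NaNs
# are replaced with that column's mean (the y values are deliberately left alone).
import numpy as np
import pandas as pd

X = pd.DataFrame({'item1': [1.0, np.nan, 3.0],
                  'item2': [np.nan, 4.0, 6.0]})
X_imputed = X.apply(lambda col: col.fillna(col.mean()), axis=0)
print(X_imputed['item1'].tolist(), X_imputed['item2'].tolist())  # [1.0, 2.0, 3.0] [5.0, 4.0, 6.0]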
Example #14
0
def load_markers():
    df_cities = pd.read_csv("static/data/cities.csv", encoding="cp1255")
    df_acc = pd.concat(pd.read_csv(filename, encoding="cp1255") for filename in
                       glob("static/data/lms/Accidents Type */*/*AccData.csv"))
    df_acc = df_acc[df_acc.SEMEL_YISHUV > 0]
    groups = df_acc.groupby(["SEMEL_YISHUV", "HUMRAT_TEUNA"], as_index=False)
    df_size = groups.size()
    df_size_total = df_acc.groupby("SEMEL_YISHUV", as_index=False).size()
    max_size = df_size_total.max()
    df = groups.mean()
    df = pd.merge(df, df_cities, left_on="SEMEL_YISHUV", right_on="SEMEL")
    df = df[pd.notnull(df.X) & pd.notnull(df.Y) & (df_size_total > 1)]
    app.markers = []
    for index, row in df.iterrows():
        lng, lat = coordinates_converter.convert(row.X, row.Y)
        size = 30 * np.log(1.25 + df_size_total[row.SEMEL_YISHUV] / float(max_size))
        size_per_severity = df_size[row.SEMEL_YISHUV]
        color = max(0, 200 - 200 * (size_per_severity.get(1, 0) +
                                    size_per_severity.get(2, 0)) /
                    size_per_severity.get(3, 1))
        print(size)
        app.markers.append({
            "lat": lat,
            "lng": lng,
            "title": row.NAME,
            "size": size,
            "color": color
        })
Example #15
0
    def get_Gallup_country_lookups(verbose=True):
        """ Kosovo is the only GWP country not matched to a 3-letter ISO code. Let's ignore it.
        """
        dfr = pd.read_table(__local_input_path__+'GallupWorldPoll-region-country.tsv').rename(columns={'country':'rcountry'})
        dfr['lccountry'] = dfr.rcountry.str.lower()
        dfr = dfr.set_index('lccountry')
        dfw = pd.read_table(__local_input_path__+'GallupWorldPoll-WP5-defs-2016.tsv').rename(columns={'country':'wcountry'})
        dfw['lccountry'] = dfw.wcountry.str.lower()
        dfw = dfw.set_index('lccountry')
        wp5s = pd.read_table(__local_input_path__ +'countrycode_main.tsv',  skiprows=3).set_index('country_GWP3_wp5')
        wp5s = wp5s[['countryCode_GWP3_wp5', 'countryCode_ISO3','country_bestShortName','country_bestName','twoletter_AlexShultz_svg']]
        df= wp5s.join(dfr).join(dfw).rename(columns = {'countryCode_ISO3':'ISO',})
        df.index.name = 'country'
        assert 'South Africa'.lower() in dfr.rcountry
        assert 'South Africa'.lower() in df.index


        # Now several checks:
        # Did regions get their ISO?
        problems = {
            ' Published WHR country lacks an ISO: ': df[pd.notnull(df.rcountry) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','rcountry']],
            ' Published WHR country lacks a WP5: ': df[pd.notnull(df.rcountry) & pd.isnull(df.WP5)],
            ' Published WHR country lacks a map code: ': df[pd.notnull(df.rcountry) & pd.isnull(df.twoletter_AlexShultz_svg)],
            ' Old Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.countryCode_GWP3_wp5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']],
            ' 2016 Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.WP5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']],
        }

        if verbose:
            for tt,dd in problems.items():
                if not dd.empty:
                    print('\n\n -- country_tools WARNING: '+tt)
                    print(dd)
        return df.reset_index()
def process_df(df, file, ax):
    duration_val = []
    for index, row in enumerate(df['days_to_death']):
        if row == '[Not Applicable]':
            duration_val.append(df['days_to_last_followup'][index])
        else:
            duration_val.append(row)
    df['duration'] = duration_val
    df['duration'] = pd.to_numeric(df['duration'], errors='coerce')  # coerce non-numeric durations to NaN
    vital_status = []
    for row in df['vital_status']:
        if row not in ['Alive', 'Dead']:
            vital_status.append(None)
        else:
            vital_status.append(row)
    df['vital_status'] = vital_status
    df['SARS'] = df['SARS'].dropna()
    df = df[pd.notnull(df['duration'])]
    df = df[pd.notnull(df['SARS'])]
    df = df[pd.notnull(df['vital_status'])]
    lst = df['SARS'].tolist()
    q1 = np.percentile(lst, 33.33)
    q2 = np.percentile(lst,66.66)
    df1 = df[df['SARS']<=q1]
    df2 = df[(df['SARS']>q1) & (df['SARS'] <= q2)]
    df3 = df[df['SARS']>q2]
    plot_km(df, ax, '', file, "q1")
    ax.get_figure().savefig(result_dir+file+'_kmplot(samples='+str(len(df.index))+').png')
def prepare_data(subdata):

	subdata.ix[notnull(subdata['share']),'cost']=subdata.ix[notnull(subdata['share']),'Pop']
	subdata.ix[notnull(subdata['share']),'costlog']=np.log(subdata.ix[notnull(subdata['share']),'cost'])

	### predicts missing water level data points

	formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel=sm.ols(formula,data=subdata).fit()
	predictions=olsmodel.predict(subdata)
	subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values]

	formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel2=sm.ols(formula,data=subdata).fit()
	res2=olsmodel2.params
	predictions2=olsmodel2.predict(subdata)
	subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values]

	### predicts damages based on a few points using water level
	subdata['log{}'.format(varin1)]=np.log(subdata[varin1])
	subdata['log{}'.format(varin2)]=np.log(subdata[varin2])

	formula="costlog ~ log{}".format(varin1)
	damagemodel=sm.ols(formula,data=subdata).fit()
	predicted_damages=damagemodel.predict(subdata)
	subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values]
	subdata['popestimated']=np.exp(subdata['costlog'])
	return subdata
Example #18
0
def combine_basic_detailed(basic, detailed):
    """
    Combine the basic and detailed player information from BBR
    Input:
        basic - dict of basic player info (identified by bbr_id)
        detailed - dict of detailed player info (also identified by bbr_id)
    Output:
        A dict that has parsed certain keys and removed redundant keys
    """
    combined = pd.merge(pd.DataFrame(basic), pd.DataFrame(detailed), how='outer', on='bbr_id')    
    # Parse birth date and location
    tmp_born = pd.DataFrame([str(x).split(' in ') for x in combined['born_dets']]) # TO DO: Make this less hard-coded
    combined['birth_date'], combined['birth_loc'] = tmp_born.ix[:,0].str.strip(), tmp_born.ix[:,1].str.strip()
    # Convert height to inches
    combined['height_in'] = combined['height_dets'].str.split('-').apply(lambda x: int(x[0])*12 + int(x[1]))
    # Parse draft details
    tmp_draft = combined.draft_dets[pd.notnull(combined.draft_dets)].apply(parse_nba_draft_dets).apply(pd.Series)
    tmp_draft.columns = ['draft_team','draft_round','draft_pick_round','draft_pick_overall', 'draft_year']
    combined = combined.join(tmp_draft)
    # Parse Hall of Fame details
    tmp_hof = combined.hall_of_fame_dets[pd.notnull(combined.hall_of_fame_dets)].apply(parse_nba_hof_dets).apply(pd.Series)
    tmp_hof.rename(columns={'Coach':'hof_coach','Contributor':'hof_contributor','Player':'hof_player'}, inplace=True)
    combined = combined.join(tmp_hof)
    # Return parsed/non-redundant columns
    combined.rename(columns={'pos':'position','wt':'weight_lbs','high_school_dets':'high_school','nba_debut_dets':'nba_debut','shoots_dets':'shoots'}, inplace=True)
    combined = combined[BBR_NBA_PLAYER_COLS]
    # Scrub missing values to None
    combined = combined.where(pd.notnull(combined), None)
    return [list(row) for idx, row in combined.iterrows()]
Example #19
0
    def ImportCleanData(self):   
        """ Import and clean the data by removing ratings observations for restaurants that
        aren't yet rated or have a z or p score. """
 
        # Import the restaurant grades data. 
        try:
            self.grades = pd.read_csv('grades.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
        except:
            raise InvalidInputError("Problem in reading in the restaurant data.")   
 
        # Just Select the variables we need
        self.grades = self.grades[['GRADE','CAMIS','INSPECTION DATE', 'BORO']]
        
        # Convert date to date time variable.
        self.grades = self.grades.loc[~self.grades['INSPECTION DATE'].isin(['01/01/1900'])]
        self.grades['INSPECTION DATE'] = pd.to_datetime(self.grades['INSPECTION DATE'])

        # Drop rows that have missing values.
        self.grades = self.grades[pd.notnull(self.grades['GRADE'])]
        self.grades = self.grades[pd.notnull(self.grades['BORO'])]
        self.grades = self.grades[pd.notnull(self.grades['INSPECTION DATE'])]
             
        # Drop rows where the grade has not been given yet.
        self.grades = self.grades.loc[~self.grades['GRADE'].isin(['Not Yet Graded', 'P', 'Z'])]
                
        # Drop rows where the borough info is missing.
        self.grades = self.grades.loc[~self.grades['BORO'].isin(['Missing'])]
   
        # Drop duplicated (same restaurant and same date) inspection records.
        self.grades.drop_duplicates(['CAMIS','INSPECTION DATE','GRADE'], take_last=True, inplace=True)
        
        # Sort the data
        self.grades = self.grades.sort(['BORO','CAMIS','INSPECTION DATE'], ascending=[1,1,1])
        return self.grades
Example #20
0
 def createUniqueId( longData ):
     import numpy as np
     if pd.notnull(longData['user_id']) and pd.notnull(longData['course_id']):
         uid = str(longData['user_id']) + str('__') + str(longData['course_id'])
     else:
         uid = 'NULL'
     return uid
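# A hedged usage sketch (toy data): createUniqueId works on a single row, so it
# is typically applied row-wise with DataFrame.apply(axis=1). The ids below are
# made up.
import numpy as np
import pandas as pd

enrollments = pd.DataFrame({'user_id': ['101', '102'],
                            'course_id': ['MITx/6.00x', np.nan]})
enrollments['uid'] = enrollments.apply(createUniqueId, axis=1)
print(enrollments['uid'].tolist())  # ['101__MITx/6.00x', 'NULL']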
Example #21
0
    def get_taps_mazs(self, maz, attribute=None, filter=None):

        # we return multiple tap rows for each maz, so we add an 'idx' row to tell caller
        # which maz-taz rows belong to which row in the original maz list
        # i.e. idx contains the index of the original maz series so we know which
        # rows belong together
        # if maz is a series, then idx has the original maz series index values
        # otherwise it has the 0-based integer offset of the original maz

        if filter:
            maz2tap_df = self.maz2tap_df[pd.notnull(self.maz2tap_df[filter])]
        else:
            maz2tap_df = self.maz2tap_df

        if attribute:
            # FIXME - not sure anyone needs this feature
            maz2tap_df = maz2tap_df[['MAZ', 'TAP', attribute]]
            # filter out null attribute rows
            maz2tap_df = maz2tap_df[pd.notnull(maz2tap_df[attribute])]
        else:
            maz2tap_df = maz2tap_df[['MAZ', 'TAP']]

        if isinstance(maz, pd.Series):
            # idx based on index of original maz series
            maz_df = pd.DataFrame({'MAZ': maz, 'idx': maz.index})
        else:
            # 0-based index of original maz
            maz_df = pd.DataFrame({'MAZ': maz, 'idx': range(len(maz))})

        df = pd.merge(maz_df, maz2tap_df, how="inner", sort=False)

        return df
Example #22
0
def phonetic(s, method):
    """
    Phonetically encode the values in the Series. 

    :param method: The algorithm that is used to phonetically encode the values. The possible options are 'soundex' (`wikipedia <https://en.wikipedia.org/wiki/Soundex>`_) and 'nysiis' (`wikipedia <https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System>`_). 
    :type method: str

    :return: A Series with phonetic encoded values.
    :rtype: pandas.Series

    .. note::

        The 'soundex' and 'nysiis' algorithms use the package 'jellyfish'. 
        It can be installed with pip (``pip install jellyfish``).

    """


    try:
        import jellyfish
    except ImportError:
        print ("Install jellyfish to use string encoding.")

    s = clean(s, replace_by_none='[^\-\_A-Za-z0-9]+')
 
    if method == 'soundex':
        return s.str.upper().apply(lambda x: jellyfish.soundex(x) if pandas.notnull(x) else np.nan)

    elif method == 'nysiis':
        return s.str.upper().apply(lambda x: jellyfish.nysiis(x) if pandas.notnull(x) else np.nan)

    else:
        raise Exception("Phonetic encoding method not found")
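# A minimal, self-contained sketch of the same idea using jellyfish directly
# (toy names, requires `pip install jellyfish`): null values are passed through
# as NaN instead of being encoded.
import numpy as np
import pandas as pd
import jellyfish

names = pd.Series(['Smith', 'Smyth', np.nan])
encoded = names.str.upper().apply(lambda x: jellyfish.soundex(x) if pd.notnull(x) else np.nan)
print(encoded.tolist())  # ['S530', 'S530', nan] -- 'Smith' and 'Smyth' share a Soundex code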
Example #23
0
    def GetJoinedDf(self):
        run_id = 12332 #dstore.Get("current_run")
            
        # DB df 
        #[u'state', u'id', u'run_id', u'user_id', u'cell', u'time_raw', u'time1', u'lap1', u'time2', u'lap2', u'time3', u'lap3', u'un1', u'un2', u'un3', u'us1']            
        self.joinedDf = psql.read_sql(\
                                "SELECT * FROM times" +\
                                " WHERE (times.run_id = "+ str(run_id ) +")"\
                                , self.db)                  
        
        #set index = id
        self.joinedDf.set_index('id',  drop=False, inplace = True)            
        
        #replace nan with None            
        self.joinedDf = self.joinedDf.where(pd.notnull(self.joinedDf), None)                    

        
        if(self.dstore.GetItem("racesettings-app", ['rfid']) == 2):
            tDf = psql.read_sql("SELECT * FROM tags", self.db, index_col = "id")   
            tDf = tDf[["user_nr", "tag_id"]]
            self.joinedDf =  pd.merge(self.joinedDf,  tDf, left_on='user_id', right_on='tag_id', how="left")
            self.joinedDf =  pd.merge(self.joinedDf,  self.ucDf, left_on='user_nr', right_on='nr',  how="left")
            self.joinedDf.set_index('id',  drop=False, inplace = True) 
        else:
            self.joinedDf =  pd.merge(self.joinedDf,  self.ucDf, left_on='user_id', right_index=True, how="left")
        

        self.joinedDf.sort("time_raw", inplace=True)            
        
        #replace nan with None
        self.joinedDf = self.joinedDf.where(pd.notnull(self.joinedDf), None)
                
        return self.joinedDf
Example #24
0
def _get_project_data(project_df, project_attribute_keys_data):
    """ Yields a project record and project attribute records grouped by project
    """
    for i, row in project_df.iterrows():
        project_data = {
            "project_id": row.project_id,
            "zipcode": str(row.zipcode) if pd.notnull(row.zipcode) else None,
            "weather_station": str(row.weather_station) if pd.notnull(row.weather_station) else None,
            "latitude": row.latitude if pd.notnull(row.latitude) else None,
            "longitude": row.longitude if pd.notnull(row.longitude) else None,
            "baseline_period_start": None,
            "reporting_period_end": None,
        }

        assert pd.notnull(project_data["project_id"])

        baseline_period_end_localized = pytz.UTC.localize(row.baseline_period_end)
        if pd.isnull(baseline_period_end_localized):
            project_data["baseline_period_end"] = None
        else:
            project_data["baseline_period_end"] = baseline_period_end_localized.strftime("%Y-%m-%dT%H:%M:%S%z")

        reporting_period_start_localized = pytz.UTC.localize(row.reporting_period_start)
        if pd.isnull(reporting_period_start_localized):
            project_data["reporting_period_start"] = None
        else:
            project_data["reporting_period_start"] = reporting_period_start_localized.strftime("%Y-%m-%dT%H:%M:%S%z")

        project_attributes_data = []
        for project_attribute_key_data in project_attribute_keys_data:
            project_attribute_data = _get_project_attribute_data(row, project_attribute_key_data)
            project_attributes_data.append(project_attribute_data)

        yield project_data, project_attributes_data
Example #25
0
def load_data(input_file):

    unii = pd.read_csv(input_file, sep='\t', low_memory=False, dtype=str)
    unii.rename(columns={'MF': 'molecular_formula',
                         'PT': 'preferred_term',
                         'RN': 'registry_number'}, inplace=True)
    unii.columns = unii.columns.str.lower()

    # half of them don't have inchikeys
    # set the primary key to inchikey and fill in missing ones with unii
    unii['_id'] = unii.inchikey
    unii['_id'].fillna(unii.unii, inplace=True)

    dupes = set(unii._id) - set(unii._id.drop_duplicates(False))
    records = [{k:v for k,v in record.items() if pd.notnull(v)} for record in unii.to_dict("records") if record['_id'] not in dupes]
    records = [{'_id': record['_id'], 'unii': record} for record in records]
    # take care of a couple cases with identical inchikeys
    for dupe in dupes:
        dr = unii.query("_id == @dupe").to_dict("records")
        dr = [{k:v for k,v in record.items() if pd.notnull(v)} for record in dr]
        records.append({'_id': dupe, 'unii': dr})
    for record in records:
        if isinstance(record['unii'], dict):
            del record['unii']['_id']
        else:
            for subr in record['unii']:
                del subr['_id']
        yield record
Example #26
0
 def test_parse_frame(self):
     self.mam1.parse_frame()
     self.assertEqual(self.mam1.frame.shape[0], 1)
     self.assertTrue(np.all(pd.notnull(self.mam1.frame)))
     self.mam2.parse_frame()
     self.assertEqual(self.mam2.frame.shape[0], 1)
     self.assertTrue(np.all(pd.notnull(self.mam2.frame)))
def parse_concepts_from_sheet(graph, vocabulary_name, sheet_data):
    """Parse vocabulary concepts from spreadsheet and into a graph."""

    base_uri = uri_prefix + vocabulary_name

    for index, row in sheet_data.iterrows():

        concept = URIRef(base_uri + uri_common_part + str(index))
        graph.add((concept, RDF.type, SKOS.Concept))
        graph.add((concept, SKOS.inScheme, URIRef(base_uri)))

        graph.add((concept, SKOS.topConceptOf, URIRef(base_uri)))
        graph.add((URIRef(base_uri), SKOS.hasTopConcept, concept))

        graph.add((concept, SKOS.prefLabel, Literal(row['Suomeksi'].rstrip(), lang='fi')))
        graph.add((concept, SKOS.prefLabel, Literal(row['Englanniksi'].rstrip(), lang='en')))
        graph.add((concept, SKOS.prefLabel, Literal(row['Ruotsiksi'].rstrip(), lang='sv')))

        if pandas.notnull(row[u'Synonyymi (YSO)']):
            graph.add((concept, SKOS.exactMatch, URIRef(str(row['Synonyymi (YSO)']))))

        if pandas.notnull(row[u'Läheinen käsite']):
            graph.add((concept, SKOS.closeMatch, URIRef(str(row[u'Läheinen käsite']))))

    return
def format_json_tweets(unprocessed_tweets):
    """
    This function accepts a list of json-formatted tweets.
    It stores them as a Pandas' DataFrame preserving only the content of the tweet, i.e. the text,
    and its time of creation.
    It will apply a cleanup to the text in order to remove potentially harmful characters, such as
    unicode characters or escape characters.

    Parameters
    ----------
    unprocessed_tweets : list
        It is a list of json-formatted tweets with fields such as text, created_at, geo, is_translator.
        Each line contains one tweet.

    Returns
    -------
    df : pandas.DataFrame
        df is a DataFrame that contains the content and timestamp of each tweet.
    """

    df = pd.DataFrame(json.loads(line) for line in unprocessed_tweets)

    # Remove unwanted data
    df = df[['text', 'created_at']]
    df = df[ pd.notnull(df['text']) ]
    df = df[ pd.notnull(df['created_at']) ]

    # Remove unicode characters
    df['text'] = df['text'].apply(removeUnicode)

    # Replace escape characters
    df['text'] = df['text'].apply(replaceEscape)

    return df
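# The removeUnicode and replaceEscape helpers are not shown above; a minimal
# sketch of what they might look like (an assumption, not the original code):
import re

def removeUnicode(text):
    # Drop any non-ASCII characters (emoji, smart quotes, ...).
    return text.encode('ascii', 'ignore').decode('ascii')

def replaceEscape(text):
    # Collapse newlines, tabs and carriage returns into single spaces.
    return re.sub(r'[\n\r\t]+', ' ', text)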
Example #29
0
 def test_parse_basis_set_order(self):
     self.mam1.parse_basis_set_order()
     self.assertEqual(self.mam1.basis_set_order.shape[0], 28)
     self.assertTrue(np.all(pd.notnull(self.mam1.basis_set_order)))
     self.mam2.parse_basis_set_order()
     self.assertEqual(self.mam2.basis_set_order.shape[0], 91)
     self.assertTrue(np.all(pd.notnull(self.mam2.basis_set_order)))
Example #30
0
def merge(listenings, artists):

    dataset = pd.merge(listenings, artists, left_on='artist', right_on='artist')
    dataset = dataset[pd.notnull(dataset['artist'])]
    dataset = dataset[pd.notnull(dataset['genre'])]
    print(dataset.describe())
    save_csv(dataset, "../OUTPUT/listenings_genre_merged.csv")
Example #31
0
def get_relevant_and_reformatted_prescs(prescriptions, druglists, pt_features,
                                        window):
    '''
    Filter prescriptions to only include ones which are for relevant drugs and within the exposure window,
    and create 'amount' and 'unit' columns (necessary for calculating PDD)
    '''
    prescs = pd.merge(prescriptions,
                      pt_features[['patid', 'index_date']],
                      how='left',
                      on='patid')
    prescs = prescs.loc[pd.notnull(prescs['qty'])].copy(
    )  #remove the relatively small number of prescriptions where the quantity is NaN

    pegprod = pd.read_csv('dicts/proc_pegasus_prod.csv')
    prescs = pd.merge(
        prescs,
        pegprod[['prodcode', 'strength', 'route', 'drugsubstance']],
        how='left')

    #Only use prescriptions belonging to the main exposure window (not the ones used in sensitivity analysis)
    start_year = timedelta(days=(365 *
                                 abs(sd.exposure_windows[1]['start_year'])))
    end_year = timedelta(days=(
        365 *
        abs(sd.exposure_windows[1]['start_year'] + sd.window_length_in_years)))
    timely_presc_mask = (prescs['eventdate'] >=
                         (prescs['index_date'] - start_year)) & (
                             prescs['eventdate'] <=
                             (prescs['index_date'] - end_year))
    timely_prescs = prescs.loc[timely_presc_mask].copy()

    all_drugs = [drug for druglist in druglists for drug in druglist['drugs']]

    prodcodes = get_prodcodes_from_drug_name(all_drugs)
    relev_prescs = timely_prescs.loc[timely_prescs['prodcode'].isin(
        prodcodes)].copy()

    # Create new columns ('amount' and 'unit', extracted from the 'substance strength' string)
    amount_and_unit = relev_prescs['strength'].str.extract(
        '([\d\.]+)([\d\.\+ \w\/]*)', expand=True)
    amount_and_unit.columns = ['amount', 'unit']
    amount_and_unit.amount = amount_and_unit.amount.astype('float')
    reformatted_prescs = pd.concat(
        [relev_prescs, amount_and_unit],
        axis=1).drop(['numpacks', 'numdays', 'packtype', 'issueseq'], axis=1)

    # Convert micrograms to mg
    micro_mask = reformatted_prescs['unit'].str.contains('microgram',
                                                         na=False,
                                                         case=False)
    reformatted_prescs.loc[micro_mask, 'amount'] /= 1000
    reformatted_prescs.loc[micro_mask, 'unit'] = 'mg'

    #Convert mg/Xml to mg for simplicity
    micro_mask = reformatted_prescs['unit'].str.contains('mg/',
                                                         na=False,
                                                         case=False)
    reformatted_prescs.loc[micro_mask, 'unit'] = 'mg'

    #Remove the small number of  prescriptions where there is no amount
    reformatted_prescs = reformatted_prescs[pd.notnull(
        reformatted_prescs['amount'])].copy()

    # Create a 'total_amount' column - used to calculate each pt's PDDs for a given drug.
    reformatted_prescs['total_amount'] = reformatted_prescs[
        'qty'] * reformatted_prescs['amount']

    #Change all 'numeric daily doses' (NDD) from 0 (this appears to be the default in the CPRD data) to 1.
    #Note that an NDD of 2 means 'twice daily'
    reformatted_prescs.loc[reformatted_prescs['ndd'] == 0, 'ndd'] = 1

    return reformatted_prescs
Example #32
0
def test():
    df = pd.read_csv("./result/kko_regex.csv")
    df.Date = pd.to_datetime(df.Date)

    df["year"] = df['Date'].dt.strftime('%Y')
    df["month"] = df['Date'].dt.strftime('%m')
    df["day"] = df['Date'].dt.strftime('%d')
    df["weekday"] = df['Date'].dt.strftime('%A')

    df["24time"] = df["timetype"] + " " + df["time"]

    df.time = pd.to_datetime(df.time)

    temp = []
    transform_time = []

    for i in range(len(df)):
        time = df["24time"][i]
        #print(time)
        temp.append(dt.datetime.strptime(time, "%p %I:%M:%S"))
        transform_time.append(temp[i].time())

    df["24time"] = transform_time

    df["hh"] = df["24time"].apply(lambda x: x.strftime("%H")
                                  if pd.notnull(x) else '')
    df["mm"] = df["24time"].apply(lambda x: x.strftime("%M")
                                  if pd.notnull(x) else '')

    df.head()
    print(username)

    plt.rc('font', family='NanumGothic')

    plt.figure(2)
    sns.countplot(x="weekday", data=df)
    plt.title("요일 별 대화 수")  # "Number of conversations per weekday"
    plt.legend()
    # plt.savefig('./static/days.png')
    plt.savefig('./static/' + name2 + '.png')

    plt.figure(3)
    sns.countplot(x="Speaker", data=df)
    plt.title("사용자 별 대화 수")  # "Number of conversations per user"
    plt.legend()
    # plt.savefig('./static/users.png')
    plt.savefig('./static/' + name3 + '.png')

    plt.figure(4)
    sns.countplot(x="emotions", data=df)
    plt.title("감정의 분포")  # "Distribution of emotions"
    plt.legend()
    # plt.savefig('./static/emotion.png')
    plt.savefig('./static/' + name4 + '.png')

    # Part we would like to add
    # plt.figure(4)
    # for i in username:
    #     g = sns.kdeplot(df["hh"][(df['Speaker'] == i) & (df["hh"].notnull())], bw=1.5)
    # g.set_xlabel("viewCount")
    # g.set_ylabel("Frequency")
    # plt.title("Chat Rate by Hour")
    # plt.legend()
    # plt.show()
    # plt.savefig('./static/'+name5+'.png')

    file_path = './static'
    file_list = os.listdir(file_path)

    for file_name in file_list:
        old_name = file_path + '/' + file_name
        new_name = file_path + '/' + random.choice(
            string.ascii_letters) + random.choice(
                string.ascii_letters) + random.choice(
                    string.ascii_letters) + random.choice(
                        string.ascii_letters) + random.choice(
                            string.ascii_letters) + random.choice(
                                string.ascii_letters) + '.png'
        os.rename(old_name, new_name)
Example #33
0
candidate_tickets

# Nice, we got some candidates! Let's verify with one of them.

# In[17]:

df[df.Ticket == '113781']

# Great! We can complete some Cabins!

# In[18]:

shared_tickets = candidate_tickets.index.tolist()

find_cabin_given_ticket = lambda ticket: df[
    (df.Ticket == ticket) & (pd.notnull(df.Cabin))].Cabin.values[0]


def assign_cabin(row):
    if pd.isnull(row.Cabin) and row.Ticket in shared_tickets:
        return find_cabin_given_ticket(row.Ticket)
    return row.Cabin


df['Cabin'] = df[['Cabin', 'Ticket']].apply(assign_cabin, axis=1)
df['cabin_letter'] = df['Cabin'].apply(lambda c: c[0]
                                       if not pd.isnull(c) else 'N')  # N=none

df[df.Ticket == '113781']

# In[19]:
Example #34
0
def calculate_bbh(blast_results_1,
                  blast_results_2,
                  r_name=None,
                  g_name=None,
                  outdir=''):
    """Calculate the best bidirectional BLAST hits (BBH) and save a dataframe of results.

    Args:
        blast_results_1 (str): BLAST results for reference vs. other genome
        blast_results_2 (str): BLAST results for other vs. reference genome
        r_name: Name of reference genome
        g_name: Name of other genome
        outdir: Directory where BLAST results are stored.

    Returns:
        Path to Pandas DataFrame of the BBH results.

    """
    # TODO: add force_rerun option

    cols = [
        'gene', 'subject', 'PID', 'alnLength', 'mismatchCount', 'gapOpenCount',
        'queryStart', 'queryEnd', 'subjectStart', 'subjectEnd', 'eVal',
        'bitScore'
    ]

    if not r_name and not g_name:
        r_name = op.basename(blast_results_1).split('_vs_')[0]
        g_name = op.basename(blast_results_1).split('_vs_')[1].replace(
            '_blast.out', '')

        r_name2 = op.basename(blast_results_2).split('_vs_')[1].replace(
            '_blast.out', '')
        if r_name != r_name2:
            log.warning('{} != {}'.format(r_name, r_name2))

    outfile = op.join(outdir, '{}_vs_{}_bbh.csv'.format(r_name, g_name))
    if op.exists(outfile) and os.stat(outfile).st_size != 0:
        log.debug('{} vs {} BLAST BBHs already found at {}'.format(
            r_name, g_name, outfile))
        return outfile

    bbh1 = pd.read_csv(blast_results_1, sep='\t', names=cols)
    bbh2 = pd.read_csv(blast_results_2, sep='\t', names=cols)

    out = pd.DataFrame()
    log.debug('Finding BBHs for {} vs. {}'.format(r_name, g_name))

    for g in bbh1[pd.notnull(bbh1.gene)].gene.unique():
        res = bbh1[bbh1.gene == g]
        if len(res) == 0:
            continue
        best_hit = res.ix[res.PID.idxmax()].copy()
        best_gene = best_hit.subject
        res2 = bbh2[bbh2.gene == best_gene]
        if len(res2) == 0:
            continue
        best_hit2 = res2.ix[res2.PID.idxmax()]
        best_gene2 = best_hit2.subject
        if g == best_gene2:
            best_hit['BBH'] = '<=>'
        else:
            best_hit['BBH'] = '->'
        out = pd.concat([out, pd.DataFrame(best_hit).transpose()])

    out.to_csv(outfile)
    log.debug('{} vs {} BLAST BBHs saved at {}'.format(r_name, g_name,
                                                       outfile))
    return outfile
Example #35
0
#-*- coding:utf-8 -*-
# Peishichao
import pandas as pd
from apriori import find_rule

inputfile = '../data/menu_orders.xls'
outputfile = '../data/apriori_rules.xls'

data = pd.read_excel(inputfile, index_col=None)

print(u'\nConverting the raw data to a 0-1 matrix...')

ct = lambda x: pd.Series(1, index=x[pd.notnull(x)])  # helper: turn one order row into a 0-1 indicator Series

b = map(ct, data.values)  # .values replaces the removed DataFrame.as_matrix()
data = pd.DataFrame(list(b)).fillna(0)

print(u'\nConversion finished')

del b

support = 0.2
confidence = 0.5

ms = '---'
find_rule(data, support, confidence, ms).to_excel(outputfile)
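# A small sketch of what the ct lambda above does (toy basket, not the xls data):
# each row of items becomes a row of 1s indexed by the non-null items, and the
# later fillna(0) turns the combined frame into a 0-1 matrix.
import numpy as np
import pandas as pd

basket = pd.Series(['a', 'c', np.nan, np.nan])
ct = lambda x: pd.Series(1, index=x[pd.notnull(x)])
print(ct(basket).to_dict())  # {'a': 1, 'c': 1}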
Example #36
0
def get_series_for_label(mgra_dataframe, label, multiplier):
    # grab the column
    series = mgra_dataframe[label].copy()
    # set all non-null values equal to the multiplier
    series[pandas.notnull(series)] = multiplier
    return series
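# A tiny usage sketch (toy MGRA frame, hypothetical column name): non-null
# entries in the chosen column are overwritten with the multiplier, NaNs stay NaN.
import numpy as np
import pandas

mgra = pandas.DataFrame({'hh': [10.0, np.nan, 3.0]})
print(get_series_for_label(mgra, 'hh', 0.5).tolist())  # [0.5, nan, 0.5]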
Example #37
0
 def remove_random_nan(pd_obj):
     return pd_obj.where((pd.notnull(pd_obj)), None)
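# A tiny usage sketch: where(pd.notnull(...), None) swaps NaN cells for plain
# Python None, which is handy before handing rows to a database driver or JSON
# encoder. (On recent pandas, frame.replace({np.nan: None}) is an equivalent
# spelling.)
import numpy as np
import pandas as pd

frame = pd.DataFrame({'a': [1.0, np.nan], 'b': ['x', np.nan]}, dtype=object)
print(remove_random_nan(frame).to_dict('records'))
# -> [{'a': 1.0, 'b': 'x'}, {'a': None, 'b': None}]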
Example #38
0
def get_condition_status(pt_features, entries, prescriptions, window,
                         condition):
    '''
    Searches a patient's history (i.e. the list of medcoded entries) for any one of a list of related Read codes
    (e.g. 'clinically significant alcohol use', or 'insomnia') during a given exposure window (e.g. 5-10 years prior to index date).
    According to the 'count_or_boolean' parameter, this will return either a count of the Read codes (i.e. insomnia) or a simple boolean (all other conditions).
    '''

    new_colname = condition['name']

    if new_colname in pt_features.columns:  #delete column if it already exists (otherwise this causes problems with the 'fillna' command below)
        pt_features.drop(new_colname, axis=1, inplace=True)

    # If we're using all the patient's history from the exposure window back to birth
    #(e.g. for intellectual disability), overwrite the predefined exposure windows with a single window
    if condition['record_exposure_in_window_period_only'] == True:
        start_year = timedelta(days=(365 * abs(window['start_year'])))
    else:  #for all other conditions, record exposure from end of window period back to start of their records
        start_year = timedelta(days=(365 * 100))

    medcount_colname = new_colname + '_Read_code_count'

    medcodes = get_medcodes_from_readcodes(condition['codes'])
    medcode_events = entries[entries['medcode'].isin(medcodes)]

    medcode_events = medcode_events[pd.notnull(
        medcode_events['eventdate']
    )]  #drops a small number of rows  with NaN eventdates
    # display(medcode_events.head(10))

    # print('\tTotal {0} events in all medcoded_events dataframe: {1}'.format(condition['name'],len(medcode_events)))
    medcode_events = pd.merge(medcode_events[['patid', 'eventdate']],
                              pt_features[['patid', 'index_date']],
                              how='inner',
                              on='patid')

    # Restrict event counts to those that occur during pt's exposure window
    relevant_event_mask = (medcode_events['eventdate'] >=
                           (medcode_events['index_date'] - start_year)) & (
                               medcode_events['eventdate'] <=
                               (medcode_events['index_date'] - timedelta(
                                   days=(365 * sd.window_length_in_years))))
    window_medcode_events = medcode_events.loc[relevant_event_mask]
    window_medcode_events = window_medcode_events.groupby(
        'patid')['eventdate'].count().reset_index()
    window_medcode_events.columns = ['patid', medcount_colname]
    # print('\t{0} events in this window for our patients: {1}'.format(new_colname,len(window_medcode_events)))

    #delete zero counts
    window_medcode_events = window_medcode_events.loc[
        window_medcode_events[medcount_colname] > 0]

    pt_features = pd.merge(pt_features, window_medcode_events, how='left')
    pt_features[medcount_colname].fillna(0, inplace=True)

    pt_features.loc[pt_features[medcount_colname] > 0, new_colname] = 1
    pt_features.loc[pt_features[medcount_colname] == 0, new_colname] = 0

    if len(condition['medications']) > 0:
        presc_count_colname = new_colname + '_prescription_count'

        prodcodes = get_prodcodes_from_drug_name(condition['medications'])
        prescriptions = prescriptions.loc[prescriptions['prodcode'].isin(
            prodcodes)].copy()
        prescriptions = prescriptions.loc[pd.notnull(
            prescriptions['qty']
        )].copy(
        )  #remove the relatively small number of prescriptions where the quantity is NaN

        # Some conditions (e.g. insomnia) are also defined by whether or not certain medications are prescribed
        prescriptions = pd.merge(prescriptions[['patid', 'eventdate']],
                                 pt_features[['patid', 'index_date']],
                                 how='inner',
                                 on='patid')

        start_year = timedelta(days=(365 * abs(window['start_year'])))
        end_year = timedelta(
            days=(365 * abs(window['start_year'] + sd.window_length_in_years)))
        timely_presc_mask = (prescriptions['eventdate'] >=
                             (prescriptions['index_date'] - start_year)) & (
                                 prescriptions['eventdate'] <=
                                 (prescriptions['index_date'] - end_year))
        prescriptions = prescriptions.loc[timely_presc_mask].copy()

        prescriptions = prescriptions.groupby(
            'patid')['eventdate'].count().reset_index()
        prescriptions.columns = ['patid', presc_count_colname]
        prescriptions = prescriptions.loc[
            prescriptions[presc_count_colname] > 0]

        pt_features = pd.merge(pt_features, prescriptions, how='left')
        pt_features[presc_count_colname].fillna(0, inplace=True)

        # convert condition from a count to a boolean
        pt_features.loc[(pt_features[medcount_colname] > 0) |
                        (pt_features[presc_count_colname] > 0),
                        new_colname] = 1
        pt_features.drop(presc_count_colname, axis=1, inplace=True)

    pt_features.drop(medcount_colname, axis=1, inplace=True)

    pt_features[new_colname] = pt_features[new_colname].astype(int)

    return pt_features
Example #39
0
                   left_on=['PROPERTYADDRESS', 'PROPERTYHOUSENUM'],
                   right_on=['street', 'number'])

# making the fire column with all type 100s as fires
pcafire['fire'] = pcafire['full.code'].astype(str).str[0]
pcafire.loc[pcafire.fire == '1', 'fire'] = 'fire'
pcafire.loc[pcafire.fire != 'fire', 'fire'] = 'No fire'
pcafire['full.code'][pcafire['fire'] == 'fire'] = None

#Removing vacant commerical land
pcafire = pcafire[pcafire.USEDESC != 'VACANT COMMERCIAL LAND']

#Fire occured after inspection
pcafire1 = pcafire[(pcafire.CALL_CREATED_DATE >= pcafire.INSPECTION_DATE)]
pcafire1 = pcafire[(pcafire.CALL_CREATED_DATE >= pcafire.INSPECTION_DATE)]
pcafire1 = pcafire1[pd.notnull(pcafire1.INSPECTION_DATE)]

#checking if violation is in the same year as the fire and keeping only those
pcafire2 = pcafire1[(pcafire1.violation_year == pcafire1.fire_year)]

#joining all rows with no pli violations
fire_nopli = pd.concat([
    fire_new, pcafire2[[
        'number', 'street', 'CALL_CREATED_DATE', 'full.code', 'response_time',
        'fire_year'
    ]], pcafire2[[
        'number', 'street', 'CALL_CREATED_DATE', 'full.code', 'response_time',
        'fire_year'
    ]]
]).drop_duplicates(keep=False)
pcafire_nopli = pd.merge(pcafinal,
Example #40
0
def get_index_date_and_caseness_and_add_final_dementia_subtype(
        all_entries, pt_features):
    '''
    Calculates  index date and establishes caseness by looking for first dementia diagnoses.
    Also looks for final dementia diagnosis (e.g. 'vascular dementia'), as this is likely to be our best guess as to the dementia subtype
    '''
    pegmed = pd.read_csv('dicts/proc_pegasus_medical.csv', delimiter=',')
    pegprod = pd.read_csv('dicts/proc_pegasus_prod.csv', delimiter=',')
    medcodes = get_medcodes_from_readcodes(
        codelists.alzheimer_vascular_and_non_specific_dementias['codes'])
    prodcodes = get_prodcodes_from_drug_name(
        codelists.alzheimer_vascular_and_non_specific_dementias['medications'])

    entries_with_antidementia_presc_mask = all_entries['prodcode'].isin(
        prodcodes)
    entries_with_dementia_dx_mask = all_entries['medcode'].isin(medcodes)

    #For the purpose of my paper's flow chart of patient selection,
    #get number of cases where there is an antidementia prescription but not a dementia diagnosis
    patids_prescribed_antidementia_drugs = set(
        all_entries.loc[entries_with_antidementia_presc_mask, 'patid'])
    patids_with_dementia_dx = set(
        all_entries.loc[entries_with_dementia_dx_mask, 'patid'])
    total_pts_prescribed_antidementia_drugs_but_no_dementia_dx = len(
        pt_features[
            (pt_features['patid'].isin(patids_prescribed_antidementia_drugs))
            & ~(pt_features['patid'].isin(patids_with_dementia_dx))])
    print(
        'Number of patients prescribed antidementia drugs but not diagnosed with dementia:',
        total_pts_prescribed_antidementia_drugs_but_no_dementia_dx)

    # from the all_entries df, get just those which contain a dementia dx or an antidementia drug prescription
    all_dementia_entries = all_entries[entries_with_antidementia_presc_mask
                                       | entries_with_dementia_dx_mask]
    # for clarity, look up the Read terms
    all_dem_labelled = pd.merge(all_dementia_entries, pegmed, how='left')[[
        'patid', 'prodcode', 'medcode', 'sysdate', 'eventdate', 'type'
    ]]
    # for clarity, look up the drug names
    all_dem_labelled = pd.merge(all_dem_labelled, pegprod, how='left')[[
        'patid', 'medcode', 'prodcode', 'sysdate', 'eventdate', 'type',
        'drugsubstance'
    ]]
    all_dem_labelled.loc[:, 'eventdate'] = pd.to_datetime(
        all_dem_labelled.loc[:, 'eventdate'])
    #Get the date of earliest dementia diagnosis / antidementia drug prescription - this will be the revised index date, and will also determine revised caseness
    earliest_dementia_dates = all_dem_labelled.groupby(
        'patid')['eventdate'].min().reset_index()
    earliest_dementia_dates.rename(columns={'eventdate': 'index_date'},
                                   inplace=True)

    pt_features = pd.merge(pt_features, earliest_dementia_dates, how='left')
    pt_features['isCase'] = np.where(pd.notnull(pt_features['index_date']),
                                     True, False)
    # Get the final dementia diagnosis
    just_dementia_diagnoses = all_dem_labelled[pd.isnull(
        all_dem_labelled['prodcode'])]
    final_dementia_dx = just_dementia_diagnoses.loc[
        just_dementia_diagnoses.groupby('patid')['eventdate'].idxmax()][[
            'patid', 'medcode'
        ]]
    final_dementia_dx.rename(columns={'medcode': 'final dementia medcode'},
                             inplace=True)

    pt_features = pd.merge(pt_features, final_dementia_dx, how='left')
    return pt_features
Example #41
0
import pandas as pd
df = pd.read_csv('train.csv')
dfield = df[df['Survived'] == 1]
print(dfield)
array_sobreviventes = df['PassengerId'].unique()
all_number = array_sobreviventes[-1]  # total passenger count (assumes PassengerId runs 1..N)
survived_number = dfield.sum()['Survived']
percent = (survived_number / all_number) * 100

print('Percentage of survivors: ' + str(percent))
df = df[pd.notnull(df['Embarked'])]
print(df)

df = pd.get_dummies(data=df, columns=['Embarked'])  # one-hot encode the Embarked column
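For comparison, the same percentage can be computed more directly from the raw file (a minimal sketch; it assumes the same train.csv as above):

raw_df = pd.read_csv('train.csv')
percent_direct = raw_df['Survived'].mean() * 100   # survivors / total passengers
print('Percentage of survivors: ' + str(percent_direct))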
Example #42
0
    plt.legend(bbox_to_anchor=(1.1, 1.05))

    plt.savefig("../reports/figures/club_01_bar_{0}.PNG".format(txt),
                bbox_inches='tight')
    plt.show()

# In[12]:

atts = ["Country", "Tier"]
for col in [
        "Manager", "ManagerOpp", "Stadium", "Referee", "Latitude", "Longitude"
]:
    #     for att in ["Country", "Tier"]:
    print("\n#######\n")
    print("Sample size and means with {0} by {1}".format(col, ", ".join(atts)))
    print(df.loc[pd.notnull(df[col]), ].groupby(atts).TotalGoals.agg(
        ["size", "mean"]).sort_index())

# ## Mapping Goals

# In[13]:

# mapdata=df.dropna(subset=['Latitude', 'Longitude'])
mapdata = df.dropna(subset=['Latitude', 'Longitude']).groupby(
    ['Latitude', 'Longitude', 'Country']).TotalGoals.mean().reset_index()

# fg = sns.FacetGrid(data=mapdata, hue='Country', height=6, aspect=.9)
# fg.map(plt.scatter, 'Longitude', 'Latitude').add_legend()

# sns.lmplot(x='Longitude', y='Latitude', s='TotalGoals', hue='Country', data=mapdata, fit_reg=False,
#            x_jitter=0.1, y_jitter=0.1, markers="o", palette="viridis", height=7)
Example #43
0
    def make_df(self, exclude_stmts=None, complex_members=3):
        """Create a dataframe containing information extracted from assembler's
        list of statements necessary to build an IndraNet.

        Parameters
        ----------
        exclude_stmts : list[str]
            A list of statement type names to not include in the dataframe.
        complex_members : int
            Maximum allowed size of a complex to be included in the
            data frame. All complexes larger than complex_members will be
            rejected. For accepted complexes, all permutations of their
            members will be added as dataframe records. Default is `3`.

        Returns
        -------
        df : pd.DataFrame
            Pandas DataFrame object containing information extracted from
            statements. It contains the following columns:
            
            *agA_name*
                The first Agent's name.
            *agA_ns*
                The first Agent's identifier namespace as per `db_refs`.
            *agA_id*
                The first Agent's identifier as per `db_refs`
            *agB_ns, agB_name, agB_id*
                As above for the second agent. Note that the Agent may be None
                (and these fields left empty) if the Statement consists only
                of a single Agent (e.g., SelfModification, ActiveForm,
                or Translocation statement).
            *stmt_type*
                Statement type, given by the name of the class
                in indra.statements.
            *evidence_count*
                Number of evidences for the statement.
            *stmt_hash*
                A unique long integer hash identifying the content of the
                statement.
            *belief*
                The belief score associated with the statement.
            *source_counts*
                The number of evidences per input source for the statement.
            *initial_sign*
                The default sign (polarity) associated with the given
                statement if the statement type has implied polarity.
                To facilitate weighted path finding, the sign is represented
                as 0 for positive polarity and 1 for negative polarity.
        """
        rows = []
        if exclude_stmts:
            exclude_types = tuple(
                get_statement_by_name(st_type) for st_type in exclude_stmts)
        else:
            exclude_types = ()
        for stmt in self.statements:
            # Exclude statements from given exclude list
            if isinstance(stmt, exclude_types):
                logger.debug('Skipping a statement of type %s.'
                             % type(stmt).__name__)
                continue
            agents = stmt.agent_list()
            not_none_agents = [a for a in agents if a is not None]

            # Exclude statements with less than 2 agents
            if len(not_none_agents) < 2:
                continue
            # Special handling for Influences and Associations
            if isinstance(stmt, (Influence, Association)):
                stmt_pol = stmt.overall_polarity()
                if stmt_pol == 1:
                    sign = 0
                elif stmt_pol == -1:
                    sign = 1
                else:
                    sign = None
                if isinstance(stmt, Influence):
                    edges = [(stmt.subj.concept, stmt.obj.concept, sign)]
                else:
                    edges = [(a, b, sign) for a, b in
                             permutations(not_none_agents, 2)]
            # Handle complexes by creating pairs of their
            # not-none-agents.
            elif isinstance(stmt, Complex):
                # Do not add complexes with more members than complex_members
                if len(not_none_agents) > complex_members:
                    logger.debug('Skipping a complex with %d members.'
                                 % len(not_none_agents))
                    continue
                else:
                    # add every permutation with a neutral polarity
                    edges = [(a, b, None) for a, b in
                             permutations(not_none_agents, 2)]
            elif isinstance(stmt, Conversion):
                edges = []
                if stmt.subj:
                    for obj in stmt.obj_from:
                        edges.append((stmt.subj, obj, 1))
                    for obj in stmt.obj_to:
                        edges.append((stmt.subj, obj, 0))
            # This is for any remaining statement type that may not be
            # handled above explicitly but somehow has more than two
            # not-none-agents at this point
            elif len(not_none_agents) > 2:
                continue
            else:
                edges = [(not_none_agents[0], not_none_agents[1], None)]
            for (agA, agB, sign) in edges:
                agA_ns, agA_id = get_ag_ns_id(agA)
                agB_ns, agB_id = get_ag_ns_id(agB)
                stmt_type = type(stmt).__name__
                row = OrderedDict([
                    ('agA_name', agA.name),
                    ('agB_name', agB.name),
                    ('agA_ns', agA_ns),
                    ('agA_id', agA_id),
                    ('agB_ns', agB_ns),
                    ('agB_id', agB_id),
                    ('stmt_type', stmt_type),
                    ('evidence_count', len(stmt.evidence)),
                    ('stmt_hash', stmt.get_hash(refresh=True)),
                    ('belief', stmt.belief),
                    ('source_counts', _get_source_counts(stmt)),
                    ('initial_sign', sign)])
                rows.append(row)
        df = pd.DataFrame.from_dict(rows)
        df = df.where((pd.notnull(df)), None)
        return df
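A rough usage sketch, assuming the method above belongs to INDRA's IndraNetAssembler and that INDRA is installed; the statements are toy examples, not taken from the original:

from indra.statements import Activation, Agent
from indra.assemblers.indranet import IndraNetAssembler

stmts = [Activation(Agent('MAP2K1'), Agent('MAPK1'))]
ia = IndraNetAssembler(stmts)
df = ia.make_df(exclude_stmts=['Complex'], complex_members=3)
print(df[['agA_name', 'agB_name', 'stmt_type']])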
Example #44
0
def load_template_to_dataframe(fn, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally rows that start with a '\t'
    character will be ignored and columns that are empty will be removed. Empty
    sample names will be removed from the DataFrame.

    Column names are case-insensitive but will be lowercased on addition to
    the database

    Everything in the DataFrame will be read and managed as string
    """
    # Load in file lines
    holdfile = None
    with qdb.util.open_file(fn, mode='U') as f:
        errors = defaultdict(list)
        holdfile = f.readlines()
        # here we are checking for non UTF-8 chars
        for row, line in enumerate(holdfile):
            for col, block in enumerate(line.split('\t')):
                try:
                    tblock = block.encode('utf-8')
                except UnicodeDecodeError:
                    tblock = unicode(block, errors='replace')
                    tblock = tblock.replace(u'\ufffd', '&#128062;')
                    errors[tblock].append('(%d, %d)' % (row, col))
        if bool(errors):
            raise ValueError(
                "There are invalid (non UTF-8) characters in your information "
                "file. The offending fields and their location (row, column) "
                "are listed below, invalid characters are represented using "
                "&#128062;: %s" % '; '.join([
                    '"%s" = %s' % (k, ', '.join(v))
                    for k, v in viewitems(errors)
                ]))

    if not holdfile:
        raise ValueError('Empty file passed!')

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # Strip all values in the cells in the input file
    for pos, line in enumerate(holdfile):
        cols = line.split('\t')
        if pos == 0 and index != 'SampleID':
            # get and clean the controlled columns
            ccols = {'sample_name'}
            ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
            newcols = [
                c.lower().strip() if c.lower().strip() in ccols else c.strip()
                for c in cols
            ]

            # while we are here, let's check for duplicate columns headers
            if len(set(newcols)) != len(newcols):
                raise qdb.exceptions.QiitaDBDuplicateHeaderError(
                    find_duplicates(newcols))
        else:
            # .strip will remove odd chars, newlines, tabs and multiple
            # spaces but we need to read a new line at the end of the
            # line(+'\n')
            newcols = [d.strip(" \r\n") for d in cols]

        holdfile[pos] = '\t'.join(newcols) + '\n'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters i. e. empty rows.
    template = pd.read_csv(StringIO(''.join(holdfile)),
                           sep='\t',
                           dtype=str,
                           encoding='utf-8',
                           infer_datetime_format=False,
                           keep_default_na=False,
                           index_col=False,
                           comment='\t',
                           converters={index: lambda x: str(x).strip()})
    # remove newlines and tabs from fields
    template.replace(to_replace='[\t\n\r\x0b\x0c]+',
                     value='',
                     regex=True,
                     inplace=True)

    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns so let's
    # find the columns that are all ''
    columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0))
    template.drop(template.columns[columns], axis=1, inplace=True)

    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s' % ', '.join(dropped_cols),
            qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
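A minimal usage sketch, assuming a Qiita environment where this function and its qdb helpers are importable; the template content below is made up:

from io import StringIO

template_text = "sample_name\tseason\tph\n1.S1\twinter\t7.2\n1.S2\tsummer\t6.8\n"
df = load_template_to_dataframe(StringIO(template_text))
print(df)   # indexed by sample_name, with every value kept as a string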
Example #45
0
mRests = rests[rests['BORO'] == "MANHATTAN"]  ## Look at only Manhattan Data

# In[4]:

list(mRests.columns.values)

# In[5]:

mRests = mRests[
    mRests['GRADE'] !=
    "Not Yet Graded"]  ## Remove stores that have not been graded yet

# In[6]:

mRests = mRests[pd.notnull(
    mRests["GRADE"])]  ## Remove Stores that have no grade

# In[7]:

mRests = mRests[pd.notnull(mRests["SCORE"])]  ## Remove stores with no score

# In[8]:

mRests["SCORE"].describe()

# In[9]:

mRests["GRADE"] = mRests["GRADE"].astype(
    "category", categories=["A", "B", "C", "P",
                            "Z"], ordered=True)  ## redefine score levels
Example #46
0
 def prepare_compare(df):
     # 'orderby' comes from the enclosing scope in the original code
     df = df.sort_values(orderby).reset_index(drop=True)
     df = df.where((pd.notnull(df)), None)  # normalise NaN/NaT to None before comparing frames
     return df
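The df.where(pd.notnull(df), None) idiom above, which recurs in several of these examples, swaps NaN/NaT for None so that downstream consumers (database drivers, comparisons) see proper nulls; a minimal sketch on made-up data:

import numpy as np
import pandas as pd

frame = pd.DataFrame({'name': ['x', np.nan], 'score': [1.5, np.nan]}).astype(object)
frame = frame.where(pd.notnull(frame), None)
print(frame.to_dict('records'))   # on object-dtype columns the missing entries come out as None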
Example #47
0
for i in range(len(df)):
    u = df['User'][i].lower()
    r = df['RoomName'][i]
    mention = df['Mentions'][i]
    # rooms named "P:user1|user2" are person-to-person rooms; link to the other participant
    if "P:" in r:
        u1, u2 = r.replace("P:", "").split('|')
        if u1 == u:
            r = u2
        else:
            r = u1
        r = r.lower()
        dtset['nodes']['people'][r]['size'] += 1
    else:
        dtset['nodes']['projects'][r]['size'] += 1
    l = "{},{}".format(r, u)
    if l not in dtset['links']:
        dtset['links'][l] = {'weight': 0, 'mention_weight': 0}
    dtset['links'][l]['weight'] += 1
    dtset['nodes']['people'][u]['size'] += 1
    if pd.notnull(mention):
        users = mention.split(',')
        regex = re.compile('[^a-zA-Z]')
        users = [regex.sub('', k).lower() for k in users]
        users = [k for k in users if k in dtset['nodes']['people'].keys()]
        for user in users:
            l = "{},{}".format(u, user)
            if l not in dtset['links']:
                dtset['links'][l] = {'weight': 0, 'mention_weight': 0}
            dtset['links'][l]['mention_weight'] += 1
json.dump(dtset, open("leo.json", 'w'))
Example #48
0
    agg.sort_index(ascending=asc, inplace=True)
    agg.columns = [
        'Matches played', 'Points per game', '% correct result',
        '% correct goal diff', '% correct score', 'Goals per game (predicted)',
        'Goals per game (actual)', '% games won (predicted)',
        '% games won (actual)'
    ]
    #     print(agg.columns)

    return agg


overall = pd.DataFrame(
    {
        "Matches played":
        output[pd.notnull(output.Actual_result)].shape[0],
        "Points per game":
        output[pd.notnull(output.Actual_result)].Points.mean(),
        "% correct result":
        output[pd.notnull(output.Actual_result)].Correct_result.mean(),
        "% correct goal diff":
        output[pd.notnull(output.Actual_result)].Correct_goal_diff.mean(),
        "% correct score":
        output[pd.notnull(output.Actual_result)].Correct_score.mean(),
        "Goals per game (predicted)":
        output[pd.notnull(output.Actual_result)].Predicted_goal_total.mean(),
        "Goals per game (actual)":
        output[pd.notnull(output.Actual_result)].Actual_goal_total.mean(),
        "% games won (predicted)":
        output[pd.notnull(output.Actual_result)
               & (output.Predicted_result != "Draw")].shape[0] /
Example #49
0
def _clean_features(struct):
    """Cleans up the features collected in parse_play_details.

    :struct: Pandas Series of features parsed from details string.
    :returns: the same dict, but with cleaner features (e.g., convert bools,
    ints, etc.)
    """
    struct = dict(struct)
    # First, clean up play type bools
    ptypes = [
        'isKickoff', 'isTimeout', 'isFieldGoal', 'isPunt', 'isKneel',
        'isSpike', 'isXP', 'isTwoPoint', 'isPresnapPenalty', 'isPass', 'isRun'
    ]
    for pt in ptypes:
        struct[pt] = struct[pt] if pd.notnull(struct.get(pt)) else False
    # Second, clean up other existing variables on a one-off basis
    struct['callUpheld'] = struct.get('callUpheld') == 'upheld'
    struct['fgGood'] = struct.get('fgGood') == 'good'
    struct['isBlocked'] = struct.get('isBlocked') == 'blocked'
    struct['isComplete'] = struct.get('isComplete') == 'complete'
    struct['isFairCatch'] = struct.get('isFairCatch') == 'fair catch'
    struct['isMuffedCatch'] = pd.notnull(struct.get('isMuffedCatch'))
    struct['isNoPlay'] = (
        ' (no play)' in struct['detail']
        and 'penalty enforced in end zone' not in struct['detail']
        if struct.get('detail') else False)
    struct['isOnside'] = struct.get('isOnside') == 'onside'
    struct['isSack'] = pd.notnull(struct.get('sackYds'))
    struct['isSafety'] = (struct.get('isSafety') == ', safety' or
                          (struct.get('detail') and
                           'enforced in end zone, safety' in struct['detail']))
    struct['isTD'] = struct.get('isTD') == ', touchdown'
    struct['isTouchback'] = struct.get('isTouchback') == ', touchback'
    struct['oob'] = pd.notnull(struct.get('oob'))
    struct['passLoc'] = PASS_OPTS.get(struct.get('passLoc'), np.nan)
    if struct['isPass']:
        pyds = struct['passYds']
        struct['passYds'] = pyds if pd.notnull(pyds) else 0
    if pd.notnull(struct['penalty']):
        struct['penalty'] = struct['penalty'].strip()
    struct['penDeclined'] = struct.get('penDeclined') == 'Declined'
    if struct['quarter'] == 'OT':
        struct['quarter'] = 5
    struct['rushDir'] = RUSH_OPTS.get(struct.get('rushDir'), np.nan)
    if struct['isRun']:
        ryds = struct['rushYds']
        struct['rushYds'] = ryds if pd.notnull(ryds) else 0
    year = struct.get('season', np.nan)
    struct['timeoutTeam'] = sportsref.nfl.teams.team_ids(year).get(
        struct.get('timeoutTeam'), np.nan)
    struct['twoPointSuccess'] = struct.get('twoPointSuccess') == 'succeeds'
    struct['xpGood'] = struct.get('xpGood') == 'good'

    # Third, ensure types are correct
    bool_vars = [
        'fgGood', 'isBlocked', 'isChallenge', 'isComplete', 'isFairCatch',
        'isFieldGoal', 'isKickoff', 'isKneel', 'isLateral', 'isNoPlay',
        'isPass', 'isPresnapPenalty', 'isPunt', 'isRun', 'isSack', 'isSafety',
        'isSpike', 'isTD', 'isTimeout', 'isTouchback', 'isTwoPoint', 'isXP',
        'isMuffedCatch', 'oob', 'penDeclined', 'twoPointSuccess', 'xpGood'
    ]
    int_vars = [
        'down', 'fgBlockRetYds', 'fgDist', 'fumbRecYdLine', 'fumbRetYds',
        'intRetYds', 'intYdLine', 'koRetYds', 'koYds', 'muffRetYds',
        'pbp_score_aw', 'pbp_score_hm', 'passYds', 'penYds', 'puntBlockRetYds',
        'puntRetYds', 'puntYds', 'quarter', 'rushYds', 'sackYds', 'timeoutNum',
        'ydLine', 'yds_to_go'
    ]
    float_vars = ['exp_pts_after', 'exp_pts_before', 'home_wp']
    string_vars = [
        'challenger', 'detail', 'fairCatcher', 'fgBlockRecoverer', 'fgBlocker',
        'fgKicker', 'fieldSide', 'fumbForcer', 'fumbRecFieldSide',
        'fumbRecoverer', 'fumbler', 'intFieldSide', 'interceptor', 'kneelQB',
        'koKicker', 'koReturner', 'muffRecoverer', 'muffedBy', 'passLoc',
        'passer', 'penOn', 'penalty', 'puntBlockRecoverer', 'puntBlocker',
        'puntReturner', 'punter', 'qtr_time_remain', 'rushDir', 'rusher',
        'sacker1', 'sacker2', 'spikeQB', 'tackler1', 'tackler2', 'target',
        'timeoutTeam', 'xpKicker'
    ]
    for var in bool_vars:
        struct[var] = struct.get(var) is True
    for var in int_vars:
        try:
            struct[var] = int(struct.get(var))
        except (ValueError, TypeError):
            struct[var] = np.nan
    for var in float_vars:
        try:
            struct[var] = float(struct.get(var))
        except (ValueError, TypeError):
            struct[var] = np.nan
    for var in string_vars:
        if var not in struct or pd.isnull(struct[var]) or struct[var] == '':
            struct[var] = np.nan

    # Fourth, create new helper variables based on parsed variables
    # creating fieldSide and ydline from location
    if struct['isXP']:
        struct['fieldSide'] = struct['ydLine'] = np.nan
    else:
        fieldSide, ydline = _loc_to_features(struct.get('location'))
        struct['fieldSide'] = fieldSide
        struct['ydLine'] = ydline
    # creating secsElapsed (in entire game) from qtr_time_remain and quarter
    if pd.notnull(struct.get('qtr_time_remain')):
        qtr = struct['quarter']
        mins, secs = map(int, struct['qtr_time_remain'].split(':'))
        struct['secsElapsed'] = qtr * 900 - mins * 60 - secs
    # creating columns for turnovers
    struct['isInt'] = pd.notnull(struct.get('interceptor'))
    struct['isFumble'] = pd.notnull(struct.get('fumbler'))
    # create column for isPenalty
    struct['isPenalty'] = pd.notnull(struct.get('penalty'))
    # create columns for EPA
    struct['team_epa'] = struct['exp_pts_after'] - struct['exp_pts_before']
    struct['opp_epa'] = struct['exp_pts_before'] - struct['exp_pts_after']
    return pd.Series(struct)
Example #50
0
!wget -O weather-stations20140101-20141231.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/weather-stations20140101-20141231.csv


import csv
import pandas as pd
import numpy as np

filename='weather-stations20140101-20141231.csv'

#Read csv
pdf = pd.read_csv(filename)
pdf.head(5)

# DATA CLEANING
pdf = pdf[pd.notnull(pdf["Tm"])]
pdf = pdf.reset_index(drop=True)
pdf.head(5)

# VISUALIZATION
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = (14,10)

llon=-140
ulon=-50
llat=40
ulat=65
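The bounding box above is presumably intended for a Basemap plot; a hedged continuation sketch (standard Basemap calls, with 'Lat'/'Long' assumed to be the station file's coordinate columns):

my_map = Basemap(projection='merc', resolution='l', area_thresh=1000.0,
                 llcrnrlon=llon, llcrnrlat=llat,
                 urcrnrlon=ulon, urcrnrlat=ulat)
my_map.drawcoastlines()
my_map.drawcountries()
xs, ys = my_map(np.asarray(pdf['Long']), np.asarray(pdf['Lat']))  # assumed column names
my_map.scatter(xs, ys, marker='o', color='red', alpha=0.5)
plt.show()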
Example #51
0
def funds_as_dict(country=None, columns=None, as_json=False):
    """
    This function retrieves all the available funds on Investing.com and returns them as a :obj:`dict` containing 
    the country, name, symbol, tag, id, issuer, isin, asset_class, currency and underlying data. All the available
    funds can be found at: https://www.investing.com/funds/

    Args:
        country (:obj:`str`, optional): name of the country to retrieve all its available funds from.
        columns (:obj:`list` of :obj:`str`, optional):
            a :obj:`list` containing the column names from which the data is going to be retrieved.
        as_json (:obj:`bool`, optional):
            value to determine the format of the output data (:obj:`dict` or :obj:`json`).

    Returns:
        :obj:`dict` or :obj:`json` - funds_dict:
            The resulting :obj:`dict` contains the retrieved data if found, if not, the corresponding
            fields are filled with `None` values.

            In case the information was successfully retrieved, the :obj:`dict` will look like::

                {
                    'country': country,
                    'name': name,
                    'symbol': symbol,
                    'issuer': issuer,
                    'isin': isin,
                    'asset_class': asset_class,
                    'currency': currency,
                    'underlying': underlying
                }

    Raises:
        ValueError: raised whenever any of the introduced arguments is not valid or errored.
        FileNotFoundError: raised when the `funds.csv` file was not found.
        IOError: raised if the `funds.csv` file is missing or errored.
    
    """

    if country is not None and not isinstance(country, str):
        raise ValueError("ERR#0025: specified country value not valid.")

    if not isinstance(as_json, bool):
        raise ValueError(
            "ERR#0002: as_json argument can just be True or False, bool type.")

    resource_package = 'investpy'
    resource_path = '/'.join(('resources', 'funds.csv'))
    if pkg_resources.resource_exists(resource_package, resource_path):
        funds = pd.read_csv(
            pkg_resources.resource_filename(resource_package, resource_path))
    else:
        raise FileNotFoundError("ERR#0057: funds file not found or errored.")

    if funds is None:
        raise IOError("ERR#0005: funds not found or unable to retrieve.")

    funds.drop(columns=['tag', 'id'], inplace=True)
    funds = funds.where(pd.notnull(funds), None)

    if columns is None:
        columns = funds.columns.tolist()
    else:
        if not isinstance(columns, list):
            raise ValueError(
                "ERR#0020: specified columns argument is not a list, it can just be list type."
            )

    if not all(column in funds.columns.tolist() for column in columns):
        raise ValueError(
            "ERR#0023: specified columns does not exist, available columns are "
            "<country, name, symbol, issuer, isin, asset_class, currency, underlying>"
        )

    if country is None:
        if as_json:
            return json.dumps(funds[columns].to_dict(orient='records'))
        else:
            return funds[columns].to_dict(orient='records')
    else:
        country = unidecode(country.strip().lower())

        if country not in fund_countries_as_list():
            raise ValueError("ERR#0034: country " + country +
                             " not found, check if it is correct.")

        if as_json:
            return json.dumps(
                funds[funds['country'] == country][columns].to_dict(
                    orient='records'))
        else:
            return funds[funds['country'] == country][columns].to_dict(
                orient='records')
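A hedged usage sketch, assuming this is investpy's funds module so that funds.csv and fund_countries_as_list() are available, and that 'spain' is one of the listed countries:

spanish_funds = funds_as_dict(country='spain', columns=['name', 'symbol', 'currency'])
print(spanish_funds[:3])

all_funds_json = funds_as_dict(as_json=True)   # the full data set as a JSON string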
Example #52
0
def remove_roman_characters_column(df):
    column_mandarin_keyword = df['Keyword in Chinese']
    column_no_roman_characters = column_mandarin_keyword.apply(
        lambda x: remove_roman_characters(x) if pd.notnull(x) else x)
    df['Keyword in Chinese'] = column_no_roman_characters
    return df
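The remove_roman_characters helper is not shown above; a plausible, purely hypothetical implementation that strips Latin letters from a mixed keyword might look like:

import re

def remove_roman_characters(text):
    # hypothetical helper: drop ASCII letters, keep everything else (e.g. the Chinese characters)
    return re.sub(r'[A-Za-z]', '', text).strip()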
Example #53
0
def message_df(fitfile=None,
               msgtype='record',
               outfile=None,
               appendunits=True,
               missing='drop',
               addlasttimestamp=False,
               fromR=False):

    #  serial number has numpy/pandas conversion problems
    baddevinfovars = ['serial_number']

    if fitfile is None:
        print("No fitfile given")
        sys.exit(1)

    lasttimestamp = pd.to_datetime(float("NaN"))
    msgdf = pd.DataFrame()

    with fitdecode.FitReader(fitfile) as fit:
        for frame in fit:
            # The yielded frame object is of one of the following types:
            # * fitdecode.FitHeader
            # * fitdecode.FitDefinitionMessage
            # * fitdecode.FitDataMessage
            # * fitdecode.FitCRC
            if isinstance(frame, fitdecode.FitDataMessage):
                # Here, frame is a FitDataMessage object.
                # A FitDataMessage object contains decoded values that
                # are directly usable in your script logic.
                if frame.has_field('timestamp'):
                    lasttimestamp = frame.get_value('timestamp')
                if frame.name == msgtype:
                    msgdict = {}
                    if addlasttimestamp and not frame.has_field('timestamp'):
                        msgdict['timestamp'] = lasttimestamp
                    # Go through all the data entries in this msg
                    for fld in frame.fields:
                        if fld.units and appendunits:
                            keyname = fld.name + "." + fld.units.replace(
                                "/", ".")
                        else:
                            keyname = fld.name

                        if (msgtype == 'device_info') and (fld.name
                                                           in baddevinfovars):
                            msgdict[keyname] = force_to_int(
                                frame.get_value(fld.name,
                                                fallback=float('NaN')))
                        else:
                            msgdict[keyname] = frame.get_value(
                                fld.name, fallback=float('NaN'))

                    # note: DataFrame.append is deprecated in newer pandas; pd.concat is the modern replacement
                    msgdf = msgdf.append(msgdict, ignore_index=True)

    msgdf = msgdf.where((pd.notnull(msgdf)), None)
    if missing == 'drop':
        msgdf.dropna(axis=1, how='all', inplace=True)

    if not fromR:
        print("variables extracted:")
        print("\n".join(str(x) for x in msgdf.columns))
        print("dtypes: ")
        print(msgdf.dtypes)

    if outfile is None:
        return msgdf
    else:
        msgdf.to_json(path_or_buf=outfile, date_format='iso', date_unit='s')
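A minimal usage sketch; the .fit filename is made up and fitdecode/pandas must be installed:

records = message_df(fitfile='my_activity.fit', msgtype='record')   # all 'record' messages as a DataFrame
print(records.head())

message_df(fitfile='my_activity.fit', msgtype='lap', outfile='laps.json')   # write lap messages straight to JSON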
Example #54
0
# imports assumed by this snippet (the original relies on the legacy sklearn.cross_validation API,
# and randomForestStrToNum / testSubset are helpers defined elsewhere in the original script)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold


def main():
    # import data
    filename = "data.csv"
    raw = pd.read_csv(filename)
    originalFrame = raw.copy()

    ################## PREPROCESSING ###########################
    raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw[
        'seconds_remaining']
    raw["last_5_sec_in_period"] = raw["remaining_time"] < 5
    drops = ["minutes_remaining", "seconds_remaining","team_id", "shot_zone_area", \
             'shot_zone_range', 'shot_zone_basic', "game_date", "team_name", "matchup", "lat", "lon", 'game_event_id']
    raw["home_play"] = raw["matchup"].str.contains("vs").astype("int")
    for drop in drops:
        raw = raw.drop(drop, 1)
    raw = randomForestStrToNum(raw)
    nona = raw[pd.notnull(raw['shot_made_flag'])]

    #splitting explanatory and response variables
    train = nona.drop('shot_made_flag', 1)
    train_y = nona['shot_made_flag']

    # setting up KFolds
    seed = 24
    num_folds = 3
    num_rounds = 10

    folds = KFold(len(train),
                  n_folds=num_folds,
                  random_state=seed,
                  shuffle=True)

    model = RandomForestClassifier(n_estimators=200,
                                   max_depth=10,
                                   max_features=0.25,
                                   random_state=seed)
    # model = model.fit(train, train_y)

    #################################################################
    #Looking at specific shots and their predicted probability
    layupFrame = nona.loc[nona["Layup"] == 1]
    fadeawayShotFrame = nona.loc[nona["Fadeaway Jump Shot"] == 1]
    dunkFrame = nona.loc[nona["Dunk"] == 1]

    ################# LAYUPS #########################
    layup_train = train.loc[train["Layup"] == 1]
    layup_train_y = layupFrame['shot_made_flag']

    print("LayupFrame shape: " + str(layupFrame.shape))
    layupScore = testSubset(model, train, train_y, layup_train, layup_train_y,
                            num_rounds, num_folds, seed)
    print(layupScore)

    ################## DUNKS ########################
    dunk_train = train.loc[train["Dunk"] == 1]
    dunk_train_y = dunkFrame['shot_made_flag']

    print("DunkFrame shape: " + str(dunkFrame.shape))
    dunkScore = testSubset(model, train, train_y, dunk_train, dunk_train_y,
                           num_rounds, num_folds, seed)
    print(dunkScore)

    ################## FADEAWAY SHOTS ########################
    fade_train = train.loc[train["Fadeaway Jump Shot"] == 1]
    fade_train_y = fadeawayShotFrame['shot_made_flag']

    print("FadeawayShotFrame shape: " + str(fadeawayShotFrame.shape))
    fadeScore = testSubset(model, train, train_y, fade_train, fade_train_y,
                           num_rounds, num_folds, seed)
    print(fadeScore)
def import_data():
    t2 = time.process_time()

    data = pd.ExcelFile('C://Users//mima//Documents//price_freight_assay_data.xlsx')
    raw_rates = pd.ExcelFile('C://Users//mima//Documents//flat_rates.xlsx')
    trader_assessed = pd.ExcelFile('L://TRADING//ANALYSIS//GLOBAL//Arb Models//Pecking Order 2018.xlsm')
    
    assay = pd.read_excel(data, 'assay', index_col = 'Database_Name').to_dict('index')
    ws = pd.read_excel(data, 'ws')
    expiry_table = pd.read_excel(data, 'expiry', index_col = 'Month')
    ports = pd.read_excel(data, 'ports')
    sub_to_ws = pd.read_excel(data, 'sub_to_ws', header = None)
    sub_to_ws = sub_to_ws.set_index([0]).to_dict()
    
    """table containing the basrah base worldscale that they fix their freight against"""
    basrah_ws_base = pd.read_excel(data, 'basrah_ws_base', index_col = 'YEAR')
    
    
    
    """Take in the crude prices and codes and convert to a dataframe.
    We need to take the first 2 rows of the prices with no headers as this will give us the cude name and the code ascociated
    Then transpose from rows to columns and rename the columns. This will be for later when we determine crude prices basis desired comaprison"""
    #prices_reference = (pd.read_excel(data, 'paper prices', header = None).iloc[0:2,1:]).transpose().rename(columns={0:'Name', 1: 'Code'})  
    
    """Merge the WS table with the prices table, slice df so 2016 onwards (Flat rates last date is 2015). 
    We don't drop rows now as dropping would be dependent on any nans in any column"""
    #total = prices.merge(ws_table, how = 'inner', left_index = True, right_index = True)
    #total = total.merge(paper_prices, how = 'inner', left_index = True, right_index = True)
    #total = total.iloc[total.index > dt(2015,12,31)]
    
    """this new total table generates all the prices in one place for us"""
    total = pd.read_excel(data, 'price_warehouse', header = 4).drop(['Timestamp'])
    total.index = pd.to_datetime(total.index)
    total.sort_index(inplace=True)
    total.fillna(method='ffill', inplace=True)
    total = total[total.index > dt(2015,1,1)]
    
    """We know there are some perculiarities in the data, such as the OSPs. So create this table here to handle. Found out need to shift the prices back a month but in order
    to identify which ones, needed the list of OSP crudes"""
    exceptions = {
            'Arab Extra Light':
                {'ROTTERDAM':{'Code':'AAIQQ00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAWQK00','Index':'BWAVE'},
                 'HOUSTON':{'Code':'AAIQZ00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIQV00','Index':'OMAN/DUBAI'}},
            'Arab Light':
                {'ROTTERDAM':{'Code':'AAIQR00','Index':'BWAVE'},
                'AUGUSTA':{'Code':'AAWQL00','Index':'BWAVE'},
                'HOUSTON':{'Code':'AAIRA00','Index':'WTI'},
                'SINGAPORE':{'Code':'AAIQW00','Index':'OMAN/DUBAI'}},
            'Arab Medium':
                {'ROTTERDAM':{'Code':'AAIQS00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAWQM00','Index':'BWAVE'},
                 'HOUSTON':{'Code':'AAIRB00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIQX00','Index':'OMAN/DUBAI'}},
            'Arab Heavy':
                {'ROTTERDAM':{'Code':'AAIQT00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAWQN00','Index':'BWAVE'},
                 'HOUSTON':{'Code':'AAIRC00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIQY00','Index':'OMAN/DUBAI'}},
            'Basrah Light':
                {'ROTTERDAM':{'Code':'AAIPH00','Index':'Dated'},
                 'AUGUSTA':{'Code':'AAIPH00','Index':'Dated'},
                 'HOUSTON':{'Code':'AAIPG00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIPE00','Index':'OMAN/DUBAI'}},
            'Basrah Heavy':
                {'ROTTERDAM':{'Code':'AAXUC00','Index':'Dated'},
                 'AUGUSTA':{'Code':'AAXUC00','Index':'Dated'},
                 'HOUSTON':{'Code':'AAXUE00','Index':'Mars'},
                 'SINGAPORE':{'Code':'AAXUA00','Index':'OMAN/DUBAI'}},
            'Iranian Heavy':
                {'ROTTERDAM':{'Code':'AAIPB00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAUCH00','Index':'BWAVE'},
                 #'Iranian Heavy':{'HOUSTON':{'Code':abcde,'Index':'WTI'}},
                'SINGAPORE':{'Code':'AAIOY00','Index':'OMAN/DUBAI'}},
            'Iranian Light':
                {'ROTTERDAM':{'Code':'AAIPA00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAUCJ00','Index':'BWAVE'},
                'SINGAPORE':{'Code':'AAIOX00','Index':'OMAN/DUBAI'}},
            'Forozan':
                {'ROTTERDAM':{'Code':'AAIPC00','Index':'BWAVE'},
                'AUGUSTA':{'Code':'AAUCF00','Index':'BWAVE'},
                'SINGAPORE':{'Code':'AAIOZ00','Index':'OMAN/DUBAI'}},
            'Isthmus':{'ROTTERDAM':{'Code':'AAIQC00','Index':'Dated'},
                'AUGUSTA':{'Code':'AAIQC00','Index':'Dated'},
                'HOUSTON':{'Code':'AAIPZ00','Index':'WTI'},
                'SINGAPORE':{'Code':'AAIQE00','Index':'OMAN/DUBAI'}},
            'Maya':{'ROTTERDAM':{'Code':'AAIQB00','Index':'Dated'},
                'AUGUSTA':{'Code':'AAIQB00','Index':'Dated'},
                'HOUSTON':{'Code':'AAIPY00','Index':'WTI'},
                'SINGAPORE':{'Code':'AAIQD00','Index':'OMAN/DUBAI'}}
            }
     
    crudes_to_shift = pd.DataFrame.from_dict({(crude,destination): exceptions[crude][destination] 
            for crude in exceptions.keys() 
            for destination in exceptions[crude].keys()}, 
            orient='index')
    
    """convert the dataseries to a list, then use setr to get the unique items, then convert back to a list"""   
    crudes_to_shift = list(set(list(crudes_to_shift['Code'])))
    
    """Fopr the crudes in the list, I want to resample the series at the month start so there is a common value for the start of each month,
    I then want shift these values by 1 backwards, in this case because we resampled, this automatically means shift abck one month,
    I then want to re-index the new dataframe to conform to where we are putting it back into, and finally I assign the total dataframe where the 
    column headers are equal to the crude list, the new shifted and filled forward values to make sure everything lines up"""
    total[crudes_to_shift] = total[crudes_to_shift].resample('MS').mean().shift(-1, freq='MS').reindex(total.index).fillna(method='ffill')  

    #total['AAXUC00']
    
    """This will help with the date error. Turn the index into a numpy array and then assign the value"""
    if total.index[-1] - total.index[-2] > pd.Timedelta(days=2):
        total.index.values[-1] = total.index[-2] + pd.Timedelta(days=1)


    """Clean the column hedaers so no white spcaes - use simple list comprehension and set headers equal to cleaned"""
    cleaned_column_headers = [i.strip() for i in total.columns.values]
    total.columns = cleaned_column_headers
    
    """The below was get rid of the row in the index that hax NaT against it and then expand to daily and fill backwards"""
    crude_diffs = pd.read_excel(trader_assessed, 'Crude Diffs Traders', header = 0)
    crude_diffs = crude_diffs.loc[pd.notnull(crude_diffs.index)]
    crude_diffs = crude_diffs.drop([name for name in crude_diffs.columns if 'Unnamed' in name], axis=1)

   
    #crude_diffs.index = crude_diffs.index.map(lambda x : x + 1*BDay())
    crude_diffs = crude_diffs.reindex(total.index).fillna(method='bfill').fillna(method='ffill')
    
    """Slice the crude diffs where the dates in the index are the same as the dates in the total dataframe"""
    #crude_diffs = crude_diffs[crude_diffs.index.isin(total.index)]
    crudes_diff_against_osp = ['Basrah Light','Basrah Heavy']
    codes_list = [x for x in crude_diffs.columns if x not in crudes_diff_against_osp]
    
    """Apply the values in crude diffs to the correct codes and dates in the total dataframe"""
    total.update(crude_diffs[codes_list])
    
    
        
    
    """We have to convert the prices that are in absolutes into a diff vs a local index, and if there are, set to zero.
    This is LOOP Sour"""
    total['AALSM01'].loc[total['AALSM01'] > 30] = total['AALSM01'].loc[total['AALSM01'] > 30] - total['CLc1']
    #total.loc[total.index.isin(crude_diffs.index), codes_list] = crude_diffs[codes_list]
    #total[codes_list]
    
    #total.update(crude_diffs[codes_list])
    """ Need this for the sulphur table"""
    forties_sulphur = pd.read_excel(trader_assessed, 'Forties de-esc', header = [22], parse_cols="H:I").set_index('week ending')
    forties_sulphur = forties_sulphur.loc[pd.notnull(forties_sulphur.index)]
    forties_sulphur = forties_sulphur.reindex(total.index).fillna(method='ffill')

    """Also need to adjust the cfds to take into account the inter month BFOE spread"""   
    cfd_list = ['PCAKA00','PCAKC00','PCAKE00','PCAKG00','AAGLU00','AAGLV00','AALCZ00','AALDA00']
    temp = total[cfd_list].sub(pd.Series(total['PCAAQ00'] - total['PCAAR00']), axis=0)
    temp = temp[temp.index > dt(2017,6,30)]
    total.loc[total.index.isin(temp.index), list(temp.columns)] = temp[list(temp.columns)]
    
    """This turns the 5 years of rate matricies into a table for use to reference - 12/04/2018"""    
    rates = []
    for x,y in enumerate([name.split()[2] for name in raw_rates.sheet_names]):
        f  = pd.read_excel(raw_rates, sheetname = x, header = None).iloc[1:47,1:]
        lplen = len(f.iloc[:,1])
        dplen = len(f.iloc[1,:])
        for j in range(1, dplen):
            for i in range(1,lplen):
                LoadPort = f.iloc[i,0]
                DischargePort = f.iloc[0,j]
                Year = y
                Rate = f.iloc[i,j]
                rates.append({'LoadPort':LoadPort, 'DischargePort': DischargePort, 'Year':Year,'Rate':Rate})
            
    rate_data = pd.DataFrame(rates)
    
    """Also initialise the temp df with index of total. Temp df is tol hold the dataseries needed to calculate the freight"""
    df = pd.DataFrame(index=total.index)
    df['Date'] = df.index
    
    """This function allows us to apply the expiration date for the wti futures used to determine what structure we apply to the CMA
    Have tried timing and slight improvment with the blow of 0.2seconds...."""
   
    t = time.process_time()

    for_dates = lambda x: (expiry_table.loc[(expiry_table.index.month == x.month)&(expiry_table.index.year == x.year)]['Expiry']).iat[0]
   
    df['Expiry'] = df['Date'].apply(for_dates)
    df.drop(['Date'], inplace=True, axis=1)
    
    
    
    

    print("df['Expiry'] created successfully: Time was {}".format(time.process_time() - t))
    print("Temp DataFrame created successfully")
    print("import_data() created successfully: Time was {}".format(time.process_time() - t2))
    
    return assay, ws, ports, total, rate_data, sub_to_ws, df, basrah_ws_base, crude_diffs, forties_sulphur, exceptions, crudes_to_shift
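The OSP handling above (resample to month start, shift back one month, re-index to daily) is the trickiest transform in import_data(); a small self-contained sketch of the same idiom on made-up daily data:

import pandas as pd

idx = pd.date_range('2018-01-01', '2018-03-31', freq='D')
daily = pd.DataFrame({'OSP': range(len(idx))}, index=idx)

# monthly average, pushed back one month, then spread back over the daily index
shifted = (daily[['OSP']].resample('MS').mean()
           .shift(-1, freq='MS')
           .reindex(daily.index)
           .fillna(method='ffill'))
print(shifted.head())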
# (assumes earlier in the tutorial: import pandas as pd; from pandas import Series)
'b' in obj2       # works like membership testing on a dict, e.g. checking whether a key is in the index
'e' in obj2

# create a Series from a dict: passing only the dict makes its keys the index
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)      # the returned Series is ordered by key
obj3

# pass a dict together with an explicit index: matching keys are picked up, unmatched index labels get NaN
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4

# pandas' isnull and notnull detect missing data
pd.isnull(obj4)   # returns booleans
pd.notnull(obj4)

# Series also exposes isnull as a method
obj4.isnull()

# Series automatically aligns data on different indexes in arithmetic
obj3
obj4
obj3 + obj4

# both the Series object itself and its index have a 'name' attribute, similar to a label
obj4.name = 'population'      # set the Series object's own 'name' attribute
obj4.index.name = 'state'
obj4

# a Series' index can also be modified in place by assignment
 def prep_pbp_data(df):
     df = df[df['qb_dropback'].notna()]
     df['epa'] = pd.to_numeric(df['epa'])
     df['wpa'] = pd.to_numeric(df['wpa'])
     # Passing
     pass_df = df[(df['qb_dropback'] == "1")
                  & (df['passer_player_name'].notna())]
     cols_pass = [  # "airEPA_Result", "airWPA_Result", "yacEPA_Result", "yacWPA_Result",
         "passer_id",
         "receiver_id",
         "passer_player_name",  # "Passer_salary", "Passer_capHit", "PasserName",
         "receiver_player_name"  # , "Receiver_salary", "Receiver_capHit", "Receiver_position"]
     ]
     for i in cols_pass:
         pass_df = pass_df[pd.notnull(pass_df[i])]
     pass_df = pass_df[(pass_df['receiver_id'] != "None")
                       & (pass_df['passer_id'] != "None")]
     # Rushing
     rush_df = df[(df['qb_dropback'] == "0")]
     cols_rush = [
         "epa",
         "wpa",
         "rusher_player_name",
         "rusher_id",  # "Rusher_capHit", "Rusher_salary"]
     ]
     for k in cols_rush:
         rush_df = rush_df[pd.notnull(rush_df[k])]
     rush_df = rush_df[(rush_df['rusher_id'] != "None")]
     # Receiving
     rec_df = pass_df
     cols_rec = [  # "airEPA_Result", "airWPA_Result", "yacEPA_Result", "yacWPA_Result",
         "passer_id",
         "receiver_id",
         "receiver_player_name",  # "Passer_salary", "Passer_capHit",
         "passer_player_name"
         # , "Receiver_salary", "Receiver_capHit", "Receiver_position"]
     ]
     for g in cols_rec:
         rec_df = rec_df[pd.notnull(rec_df[g])]
     rec_df = rec_df[(rec_df['receiver_id'] != "None")
                     & (rec_df['passer_id'] != "None")]
     # Team Passing
     team_passing = pass_df.groupby(['Season', 'posteam']).agg({
         'epa':
         sum,
         'wpa':
         sum,
         'play_id':
         'count'
     }).reset_index()
     team_passing.rename(columns={
         'play_id': 'Pass_Attempts',
         'epa': 'Pass_EPA',
         'wpa': 'Pass_WPA'
     },
                         inplace=True)
     team_passing['Pass_EPA_Att'] = team_passing['Pass_EPA'] / team_passing[
         'Pass_Attempts']
     team_passing['Pass_WPA_Att'] = team_passing['Pass_WPA'] / team_passing[
         'Pass_Attempts']
     # Team Rushing
     team_rushing = rush_df.groupby(['Season', 'posteam']).agg({
         'epa':
         sum,
         'wpa':
         sum,
         'play_id':
         'count'
     }).reset_index()
     team_rushing.rename(columns={
         'play_id': 'Rush_Attempts',
         'epa': 'Rush_EPA',
         'wpa': 'Rush_WPA'
     },
                         inplace=True)
     team_rushing['Rush_EPA_Att'] = team_rushing['Rush_EPA'] / team_rushing[
         'Rush_Attempts']
     team_rushing['Rush_WPA_Att'] = team_rushing['Rush_WPA'] / team_rushing[
         'Rush_Attempts']
     # Ind Passing
     ind_passing = pass_df.groupby(
         ['Season', 'passer_player_name', 'posteam']).agg({
             'epa':
             sum,
             'wpa':
             sum,
             # 'airEPA_Result': sum,
             'play_id':
             'count',
             # 'Passer_salary': 'max',
             # 'Passer_capHit': 'max'
         }).reset_index()
     ind_passing.rename(columns={
         'play_id': 'Pass_Attempts',
         'epa': 'Pass_EPA',
         'wpa': 'Pass_WPA'
     },
                        inplace=True)
     ind_passing['Pass_EPA_Att'] = ind_passing['Pass_EPA'] / ind_passing[
         'Pass_Attempts']
     ind_passing['Pass_WPA_Att'] = ind_passing['Pass_WPA'] / ind_passing[
         'Pass_Attempts']
     # ind_passing['airEPA_Att'] = ind_passing['airEPA_Result'] / ind_passing['Pass_Attempts']
     ind_passing = ind_passing[(ind_passing['Pass_Attempts'] > 10)]
     # Ind Rushing
     ind_rushing = rush_df.groupby(
         ['Season', 'rusher_player_name', 'posteam']).agg({
             'epa':
             sum,
             'wpa':
             sum,
             'play_id':
             'count',
             # 'Rusher_salary': 'max',
             # 'Rusher_capHit': 'max'
         }).reset_index()
     ind_rushing.rename(columns={
         'play_id': 'Rush_Attempts',
         'epa': 'Rush_EPA',
         'wpa': 'Rush_WPA',
         'rusher_player_name': 'Player'
     },
                        inplace=True)
     ind_rushing['Rush_EPA_Att'] = ind_rushing['Rush_EPA'] / ind_rushing[
         'Rush_Attempts']
     ind_rushing['Rush_WPA_Att'] = ind_rushing['Rush_WPA'] / ind_rushing[
         'Rush_Attempts']
     # ind_rushing = ind_rushing[(ind_rushing['Rush_Attempts'] > 25)]
     # filter out QBs from Rush df
     qbs = ind_passing[(ind_passing['Pass_Attempts'] >
                        15)]['passer_player_name']
     ind_rushing = ind_rushing[~ind_rushing['Player'].isin(qbs)]
     # Ind Receiving
     ind_receiving = rec_df.groupby(
         ['Season', 'receiver_player_name', 'posteam']).agg({
             'epa':
             sum,
             'wpa':
             sum,
             'play_id':
             'count',
             # 'Receiver_salary': 'max',
             # 'Receiver_capHit': 'max'
         }).reset_index()
     ind_receiving.rename(columns={
         'play_id': 'Targets',
         'epa': 'Rec_EPA',
         'wpa': 'Rec_WPA',
         'Receiver': 'Player'
     },
                          inplace=True)
     ind_receiving['Rec_EPA_Target'] = ind_receiving[
         'Rec_EPA'] / ind_receiving['Targets']
     ind_receiving['Rec_WPA_Target'] = ind_receiving[
         'Rec_WPA'] / ind_receiving['Targets']
     # ind_receiving = ind_receiving[(ind_receiving['Targets'] > 25)]
     # Combine ind_rushing and ind_receiving
     merged_ind = pd.merge(
         ind_rushing,
         ind_receiving,
         left_on=["Season", "Player", "posteam"],
         right_on=["Season", "receiver_player_name", "posteam"])
     merged_ind['Opportunities'] = merged_ind['Rush_Attempts'] + merged_ind[
         'Targets']
     merged_ind['Weighted_Rush_EPA'] = merged_ind[
         'Rush_Attempts'] * merged_ind['Rush_EPA_Att']
     merged_ind['Weighted_Rush_WPA'] = merged_ind[
         'Rush_Attempts'] * merged_ind['Rush_WPA_Att']
     merged_ind['Weighted_Target_EPA'] = merged_ind['Targets'] * merged_ind[
         'Rec_EPA_Target']
     merged_ind['Weighted_Target_WPA'] = merged_ind['Targets'] * merged_ind[
         'Rec_WPA_Target']
     merged_ind['Weighted_EPA_Opps'] = (merged_ind['Weighted_Rush_EPA'] + merged_ind['Weighted_Target_EPA']) \
                                       / merged_ind['Opportunities']
     merged_ind['Weighted_WPA_Opps'] = (merged_ind['Weighted_Rush_WPA'] + merged_ind['Weighted_Target_WPA']) \
                                       / merged_ind['Opportunities']
     merged_team = pd.merge(team_passing,
                            team_rushing,
                            on=["posteam", "Season"])
     return merged_team, team_passing, team_rushing, ind_passing, ind_rushing, ind_receiving, merged_ind
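A hedged usage sketch; pbp is a hypothetical play-by-play DataFrame with nflfastR-style columns (qb_dropback, epa, wpa, posteam, Season and the player name/id fields):

pbp = pd.read_csv('play_by_play_2019.csv', low_memory=False)   # hypothetical input file
(merged_team, team_passing, team_rushing,
 ind_passing, ind_rushing, ind_receiving, merged_ind) = prep_pbp_data(pbp)
print(team_passing.sort_values('Pass_EPA_Att', ascending=False).head())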
def create_price_table():
    pass  # body not included in this snippet

total.head()



assay_input = assay_input.where(pd.notnull(assay_input), None).to_dict('records')  # stray fragment; the full version appears in assay_records() below


total.index = pd.to_datetime(total.index)
total.sort_index(inplace=True)
total.fillna(method='ffill', inplace=True)
total = total[total.index > dt(2015,1,1)]


def import_prices_prepare_for_sql():
    # 'data' is the price/freight/assay ExcelFile opened earlier (see import_data above)
    total = pd.read_excel(data, 'price_warehouse', header=4).drop(['Timestamp']).iloc[:10]
    total_descriptions = pd.read_excel(data, 'price_warehouse', header=3).columns
    total.columns = [total_descriptions,total.columns]
    total.index = pd.to_datetime(total.index)
    total.columns.names = ('Series','Code')
    total.index.name = 'Date'
    total = total.unstack().reset_index().rename(columns={0:'Value'})
    total['Value'] = pd.to_numeric(total['Value'], errors='coerce')
    total = total.dropna(subset=['Value'])
    prices_input = total.to_dict('records')
    return prices_input

def basrah_data_records():
    basrah_ws_base = pd.read_excel(data, 'basrah_ws_base', index_col = 'Date') 
    basrah_ws_base_input = pd.DataFrame(basrah_ws_base).unstack().reset_index().rename(columns={'level_0':'Series', 0:'Value'}).to_dict('records') 
    return basrah_ws_base_input

def freight_rates_records():
    raw_rates = pd.ExcelFile('C://Users//mima//Documents//flat_rates.xlsx')
    rates = []
    for x,y in enumerate([name.split()[2] for name in raw_rates.sheet_names]):
        f  = pd.read_excel(raw_rates, sheetname = x, header = None).iloc[1:47,1:]
        lplen = len(f.iloc[:,1])
        dplen = len(f.iloc[1,:])
        for j in range(1, dplen):
            for i in range(1,lplen):
                LoadPort = f.iloc[i,0]
                DischargePort = f.iloc[0,j]
                Year = y
                Rate = f.iloc[i,j]
                rates.append({'LoadPort':LoadPort, 'DischargePort': DischargePort, 'Year':Year,'Rate':Rate})    
    rate_data_input = pd.DataFrame(rates).dropna(axis=0).to_dict('records')
    return rate_data_input

def assay_records():
    assay_input = pd.read_excel(data, 'assay', index_col='Database_Name')
    assay_input.dtypes
    assay_input['RESIDUE_v40'] = assay_input['RESIDUE_v40'].astype(float)
    assay_input['GradesId'] = assay_input['GradesId'].astype(float)
    assay_input = assay_input.reset_index()
    assay_input = assay_input.where(pd.notnull(assay_input),None).to_dict('records')
    return assay_input

def world_scale_records():
    ws_input = pd.read_excel(data, 'ws').to_dict('records')
    return ws_input

def world_scale_mappings_data():
    world_scale_mappings_input = pd.read_excel(data, 'sub_to_ws').to_dict('records')
    return world_scale_mappings_input

def exceptions_data():
    exceptions = {
                'Arab Extra Light':
                    {'ROTTERDAM':{'Code':'AAIQQ00','Index':'BWAVE'},
                     'AUGUSTA':{'Code':'AAWQK00','Index':'BWAVE'},
                     'HOUSTON':{'Code':'AAIQZ00','Index':'WTI'},
                     'SINGAPORE':{'Code':'AAIQV00','Index':'OMAN/DUBAI'}},
                'Arab Light':
                    {'ROTTERDAM':{'Code':'AAIQR00','Index':'BWAVE'},
                    'AUGUSTA':{'Code':'AAWQL00','Index':'BWAVE'},
                    'HOUSTON':{'Code':'AAIRA00','Index':'WTI'},
                    'SINGAPORE':{'Code':'AAIQW00','Index':'OMAN/DUBAI'}},
                'Arab Medium':
                    {'ROTTERDAM':{'Code':'AAIQS00','Index':'BWAVE'},
                     'AUGUSTA':{'Code':'AAWQM00','Index':'BWAVE'},
                     'HOUSTON':{'Code':'AAIRB00','Index':'WTI'},
                     'SINGAPORE':{'Code':'AAIQX00','Index':'OMAN/DUBAI'}},
                'Arab Heavy':
                    {'ROTTERDAM':{'Code':'AAIQT00','Index':'BWAVE'},
                     'AUGUSTA':{'Code':'AAWQN00','Index':'BWAVE'},
                     'HOUSTON':{'Code':'AAIRC00','Index':'WTI'},
                     'SINGAPORE':{'Code':'AAIQY00','Index':'OMAN/DUBAI'}},
                'Basrah Light':
                    {'ROTTERDAM':{'Code':'AAIPH00','Index':'Dated'},
                     'AUGUSTA':{'Code':'AAIPH00','Index':'Dated'},
                     'HOUSTON':{'Code':'AAIPG00','Index':'WTI'},
                     'SINGAPORE':{'Code':'AAIPE00','Index':'OMAN/DUBAI'}},
                'Basrah Heavy':
                    {'ROTTERDAM':{'Code':'AAXUC00','Index':'Dated'},
                     'AUGUSTA':{'Code':'AAXUC00','Index':'Dated'},
                     'HOUSTON':{'Code':'AAXUE00','Index':'Mars'},
                     'SINGAPORE':{'Code':'AAXUA00','Index':'OMAN/DUBAI'}},
                'Iranian Heavy':
                    {'ROTTERDAM':{'Code':'AAIPB00','Index':'BWAVE'},
                     'AUGUSTA':{'Code':'AAUCH00','Index':'BWAVE'},
                     #'Iranian Heavy':{'HOUSTON':{'Code':abcde,'Index':'WTI'}},
                    'SINGAPORE':{'Code':'AAIOY00','Index':'OMAN/DUBAI'}},
                'Iranian Light':
                    {'ROTTERDAM':{'Code':'AAIPA00','Index':'BWAVE'},
                     'AUGUSTA':{'Code':'AAUCJ00','Index':'BWAVE'},
                    'SINGAPORE':{'Code':'AAIOX00','Index':'OMAN/DUBAI'}},
                'Forozan':
                    {'ROTTERDAM':{'Code':'AAIPC00','Index':'BWAVE'},
                    'AUGUSTA':{'Code':'AAUCF00','Index':'BWAVE'},
                    'SINGAPORE':{'Code':'AAIOZ00','Index':'OMAN/DUBAI'}},
                'Isthmus':{'ROTTERDAM':{'Code':'AAIQC00','Index':'Dated'},
                    'AUGUSTA':{'Code':'AAIQC00','Index':'Dated'},
                    'HOUSTON':{'Code':'AAIPZ00','Index':'WTI'},
                    'SINGAPORE':{'Code':'AAIQE00','Index':'OMAN/DUBAI'}},
                'Maya':{'ROTTERDAM':{'Code':'AAIQB00','Index':'Dated'},
                    'AUGUSTA':{'Code':'AAIQB00','Index':'Dated'},
                    'HOUSTON':{'Code':'AAIPY00','Index':'WTI'},
                    'SINGAPORE':{'Code':'AAIQD00','Index':'OMAN/DUBAI'}}
                }
      
    exceptions_input = pd.DataFrame.from_dict({(crude,destination): exceptions[crude][destination] 
            for crude in exceptions.keys() 
            for destination in exceptions[crude].keys()}, 
            orient='index')
    exceptions_input = (exceptions_input.unstack().unstack().reset_index()
                        .rename(columns={'level_0': 'Series', 'level_1': 'Destination',
                                         'level_2': 'Crude', 0: 'Value'})
                        .to_dict('records'))
    return exceptions_input
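
# The double unstack above flattens the nested dict into one record per
# (Series, Destination, Crude) combination, ready for the Exceptions model defined
# further down, e.g. (values taken from the dict above):
#   {'Series': 'Code', 'Destination': 'HOUSTON', 'Crude': 'Maya', 'Value': 'AAIPY00'}
#   {'Series': 'Index', 'Destination': 'HOUSTON', 'Crude': 'Maya', 'Value': 'WTI'}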

def forties_sulphur_records():
    trader_assessed = pd.ExcelFile('L://TRADING//ANALYSIS//GLOBAL//Arb Models//Pecking Order 2018.xlsm')
    forties_sulphur = pd.read_excel(trader_assessed, 'Forties de-esc', header = [22], parse_cols="H:I").set_index('week ending')
    forties_sulphur = forties_sulphur.loc[pd.notnull(forties_sulphur.index)]
    forties_sulphur_input = forties_sulphur.reset_index().rename(columns={'buzzard content':'BuzzardContent','week ending':'Date'}).to_dict('records')
    return forties_sulphur_input



params = urllib.parse.quote("DRIVER={SQL Server Native Client 11.0};SERVER=STCHGS112;DATABASE=MIMAWorkSpace;UID=mima;Trusted_Connection=Yes")
eng = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params, echo=True)

Base = declarative_base()
session = Session(bind=eng)

class Basrah_WS_Base(Base):
    """this tells SQLAlchemy that rows of Basrah_WS_Base table must be mapped to this class"""
    __tablename__ = 'Basrah_WS_Base'
    Id = Column(Integer, primary_key=True)
    Date = Column(Date)
    Series = Column(String(32))
    Value = Column(Float)
    
class Global_Flat_Rates(Base):
    __tablename__ = 'Global_Flat_Rates'
    __table_args__ = {'extend_existing': True} 
    Id = Column(Integer, primary_key=True)
    DischargePort = Column(String(32))
    LoadPort = Column(String(32))
    Rate = Column(Float)
    Year = Column(Integer)
    
class Crude_Assay(Base):
    __tablename__ = 'Crude_Assay'
    __table_args__ = {'extend_existing': True} 
    Id = Column(Integer, primary_key=True)
    Database_Name = Column(String(32))
    H_Comet_Name = Column(String(32))
    Crude_Manager_Name = Column(String(32))
    Gravity = Column(Float)
    API = Column(Float)
    Sulphur = Column(Float)
    Conversion = Column(Float)
    LPG = Column(Float)
    LVN = Column(Float)
    HVN = Column(Float)
    KERO = Column(Float)
    LGO = Column(Float)
    HGO = Column(Float)
    VGO = Column(Float)
    RESIDUE = Column(Float)
    LGO_density = Column(Float)
    HGO_desnsity = Column(Float)
    VGO_density = Column(Float)
    RESIDUE_density = Column(Float)
    LGO_sulphur = Column(Float)
    HGO_sulphur = Column(Float)
    VGO_sulphur = Column(Float)
    RESIDUE_sulphur = Column(Float)
    RESIDUE_v50 = Column(Float)
    RESIDUE_v40 = Column(Float)
    RESIDUE_v100 = Column(Float)
    GradesId = Column(Float)
    Code = Column(String(32))
    Index = Column(String(32))
    Basis = Column(String(32))
    LoadPort = Column(String(32))
    FOBLoadPort = Column(String(32))
    FOBCode = Column(String(32))

class World_Scale_Table(Base):
    __tablename__ = 'World_Scale_Table'
    __table_args__ = {'extend_existing': True} 
    Id = Column(Integer, primary_key=True)
    Name = Column(String(64))
    Origin = Column(String(32))
    Destination = Column(String(32))
    Size = Column(String(32))
    Volume = Column(Integer)
    Terms = Column(String(32))
    Code = Column(String(32))
    bbls = Column(Integer)
    
class World_Scale_Mappings(Base):
    __tablename__ = 'World_Scale_Mappings'
    Id = Column(Integer, primary_key=True)
    Port_SubRegion = Column(String(32))
    WS_Region = Column(String(32))
    local_index = Column(String(32))
    Price_Set = Column(String(32))
    
class Exceptions(Base):
    __tablename__ = 'Exceptions'
    Id = Column(Integer, primary_key=True)
    Crude = Column(String(32))
    Destination = Column(String(32))
    Series = Column(String(32))
    Value = Column(String(32))
    
class Forties_Sulphur(Base):
    __tablename__ = 'Forties_Sulphur'
    Id = Column(Integer, primary_key=True)
    BuzzardContent = Column(Float)
    Date = Column(Date)
    
class Prices(Base):
    __tablename__ = 'Prices'
    Id = Column(Integer, primary_key=True)
    Series = Column(String)
    Code = Column(String(32))
    Date = Column(Date)
    Value = Column(Float)
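
# A minimal sketch of querying one of the mapped tables back through the ORM session
# created above (illustrative only; assumes the tables exist and have been populated,
# and 'Brent' is a hypothetical Series name):
#
#     recent = (session.query(Prices)
#               .filter(Prices.Series == 'Brent')
#               .order_by(Prices.Date.desc())
#               .limit(5)
#               .all())
#     for p in recent:
#         print(p.Date, p.Code, p.Value)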




    

"""Create the tables"""    
Base.metadata.create_all(eng)
    
"""Commit the data to a database"""
session.bulk_insert_mappings(Basrah_WS_Base, basrah_ws_base_input)
session.bulk_insert_mappings(Global_Flat_Rates, rate_data_input)
session.bulk_insert_mappings(Crude_Assay, assay_input)
session.bulk_insert_mappings(World_Scale_Table, ws_input)
session.bulk_insert_mappings(World_Scale_Mappings, world_scale_mappings_input)
session.bulk_insert_mappings(Exceptions, exceptions_input)
session.bulk_insert_mappings(Forties_Sulphur, forties_sulphur_input)
session.bulk_insert_mappings(Prices, prices_input)




session.commit()


# roll back instead if any of the bulk inserts above fail and leave the session dirty
session.rollback()



Base.metadata.bind = eng

DBSession = sessionmaker(bind=eng)



"""EXTRACTION: 
    Create the connection to the database to be able to upload data"""

with eng.connect() as con:
    
    """Load definition of the Basrah_WS_Base table and the connection metadata"""
    meta = MetaData(eng)
    basrah_ws_base = Table('Basrah_WS_Base', meta, autoload=True)
    
    """Create the SQL select statement and execute and print - use table.c.column name for individual columns"""
    stm = select([basrah_ws_base])
    rs = con.execute(stm)  # you can also put a standard query string in here
    db_data = pd.DataFrame(rs.fetchall(), columns=rs.keys())  # renamed so the ExcelFile 'data' used elsewhere is not shadowed
    df = db_data.pivot(columns='Series', index='Date', values='Value')
    print(df)
    
    
    

"""Import data from local files if needed"""
raw_rates = pd.ExcelFile('C://Users//mima//Documents//flat_rates.xlsx')
trader_assessed = pd.ExcelFile('L://TRADING//ANALYSIS//GLOBAL//Arb Models//Pecking Order 2018.xlsm')

assay = pd.read_excel(data, 'assay', index_col = 'Database_Name').to_dict('index')
ws = pd.read_excel(data, 'ws')
expiry_table = pd.read_excel(data, 'expiry', index_col = 'Month')
ports = pd.read_excel(data, 'ports')
sub_to_ws = pd.read_excel(data, 'sub_to_ws', header = None)
sub_to_ws = sub_to_ws.set_index([0]).to_dict()


 



eng.execute("SELECT Series, Date, Value FROM Basrah_WS_Base").fetchall()

basrah_ws_base_flat = pd.DataFrame(eng.execute("SELECT Series, Date, Value FROM Basrah_WS_Base").fetchall(),
                                   columns=['Series', 'Date', 'Value'])
basrah_ws_base = basrah_ws_base_flat.pivot(columns='Series', index='Date', values='Value')





test = pd.DataFrame(refinery_configurations).unstack().reset_index().rename(columns={'index':'configuration'})
pd.DataFrame(refinery_configurations).T.reset_index()

t = time.process_time()
test.to_sql('Refinery_Configs_2', con=eng, index_label='Id', if_exists='replace')
print("Uploaded successfully: Time was {}".format(time.process_time() - t))

test.to_csv('L:/TRADING/ANALYSIS/Python/test.csv', sep='\t')




def retrieve_prices_model():
    """Pivot the flat prices extract (Code/Date/Value rows) into a wide price matrix;
    assumes database_prices has already been pulled from the Prices table."""
    df1 = database_prices.pivot(columns='Code', index='Date', values='Value')
    return df1


import pandas as pd
import numpy as np
from datetime import datetime as dt
import time
from pandas.tseries.offsets import BDay


    
def import_data():
    t2 = time.process_time()

    data = pd.ExcelFile('C://Users//mima//Documents//price_freight_assay_data.xlsx')
    raw_rates = pd.ExcelFile('C://Users//mima//Documents//flat_rates.xlsx')
    trader_assessed = pd.ExcelFile('L://TRADING//ANALYSIS//GLOBAL//Arb Models//Pecking Order 2018.xlsm')
    
    assay = pd.read_excel(data, 'assay', index_col = 'Database_Name').to_dict('index')
    ws = pd.read_excel(data, 'ws')
    expiry_table = pd.read_excel(data, 'expiry', index_col = 'Month')
    ports = pd.read_excel(data, 'ports')
    sub_to_ws = pd.read_excel(data, 'sub_to_ws', header = None)
    sub_to_ws = sub_to_ws.set_index([0]).to_dict()
    
    """table containing the basrah base worldscale that they fix their freight against"""
    basrah_ws_base = pd.read_excel(data, 'basrah_ws_base', index_col = 'YEAR')
    
    
    
    """Take in the crude prices and codes and convert to a dataframe.
    We need to take the first 2 rows of the prices with no headers as this will give us the cude name and the code ascociated
    Then transpose from rows to columns and rename the columns. This will be for later when we determine crude prices basis desired comaprison"""
    #prices_reference = (pd.read_excel(data, 'paper prices', header = None).iloc[0:2,1:]).transpose().rename(columns={0:'Name', 1: 'Code'})  
    
    """Merge the WS table with the prices table, slice df so 2016 onwards (Flat rates last date is 2015). 
    We don't drop rows now as dropping would be dependent on any nans in any column"""
    #total = prices.merge(ws_table, how = 'inner', left_index = True, right_index = True)
    #total = total.merge(paper_prices, how = 'inner', left_index = True, right_index = True)
    #total = total.iloc[total.index > dt(2015,12,31)]
    
    """this new total table generates all the prices in one place for us"""
    total = pd.read_excel(data, 'price_warehouse', header = 4).drop(['Timestamp'])
    total.index = pd.to_datetime(total.index)
    total.sort_index(inplace=True)
    total.fillna(method='ffill', inplace=True)
    total = total[total.index > dt(2015,1,1)]
    
    """We know there are some perculiarities in the data, such as the OSPs. So create this table here to handle. Found out need to shift the prices back a month but in order
    to identify which ones, needed the list of OSP crudes"""
    exceptions = {
            'Arab Extra Light':
                {'ROTTERDAM':{'Code':'AAIQQ00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAWQK00','Index':'BWAVE'},
                 'HOUSTON':{'Code':'AAIQZ00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIQV00','Index':'OMAN/DUBAI'}},
            'Arab Light':
                {'ROTTERDAM':{'Code':'AAIQR00','Index':'BWAVE'},
                'AUGUSTA':{'Code':'AAWQL00','Index':'BWAVE'},
                'HOUSTON':{'Code':'AAIRA00','Index':'WTI'},
                'SINGAPORE':{'Code':'AAIQW00','Index':'OMAN/DUBAI'}},
            'Arab Medium':
                {'ROTTERDAM':{'Code':'AAIQS00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAWQM00','Index':'BWAVE'},
                 'HOUSTON':{'Code':'AAIRB00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIQX00','Index':'OMAN/DUBAI'}},
            'Arab Heavy':
                {'ROTTERDAM':{'Code':'AAIQT00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAWQN00','Index':'BWAVE'},
                 'HOUSTON':{'Code':'AAIRC00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIQY00','Index':'OMAN/DUBAI'}},
            'Basrah Light':
                {'ROTTERDAM':{'Code':'AAIPH00','Index':'Dated'},
                 'AUGUSTA':{'Code':'AAIPH00','Index':'Dated'},
                 'HOUSTON':{'Code':'AAIPG00','Index':'WTI'},
                 'SINGAPORE':{'Code':'AAIPE00','Index':'OMAN/DUBAI'}},
            'Basrah Heavy':
                {'ROTTERDAM':{'Code':'AAXUC00','Index':'Dated'},
                 'AUGUSTA':{'Code':'AAXUC00','Index':'Dated'},
                 'HOUSTON':{'Code':'AAXUE00','Index':'Mars'},
                 'SINGAPORE':{'Code':'AAXUA00','Index':'OMAN/DUBAI'}},
            'Iranian Heavy':
                {'ROTTERDAM':{'Code':'AAIPB00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAUCH00','Index':'BWAVE'},
                 #'Iranian Heavy':{'HOUSTON':{'Code':abcde,'Index':'WTI'}},
                'SINGAPORE':{'Code':'AAIOY00','Index':'OMAN/DUBAI'}},
            'Iranian Light':
                {'ROTTERDAM':{'Code':'AAIPA00','Index':'BWAVE'},
                 'AUGUSTA':{'Code':'AAUCJ00','Index':'BWAVE'},
                'SINGAPORE':{'Code':'AAIOX00','Index':'OMAN/DUBAI'}},
            'Forozan':
                {'ROTTERDAM':{'Code':'AAIPC00','Index':'BWAVE'},
                'AUGUSTA':{'Code':'AAUCF00','Index':'BWAVE'},
                'SINGAPORE':{'Code':'AAIOZ00','Index':'OMAN/DUBAI'}},
            'Isthmus':{'ROTTERDAM':{'Code':'AAIQC00','Index':'Dated'},
                'AUGUSTA':{'Code':'AAIQC00','Index':'Dated'},
                'HOUSTON':{'Code':'AAIPZ00','Index':'WTI'},
                'SINGAPORE':{'Code':'AAIQE00','Index':'OMAN/DUBAI'}},
            'Maya':{'ROTTERDAM':{'Code':'AAIQB00','Index':'Dated'},
                'AUGUSTA':{'Code':'AAIQB00','Index':'Dated'},
                'HOUSTON':{'Code':'AAIPY00','Index':'WTI'},
                'SINGAPORE':{'Code':'AAIQD00','Index':'OMAN/DUBAI'}}
            }
     
    crudes_to_shift = pd.DataFrame.from_dict({(crude,destination): exceptions[crude][destination] 
            for crude in exceptions.keys() 
            for destination in exceptions[crude].keys()}, 
            orient='index')
    
    """convert the dataseries to a list, then use setr to get the unique items, then convert back to a list"""   
    crudes_to_shift = list(set(list(crudes_to_shift['Code'])))
    
    """Fopr the crudes in the list, I want to resample the series at the month start so there is a common value for the start of each month,
    I then want shift these values by 1 backwards, in this case because we resampled, this automatically means shift abck one month,
    I then want to re-index the new dataframe to conform to where we are putting it back into, and finally I assign the total dataframe where the 
    column headers are equal to the crude list, the new shifted and filled forward values to make sure everything lines up"""
    total[crudes_to_shift] = total[crudes_to_shift].resample('MS').mean().shift(-1, freq='MS').reindex(total.index).fillna(method='ffill')  
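
    # Illustration of the shift above on a toy series (hypothetical values, not part of the model):
    #   s = pd.Series(range(90), index=pd.date_range('2018-01-01', periods=90))
    #   s.resample('MS').mean().shift(-1, freq='MS').reindex(s.index).fillna(method='ffill')
    # every day in January now carries February's month-start average, i.e. the OSP quote is pulled back one month.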

    #total['AAXUC00']
    
    """This will help with the date error. Turn the index into a numpy array and then assign the value"""
    if total.index[-1] - total.index[-2] > pd.Timedelta(days=2):
        total.index.values[-1] = total.index[-2] + pd.Timedelta(days=1)


    """Clean the column hedaers so no white spcaes - use simple list comprehension and set headers equal to cleaned"""
    cleaned_column_headers = [i.strip() for i in total.columns.values]
    total.columns = cleaned_column_headers
    
    """The below was get rid of the row in the index that hax NaT against it and then expand to daily and fill backwards"""
    crude_diffs = pd.read_excel(trader_assessed, 'Crude Diffs Traders', header = 0)
    crude_diffs = crude_diffs.loc[pd.notnull(crude_diffs.index)]
    crude_diffs = crude_diffs.drop([name for name in crude_diffs.columns if 'Unnamed' in name], axis=1)

   
    #crude_diffs.index = crude_diffs.index.map(lambda x : x + 1*BDay())
    crude_diffs = crude_diffs.reindex(total.index).fillna(method='bfill').fillna(method='ffill')
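
    # Sketch of what the reindex above does (illustrative numbers only): a weekly assessment such as
    #   2018-01-05 -> 0.50, 2018-01-12 -> 0.65
    # reindexed to the daily total.index becomes 0.65 for 2018-01-06..2018-01-12 via bfill,
    # with the trailing ffill only filling dates after the last assessment.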
    
    """Slice the crude diffs where the dates in the index are the same as the dates in the total dataframe"""
    #crude_diffs = crude_diffs[crude_diffs.index.isin(total.index)]
    crudes_diff_against_osp = ['Basrah Light','Basrah Heavy']
    codes_list = [x for x in crude_diffs.columns if x not in crudes_diff_against_osp]
    
    """Apply the values in crude diffs to the correct codes and dates in the total dataframe"""
    total.update(crude_diffs[codes_list])
    
    
        
    
    """We have to convert the prices that are in absolutes into a diff vs a local index, and if there are, set to zero.
    This is LOOP Sour"""
    total['AALSM01'].loc[total['AALSM01'] > 30] = total['AALSM01'].loc[total['AALSM01'] > 30] - total['CLc1']
    #total.loc[total.index.isin(crude_diffs.index), codes_list] = crude_diffs[codes_list]
    #total[codes_list]
    
    #total.update(crude_diffs[codes_list])
    """ Need this for the sulphur table"""
    forties_sulphur = pd.read_excel(trader_assessed, 'Forties de-esc', header = [22], parse_cols="H:I").set_index('week ending')
    forties_sulphur = forties_sulphur.loc[pd.notnull(forties_sulphur.index)]
    forties_sulphur = forties_sulphur.reindex(total.index).fillna(method='ffill')

    """Also need to adjust the cfds to take into account the inter month BFOE spread"""   
    cfd_list = ['PCAKA00','PCAKC00','PCAKE00','PCAKG00','AAGLU00','AAGLV00','AALCZ00','AALDA00']
    temp = total[cfd_list].sub(pd.Series(total['PCAAQ00'] - total['PCAAR00']), axis=0)
    temp = temp[temp.index > dt(2017,6,30)]
    total.loc[total.index.isin(temp.index), list(temp.columns)] = temp[list(temp.columns)]
    
    """This turns the 5 years of rate matricies into a table for use to reference - 12/04/2018"""    
    rates = []
    for x,y in enumerate([name.split()[2] for name in raw_rates.sheet_names]):
        f  = pd.read_excel(raw_rates, sheetname = x, header = None).iloc[1:47,1:]
        lplen = len(f.iloc[:,1])
        dplen = len(f.iloc[1,:])
        for j in range(1, dplen):
            for i in range(1,lplen):
                LoadPort = f.iloc[i,0]
                DischargePort = f.iloc[0,j]
                Year = y
                Rate = f.iloc[i,j]
                rates.append({'LoadPort':LoadPort, 'DischargePort': DischargePort, 'Year':Year,'Rate':Rate})
            
    rate_data = pd.DataFrame(rates)
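
    # Each row of rate_data is one (LoadPort, DischargePort, Year) flat rate, e.g.
    # (illustrative values only; port names and rate are hypothetical):
    #   {'LoadPort': 'Basrah', 'DischargePort': 'Rotterdam', 'Year': '2018', 'Rate': 7.85}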
    
    """Also initialise the temp df with index of total. Temp df is tol hold the dataseries needed to calculate the freight"""
    df = pd.DataFrame(index=total.index)
    df['Date'] = df.index
    
    """This function allows us to apply the expiration date for the wti futures used to determine what structure we apply to the CMA
    Have tried timing and slight improvment with the blow of 0.2seconds...."""
   
    t = time.process_time()

    for_dates = lambda x: (expiry_table.loc[(expiry_table.index.month == x.month)&(expiry_table.index.year == x.year)]['Expiry']).iat[0]
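
    # for_dates maps each daily date to that calendar month's futures expiry held in expiry_table,
    # e.g. (hypothetically) every date in March 2018 -> the single 'Expiry' value stored against March 2018.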
   
    df['Expiry'] = df['Date'].apply(for_dates)
    df.drop(['Date'], inplace=True, axis=1)
    
    
    
    

    print("df['Expiry'] created successfully: Time was {}".format(time.process_time() - t))
    print("Temp DataFrame created successfully")
    print("import_data() created successfully: Time was {}".format(time.process_time() - t2))
    
    return assay, ws, ports, total, rate_data, sub_to_ws, df, basrah_ws_base, crude_diffs, forties_sulphur, exceptions, crudes_to_shift


cxn = pyodbc.connect('Driver=SQL Server Native Client 11.0;'
                                    'Server=STCHGS112;'
                                    'Database=MIMAWorkSpace;'
                                    'uid=mima;'
                                    'Trusted_Connection=Yes;')
    
query = '''CREATE TABLE Global_Arbs_GPWs (
Example #59
0
def read_bpp_from_input_file(
    filename: str,
    expand_orient: bool = False,
    expand_strand: bool = False,
    expand_svtype: bool = False,
    integer_columns: Set[str] = INTEGER_COLUMNS,
    float_columns: Set[str] = FLOAT_COLUMNS,
    required_columns: Set[str] = set(),
    add_default: Dict[str, Any] = {},
    summary: bool = False,
    apply: Dict[str, Callable] = {},
    overwrite: Dict[str, Any] = {},
) -> List[BreakpointPair]:
    """
    reads a file using the tab module. Each row is converted to a breakpoint pair and
    other column data is stored in the data attribute

    Args:
        filename: path to the input file
        expand_orient: expand not-specified orientations to all specific orientations
        expand_strand: expand not-specified strands to all specific strands (only applied when the input is stranded)
        expand_svtype: expand rows without an event type into all compatible SV types
        summary: the input is post-summary so some float/int columns have been merged and delimited with semi-colons
        overwrite: set column values for all breakpoints, if the column exists overwrite its current value

    Returns:
        a list of pairs
    """
    def soft_null_cast(value):
        try:
            return cast_null(value)
        except TypeError:
            return value

    if summary:
        integer_columns = integer_columns - SUMMARY_LIST_COLUMNS
        float_columns = float_columns - SUMMARY_LIST_COLUMNS

    try:
        df = pd.read_csv(
            filename,
            dtype={
                **{col: pd.Int64Dtype()
                   for col in integer_columns},
                **{col: float
                   for col in float_columns},
                **{
                    col: str
                    for col in COLUMNS.keys() if col not in (float_columns | integer_columns)
                },
            },
            sep='\t',
            comment='#',
            na_values=[
                'None', 'none', 'N/A', 'n/a', 'null', 'NULL', 'Null', 'nan',
                '<NA>', 'NaN'
            ],
        )
        df = df.where(pd.notnull(df), None)
    except pd.errors.EmptyDataError:
        return []

    for col in required_columns:
        if col not in df and col not in add_default:
            raise KeyError(f'missing required column: {col}')

    if COLUMNS.opposing_strands in df:
        df[COLUMNS.opposing_strands] = df[COLUMNS.opposing_strands].apply(
            lambda x: None if x == '?' else soft_cast(x, cast_type=bool))
    else:
        df[COLUMNS.opposing_strands] = None

    if COLUMNS.stranded in df:
        df[COLUMNS.stranded] = df[COLUMNS.stranded].apply(cast_boolean)
    else:
        df[COLUMNS.stranded] = None

    if COLUMNS.untemplated_seq in df:
        df[COLUMNS.untemplated_seq] = df[COLUMNS.untemplated_seq].apply(
            soft_null_cast)
    else:
        df[COLUMNS.untemplated_seq] = None

    for col in [COLUMNS.break1_chromosome, COLUMNS.break2_chromosome]:
        df[col] = df[col].apply(lambda v: re.sub(r'^chr', '', v))

    if COLUMNS.tracking_id not in df:
        df[COLUMNS.tracking_id] = ''
    else:
        df[COLUMNS.tracking_id] = df[COLUMNS.tracking_id].fillna(str(uuid()))

    # add default values
    for col, default_value in add_default.items():
        if col in df:
            df[col] = df[col].fillna(default_value)
        else:
            df[col] = default_value

    # run the custom functions
    for col, func in apply.items():
        df[col] = df[col].apply(func)

    # set overwriting defaults
    for col, value in overwrite.items():
        df[col] = value

    # enforce controlled vocabulary
    for vocab, cols in [
        (ORIENT, [COLUMNS.break1_orientation, COLUMNS.break2_orientation]),
        (STRAND, [COLUMNS.break1_strand, COLUMNS.break2_strand]),
        (PROTOCOL, [COLUMNS.protocol]),
    ]:
        for col in cols:
            if col in df:
                df[col].apply(lambda c: vocab.enforce(c))  # type: ignore
            elif hasattr(vocab, 'NS'):
                df[col] = vocab.NS  # type: ignore

    def validate_pipeline_id(value):
        if not re.match(r'^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$', value):
            raise AssertionError(
                'All mavis pipeline step ids must satisfy the regex:',
                '^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$',
                value,
            )

    for col in [
            COLUMNS.cluster_id, COLUMNS.annotation_id, COLUMNS.validation_id
    ]:
        if col in df:
            try:
                df[col].apply(validate_pipeline_id)
            except AssertionError as err:
                raise AssertionError(f'error in column ({col}): {err}')

    rows = df.where(df.notnull(), None).to_dict('records')
    non_data_columns = {
        COLUMNS.break1_chromosome,
        COLUMNS.break1_position_start,
        COLUMNS.break1_position_end,
        COLUMNS.break1_strand,
        COLUMNS.break1_orientation,
        COLUMNS.break2_chromosome,
        COLUMNS.break2_position_start,
        COLUMNS.break2_position_end,
        COLUMNS.break2_strand,
        COLUMNS.break2_orientation,
        COLUMNS.stranded,
        COLUMNS.opposing_strands,
        COLUMNS.untemplated_seq,
    }
    pairs: List[BreakpointPair] = []

    for line_index, row in enumerate(rows):
        row['line_no'] = line_index + 1

        if '_index' in row:
            del row['_index']
        for attr, val in row.items():
            row[attr] = soft_null_cast(val)

        stranded = row[COLUMNS.stranded]

        strand1 = row[COLUMNS.break1_strand] if stranded else STRAND.NS
        strand2 = row[COLUMNS.break2_strand] if stranded else STRAND.NS

        temp = []
        expand_strand = stranded and expand_strand
        event_type = [None]
        if not pd.isnull(row.get(COLUMNS.event_type)):
            try:
                event_type = row[COLUMNS.event_type].split(';')
                for putative_event_type in event_type:
                    SVTYPE.enforce(putative_event_type)
            except KeyError:
                pass

        for orient1, orient2, strand1, strand2, putative_event_type in itertools.product(
                ORIENT.expand(row[COLUMNS.break1_orientation])
                if expand_orient else [row[COLUMNS.break1_orientation]],
                ORIENT.expand(row[COLUMNS.break2_orientation])
                if expand_orient else [row[COLUMNS.break2_orientation]],
                STRAND.expand(strand1)
                if expand_strand and stranded else [strand1],
                STRAND.expand(strand2)
                if expand_strand and stranded else [strand2],
                event_type,
        ):
            try:
                break1 = Breakpoint(
                    row[COLUMNS.break1_chromosome],
                    row[COLUMNS.break1_position_start],
                    row[COLUMNS.break1_position_end],
                    strand=strand1,
                    orient=orient1,
                )
                break2 = Breakpoint(
                    row[COLUMNS.break2_chromosome],
                    row[COLUMNS.break2_position_start],
                    row[COLUMNS.break2_position_end],
                    strand=strand2,
                    orient=orient2,
                )

                data = {
                    k: v
                    for k, v in row.items() if k not in non_data_columns
                }
                bpp = BreakpointPair(
                    break1,
                    break2,
                    opposing_strands=row[COLUMNS.opposing_strands],
                    untemplated_seq=row[COLUMNS.untemplated_seq],
                    stranded=row[COLUMNS.stranded],
                )
                bpp.data.update(data)
                if putative_event_type:
                    bpp.data[COLUMNS.event_type] = putative_event_type
                    if putative_event_type not in BreakpointPair.classify(bpp):
                        raise InvalidRearrangement(
                            'error: expected one of',
                            BreakpointPair.classify(bpp),
                            'but found',
                            putative_event_type,
                            str(bpp),
                            row,
                        )
                if expand_svtype and not putative_event_type:
                    for svtype in BreakpointPair.classify(
                            bpp, distance=lambda x, y: Interval(y - x)):
                        new_bpp = bpp.copy()
                        new_bpp.data[COLUMNS.event_type] = svtype
                        temp.append(new_bpp)
                else:
                    temp.append(bpp)
            except InvalidRearrangement as err:
                if not any([expand_strand, expand_svtype, expand_orient]):
                    raise err
            except AssertionError as err:
                if not expand_strand:
                    raise err
        if not temp:
            raise InvalidRearrangement(
                'could not produce a valid rearrangement', row)
        else:
            pairs.extend(temp)
    return pairs
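
# Minimal usage sketch of the reader above (the file name is hypothetical):
#   bpps = read_bpp_from_input_file('breakpoint_pairs.tab', expand_orient=True, expand_svtype=True)
#   for bpp in bpps[:5]:
#       print(bpp.data.get(COLUMNS.event_type), bpp.break1, bpp.break2)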