Example #1
df["age"]=df["age"].fillna('',inplace=True)
df["age"]=pd.to_numeric(df['age'].astype(float), errors='ignore')

print("데이터 타입확인:", df.dtypes)
df['price']=df['price'].astype(str).str.strip()
df["price"].fillna('',inplace=True)
df["price"]
df["price"] = pd.to_numeric(df['price'], errors='ignore')
df["price"]=df["price"].astype(float)
print("데이터 타입확인:", df.dtypes)

# Panel (3-D): if the item sizes differ, pandas aligns them to the largest one
data = {'Item1' : pd.DataFrame(np.random.randn(4, 3)),
'Item2' : pd.DataFrame(np.random.randn(4, 2)),
'Item3' : pd.DataFrame(np.random.randn(4, 2))}
p = pd.Panel(data)
print(p)
print(p['Item1'])
print(p['Item2'])
print(p['Item3'])
print(p.major_xs(0))
print(p.minor_xs(1))

# Major_axis axis: 0 to 3  (rows)
# Minor_axis axis: 0 to 2  (columns)
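
pd.Panel was removed in pandas 1.0, so as a hedged sketch, the same "align to the largest item" behaviour can be reproduced with a dict of DataFrames concatenated into a MultiIndex frame (same Item1/Item2/Item3 data as above):

import numpy as np
import pandas as pd

data = {'Item1': pd.DataFrame(np.random.randn(4, 3)),
        'Item2': pd.DataFrame(np.random.randn(4, 2)),
        'Item3': pd.DataFrame(np.random.randn(4, 2))}

# pd.concat with a dict builds an (item, row) MultiIndex; the smaller frames are
# padded with NaN in the missing columns, mirroring Panel's alignment rule.
wide = pd.concat(data, axis=0)
print(wide.loc['Item1'])          # like p['Item1']
print(wide.xs(0, level=1))        # row 0 across all items, like p.major_xs(0)
print(wide[1].unstack(level=0))   # column 1 across all items, like p.minor_xs(1)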


#  1) Load dataset.csv and print the resident and position columns


#  2) Find out why columns that hold numeric data do not show up when checking with describe()
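
A hedged sketch of what exercise 2 is pointing at: columns that were read in as strings (object dtype) are skipped by describe(), so they have to be converted first (the resident and position names come from exercise 1; everything else is illustrative):

import pandas as pd

df = pd.read_csv('dataset.csv')
print(df[['resident', 'position']])       # exercise 1

print(df.dtypes)                          # numeric-looking columns may actually be 'object'
print(df.describe())                      # object columns are excluded from the numeric summary
print(df.describe(include='all'))         # include every column
df = df.apply(pd.to_numeric, errors='ignore')   # convert the columns that can be converted
print(df.describe())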
Example #2
def FrameExtract(path1, path2):
    print(f"Working with video file {path1[path1.find('/')+1:]} ")
    video_obj = cv2.VideoCapture(path1)  #Video object
    detector = dlib.get_frontal_face_detector()  #Face detector
    predictor = dlib.shape_predictor(
        path2
    )  #Landmark identifier. Set the filename to whatever you named the downloaded file
    count = 0
    ret = 1
    p = []

    while ret:  #Runs ONCE for each FRAME
        ret, frame = video_obj.read()
        if (ret == 0):
            continue
        gray = cv2.cvtColor(frame,
                            cv2.COLOR_BGR2GRAY)  #Convert image to grayscale
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        clahe_image = clahe.apply(gray)
        detections = detector(clahe_image, 1)
        #Detect the faces in the image
        for k, d in enumerate(detections):
            #print("enter loop") #For each detected face
            shape = predictor(clahe_image, d)
            #shape1 = face_utils.shape_to_np(shape)  # alternative that uses the imutils library
            #print(shape1)
            #print(shape.part(i).y)#Get coordinates
            #print(shape)
        # NOTE: assumes at least one face was detected above; `shape` comes from the last detection
        vec = np.empty([68, 2], dtype=int)
        for b in range(68):
            vec[b][0] = shape.part(b).x
            vec[b][1] = shape.part(b).y
        #print(vec.shape)
        v = vec.tolist()
        #print(type(v))
        #print(type(p))
        p.append(v)

        #
        #print(shape1-vec)
        #cv2.imshow("image", frame) #Display the frame
        #print("enter after loop")
        if not os.path.exists(path1[:path1.find(".")]):
            os.makedirs(path1[:path1.find(".")])
        #print(count)
        #cv2.imwrite(path1[:path1.find(".")]+"/frame%d.jpg" % count, imag)

        count += 1
    p1 = np.array(p)
    print(p1.shape)
    panel = pd.Panel(
        p1,
        items=['Frame {}'.format(i) for i in range(0, p1.shape[0])],
        major_axis=['Landmark {}'.format(i) for i in range(0, 68)],
        minor_axis=['x', 'y'])
    pnl = panel.to_frame()
    #print(p1)
    #print(pnl)
    pnl.to_csv(path1[:path1.find(".")] + '/landmark_points_dataframe.csv')
    np.save(path1[:path1.find(".")] + '/landmark_points_array.out', p1)
    #np.savetxt(path1[:path1.find(".")]+"/reshaped.txt", p1.reshape((3,-1)), fmt="%s", header=str(p1.shape))
    return p1
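
A possible call, assuming a video under a videos/ directory and dlib's standard 68-point predictor file (both paths are placeholders):

landmarks = FrameExtract('videos/sample.mp4',
                         'shape_predictor_68_face_landmarks.dat')
print(landmarks.shape)   # roughly (number_of_frames, 68, 2)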
Example #3
    def export_results_as_panel(self):
        """Convert results into a pandas panel object for easier data visualization

        Returns
        -------
        output : pandas.Panel
            A panel containing the results of the testing.
        """
        problemIDs = [problem.id for problem in self.problems]
        configIDs = [config.id for config in self.configs]

        # Make a dummy TestResults instance to generate labels:
        dummy = TestResults(TestProblem(None, None),
                            SolverConfiguration(None, None, None, None))
        attributes = inspect.getmembers(dummy,
                                        lambda a: not (inspect.isroutine(a)))
        labels = [
            label[0] for label in attributes
            if not (label[0].startswith('__') and label[0].endswith('__'))
        ]
        # Unpack size_metrics label with another dummy
        dummy = cvx.Problem(cvx.Minimize(cvx.Variable())).size_metrics
        attributes = inspect.getmembers(dummy,
                                        lambda a: not (inspect.isroutine(a)))
        size_metrics_labels = [label[0] for label in attributes if not(label[0].startswith('__') and \
                                                                       label[0].endswith('__'))]

        labels += size_metrics_labels

        # Remove unused columns
        labels.remove("size_metrics")
        labels.remove("test_problem")
        labels.remove("config")

        output = pd.Panel(items=labels,
                          major_axis=problemIDs,
                          minor_axis=configIDs)
        for result in self._results:
            result_dict = result.__dict__

            # Unpack the size_metrics object inside it:
            sizemetrics_dict = result_dict["size_metrics"].__dict__
            del (result_dict["size_metrics"])

            result_dict.update(sizemetrics_dict)

            problemID = result_dict["test_problem"]
            del (result_dict["test_problem"])
            configID = result_dict["config"]
            del (result_dict["config"])

            for key, value in list(result_dict.items()):
                output.loc[key, problemID, configID] = value

        # Compute Statistics
        try:
            TestFramework.compute_mosek_error(output, "opt_val",
                                              "mosek_config")
        except (KeyError):
            print(
                "TestFramework.compute_mosek_error: 'mosek_config' or 'opt_val' field not found."
            )
        try:
            TestFramework.compute_performance(output, "solve_time")
        except (KeyError):
            print(
                "TestFramework.compute_performance: 'solve_time' field not found."
            )
        return output
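
Since the panel is laid out with items=labels, major_axis=problemIDs and minor_axis=configIDs, one statistic can be pulled out as a problem-by-config DataFrame; a small hedged sketch (the framework instance name and the presence of a solve_time field are assumptions):

results = framework.export_results_as_panel()   # `framework`: a TestFramework instance
solve_times = results['solve_time']             # rows = problem IDs, columns = config IDs
print(solve_times.describe())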
Example #4
f2=DataFrame(data,columns=['name','price','marks'])
print(f2)

f3=DataFrame(data,columns=['name','marks','price'],index=['a','b','c'])
print(f3)


f4=DataFrame(data={"lang":{"firstline":"python","secondline":"java"}, "price":{"firstline":8000}})
print(f4)

print(DataFrame(data={"lang":{"firstline":"python","secondline":"java"}, "price":{"firstline":8000}},index=["firstline","secondline","thirdline"]))

print(f3['name'])

f6=DataFrame(data={'username':{'first':'wangxing','second':'dadiao'},'age':{'first':24,'second':25}},columns=['username','age','sex']);
print(f6)

f6['sex']='man'
print(f6)

f6.loc['second', 'age'] = 30   # use .loc instead of chained indexing so the assignment is not lost on a copy
print(f6)

rng = pd.date_range('1/1/2013',periods=5,freq='D')
data = np.random.randn(5, 4)
cols = ['A','B','C','D']
df1, df2, df3 = pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols)
pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3});
print(pf['df1'])
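
Because df1, df2 and df3 are built from the same array, the three items of pf are identical; cross-sections along the other two axes work the same way as in the earlier example:

print(pf.major_xs(rng[0]))   # one date across all items
print(pf.minor_xs('A'))      # column 'A' across all items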
Example #5
 def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
     if raw_data.shape[2] == 0:
         return pd.Panel(items=factor_names, major_axis=dts, minor_axis=ids)
     return raw_data.loc[:, dts, ids]
Example #6
def initialize(context):

    set_benchmark('000300.XSHG')
    set_option('use_real_price', True)
    set_order_cost(OrderCost(open_tax=0, close_tax=0.001,
                             open_commission=0.0003, close_commission=0.0003,
                             close_today_commission=0, min_commission=5), type='stock')

    #context.portfolios=['000001.XSHE', '000002.XSHE', '000008.XSHE', '000009.XSHE', '000027.XSHE', '000039.XSHE', '000060.XSHE', '000061.XSHE', '000063.XSHE', '000069.XSHE', '000100.XSHE', '000156.XSHE', '000157.XSHE', '000166.XSHE', '000333.XSHE', '000338.XSHE', '000402.XSHE', '000413.XSHE', '000415.XSHE', '000423.XSHE', '000425.XSHE', '000503.XSHE', '000538.XSHE', '000540.XSHE', '000555.XSHE', '000559.XSHE', '000568.XSHE', '000623.XSHE', '000625.XSHE', '000627.XSHE', '000630.XSHE', '000651.XSHE', '000671.XSHE', '000686.XSHE', '000709.XSHE', '000712.XSHE', '000718.XSHE', '000725.XSHE', '000728.XSHE', '000738.XSHE', '000750.XSHE', '000768.XSHE', '000776.XSHE', '000778.XSHE', '000783.XSHE', '000792.XSHE', '000793.XSHE', '000800.XSHE', '000826.XSHE', '000839.XSHE', '000858.XSHE', '000876.XSHE', '000895.XSHE', '000917.XSHE', '000938.XSHE', '000963.XSHE', '000977.XSHE', '000983.XSHE', '001979.XSHE', '002007.XSHE', '002008.XSHE', '002024.XSHE', '002027.XSHE', '002049.XSHE', '002065.XSHE']

    context.portfolios = get_index_stocks('000300.XSHG')

    g.n = 1
    g.m = 5
    g.waitdays = 15
    g.stock_nday_out = {}  # buy n days and out

    # wait for the aa to stabilize.
    #tmp = history(2, unit='1d', field='close', security_list='000300.XSHG', df=True, skip_paused=False, fq='pre')
    #print(tmp)

    multidx = pd.MultiIndex.from_product([context.portfolios, [0, 1]])

    # whether an order has already been sent
    g.fired = False

    g.P = np.matrix('10 0;0 10')
    g.Ppanel = pd.Panel(np.zeros([len(context.portfolios), 2, 2]),
                        items=context.portfolios)
    for value in context.portfolios:
        g.Ppanel[value] = g.P

    g.c = 0.01
    g.lmd = 0.9
    g.daytolive = 5  # max days to hold a position

    g.aa = pd.DataFrame(np.zeros([1, multidx.size]), columns=multidx)
    g.aadataframe = pd.DataFrame(np.zeros([1, multidx.size]), columns=multidx)

    g.slot = 5
    g.long_position = 0
    #g.subcash = context.stock_account.cash/context.slot

    #run_daily(ffb_wlx, 'every_bar')
    #run_daily(stop_retreat,'open')
    #run_daily(stoploss,'open')
    run_daily(exit_fixdays, 'open')
    run_daily(trade, 'open')
Example #7
    def fetch(self,
              tickers,
              fields=None,
              date=None,
              date_from=None,
              date_to=None,
              freq='D',
              only_data=True,
              static=False):
        """Fetch data from TR DWE.

           tickers - ticker or list of tickers
           fields  - list of fields.
           date    - date for a single-date query
           date_from, date_to - date range (used only if "date" is not specified)
           freq    - frequency of data: daily('D'), weekly('W') or monthly('M')
           only_data - if True then metadata will not be returned
           static  - if True "static" request is created (i.e. not a series).
                     In this case 'date_from', 'date_to' and 'freq' are ignored

           NB! in case list of tickers is requested, pandas.Panel is returned.

           Some of available fields:
           P  - adjusted closing price
           PO - opening price
           PH - high price
           PL - low price
           VO - volume, which is expressed in 1000's of shares.
           UP - unadjusted price
           OI - open interest

           MV - market value
           EPS - earnings per share
           DI - dividend index
           MTVB - market to book value
           PTVB - price to book value
           ...

           The full list of data fields is available at http://dtg.tfn.com/.
        """
        if static:
            query = self.construct_request(tickers, fields, date, freq='REP')
        else:
            query = self.construct_request(tickers, fields, date, date_from,
                                           date_to, freq)

        raw = self.request(query)

        if static:
            data, metadata = self.parse_record_static(raw)
        elif isinstance(tickers, basestring) or len(tickers) == 1:
            data, metadata = self.parse_record(raw)
        elif hasattr(tickers, '__len__'):
            metadata = pd.DataFrame()
            data = {}
            for indx in range(len(tickers)):
                dat, meta = self.parse_record(raw, indx)
                data[tickers[indx]] = dat
                metadata = metadata.append(meta, ignore_index=False)

            data = pd.Panel(data).swapaxes('items', 'minor')
        else:
            raise DatastreamException(
                ('First argument should be either ticker or '
                 'list of tickers'))

        if only_data:
            return data
        else:
            return data, metadata
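
A hedged usage sketch: the field mnemonics come from the docstring above, while the client instance name `dwe` and the exact ticker syntax are assumptions:

# a single ticker returns a DataFrame; a list of tickers returns a pandas.Panel
prices = dwe.fetch('U:IBM', fields=['P', 'PO', 'PH', 'PL', 'VO'],
                   date_from='2015-01-01', date_to='2015-12-31', freq='D')
print(prices.head())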
Example #8
#!/usr/bin/env python
"""This is where sub parts of code will be tested"""

import pandas
import gym

env = gym.make("Pong-v0")
obs = env.reset()
pan = pandas.Panel(obs)
df = pan.swapaxes(0, 2).to_frame()
rm = df.rolling(12, 1, center=True).mean()
rst = df.rolling(12, 1, center=True).std()
print(rm)
print(rst)
Example #9
import numpy as np
import pandas as pd

dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
panel = pd.Panel({'one': df, 'two': df - df.mean()})
print(panel)

s = df['A']
print(s[dates[5]])
print(panel['two'])

print(df)
df[['B', 'A']] = df[['A', 'B']]
print(df)

'''
Warning
pandas aligns all AXES when setting Series and DataFrame from .loc and .iloc. This will not modify df
because the column alignment happens before the value assignment.
'''
print(df[['A', 'B']])
df.loc[:, ['B', 'A']] = df[['A', 'B']]
print(df[['A', 'B']])
# The correct approach
df.loc[:, ['B', 'A']] = df[['A', 'B']].values
print(df[['A', 'B']])
Example #10
def SedSim_Main_Simulation(Num_Realizations, T, Input_Data_File, element_stochastic_components, SystemObjects, Element_Dict,
                           Flushing_Group_Dict, Stochastic_Sim, Parameter_Input_Dictionary, simulation_dates_no_leap,
                           Time_Series_Output_Dictionary, Sim_Dur, simulation_dates, Col_Names, Simulation_mode,
                           Output_Object_Dict, var_sub_list, element_export_list, Sampled_Parameter_Dict,
                           Synthetic_Inflow_dataframe_name_LIST, Synthetic_Inflows_dictionary, op_policy_params=None,
                           optimization=0):

    # Inputs:
    # Notes: All inputs are generated automatically in the main top-level pysedsim.py file through various function calls there.

    try:
        stochastic_flow_list = Parameter_Input_Dictionary['1']['Locations']
    except KeyError:
        stochastic_flow_list = []

    distribution_name = {}
    for i in SystemObjects["Ordered Simulation List"]:
        distribution_name[i] = None

    for rz in range(Num_Realizations):
        # Create all objects (instances of classes) to be simulated (reservoirs, reaches, etc.). SystemObjects is fed in and then updated.
        [SystemObjects, Flushing_Group_Dict, element_stochastic_components] = System_Object_Creation(T, Input_Data_File,
                                                                                                     element_stochastic_components,
                                                                                                     SystemObjects,
                                                                                                     Element_Dict,
                                                                                                     Flushing_Group_Dict,
                                                                                                     stochastic_flow_list,
                                                                                                     op_policy_params=op_policy_params)

        # If this is a monte carlo, set relevant values for all objects (e.g., daily inflows at junction for this simulation) that were set
        # in the monte carlo function.
        if Stochastic_Sim == 1:
            for i in SystemObjects["Ordered Simulation List"]:
                # Loop through System Objects. Every time you hit a junction, set the junction incremental flow variable equal to the next
                # Synthetic flow realization column from the Synthetic_Inflows_dictionary.
                try:
                    distribution_name[i] = Sampled_Parameter_Dict[i]['2']  # Need to feed this to junction calibration routine if it exists.
                except KeyError:
                    pass
                if SystemObjects[i].Element_Sub_Dict["Type"] == "Junction":
                    for keys in Synthetic_Inflows_dictionary:
                        if i == keys:
                            # We have a match: system object name is identical to the name of the dataframe for which synthetic flows
                            # exist in Synthetic_Inflows_dictionary. Take the rz-th (e.g., 1 through 100th) member of the list
                            # corresponding to junction name of interest. Use .values so existing variable is not converted to pandas DF.
                            SystemObjects[i].Q_incremental = Synthetic_Inflows_dictionary[keys][
                                Synthetic_Inflow_dataframe_name_LIST[keys][rz]].values
                            SystemObjects[i].Q_incremental = SystemObjects[i].Q_incremental.astype(float)  # Cast as float avoids errors.
                            SystemObjects[i].Q_out_unreg_for_param_calib = SystemObjects[i].Q_incremental  # Must follow above step.
                            break
                    if '5' in Sampled_Parameter_Dict[i].keys():
                        SystemObjects[i].sed_beta = Sampled_Parameter_Dict[i]['5'][rz]
                    if '3' in Sampled_Parameter_Dict[i].keys():
                        # Set both the annual sediment load and cumulative annual sediment load. The user provides the cumulative load,
                        # but it is necessary to temporarily set the annual load before an adjustment takes place below.
                        SystemObjects[i].Annual_SED_LOAD = Sampled_Parameter_Dict[i]['3'][rz]
                        SystemObjects[i].Cum_Annual_SED_LOAD = Sampled_Parameter_Dict[i]['3'][rz]
                elif SystemObjects[i].Element_Sub_Dict["Type"] == "Reservoir":
                    if '7' in Sampled_Parameter_Dict[i].keys():
                        SystemObjects[i].Sed_Trapping_Curve_Spec = Sampled_Parameter_Dict[i]['7'][rz]
                    if '8' in Sampled_Parameter_Dict[i].keys():
                        SystemObjects[i].density_SS = Sampled_Parameter_Dict[i]['8'][rz]
                    if '10' in Sampled_Parameter_Dict[i].keys():
                        SystemObjects[i].E_Sed_Curve_Adjustment(Sampled_Parameter_Dict[i]['10'][rz])
                    if '12' in Sampled_Parameter_Dict[i].keys():
                        if 'Sluicing' in SystemObjects[i].Operating_Policy.keys():
                            SystemObjects[i].stoch_trap_adjust = Sampled_Parameter_Dict[i]['12'][rz]
                    if '13' in Sampled_Parameter_Dict[i].keys():
                        if 'Flushing' in SystemObjects[i].Operating_Policy.keys():
                            SystemObjects[i].Operating_Policy["Flushing"].W_fb_coeff = Sampled_Parameter_Dict[i]['13'][rz]
                elif SystemObjects[i].Element_Sub_Dict["Type"] == "Bypass Structure":
                    if '11' in Sampled_Parameter_Dict[i].keys():
                        SystemObjects[i].Bypass_Fraction = Sampled_Parameter_Dict[i]['11'][rz]

        i = 0  # initialize counter

        # Only execute this portion during the first realization.
        # Check to see that incremental sediment load calibration is required anywhere.
        incremental_calibration = 0  # Default. Will be reset below if applicable.
        for i in SystemObjects["Ordered Simulation List"]:
            if SystemObjects[i].Element_Sub_Dict["Type"] == "Junction":
                if SystemObjects[i].calibration_preference == 1:
                    incremental_calibration = 1
                    break

        # Check to see that channel carrying capacity sediment load calibration is required anywhere.
        carrying_capacity_calibration = 0  # Set default. Will be reset below if applicable.
        for i in SystemObjects["Ordered Simulation List"]:
            if SystemObjects[i].Element_Sub_Dict["Type"] == "Reach":
                if SystemObjects[i].calibration_preference == 1:
                    carrying_capacity_calibration = 1
                    break
                else:
                    carrying_capacity_calibration = 0

        # Loop to store incremental annual sediment loads.
        if incremental_calibration == 1:
            for i in SystemObjects["Ordered Simulation List"]:
                if SystemObjects[i].Cum_Annual_SED_LOAD == 0:
                    # Not a junction, so by definition no incremental load exists. However, cumulative sediment load data need to be stored
                    # for use in channel carrying capacity calibration.
                    for item in SystemObjects[i].Element_Sub_Dict["Inflow Elements"]:
                        # Determine daily element inflows, use it in master caller.
                        SystemObjects[i].Cum_Annual_SED_LOAD += SystemObjects[item].Cum_Annual_SED_LOAD
                else:
                    # Element is a junction. Do nothing, as cumulative sediment data were provided by user for this junction. Still need to
                    # determine the incremental annual sediment load, though.
                    for item in SystemObjects[i].Element_Sub_Dict["Inflow Elements"]:
                        SystemObjects[i].Annual_SED_LOAD -= SystemObjects[item].Cum_Annual_SED_LOAD
                    SystemObjects[i].Calibration_Incremental_Sediment_Load(element_stochastic_components[i], distribution_name[i])
        # Carrying capacity calibration - determination of cumulative daily flow passing each point.
        if carrying_capacity_calibration == 1:
            for i in SystemObjects["Ordered Simulation List"]:
                # Do the reach carrying capacity calibration. First determine cumulative daily element
                for item in SystemObjects[i].Element_Sub_Dict["Inflow Elements"]:
                    SystemObjects[i].Q_out_unreg_for_param_calib += SystemObjects[item].Q_out_unreg_for_param_calib
                if SystemObjects[i].Element_Sub_Dict["Type"] == "Reach":
                    SystemObjects[i].Calibration_Reach_Sediment_Carrying_Capacity()

        # Add dredging inflow elements to reservoirs that are the "Dredging Outflow Element" of any reservoirs being dredged.
        for i in SystemObjects["Ordered Simulation List"]:
            try:
                if SystemObjects[i].Element_Sub_Dict["Dredging Outflow Element"] is not None:
                    # Dredging exists, so add it to destination elements list.
                    SystemObjects[SystemObjects[i].Element_Sub_Dict["Dredging Outflow Element"]].Element_Sub_Dict[
                        "Dredging Inflow Elements"].append(i)
            except KeyError:
                pass  # No dredging exists for reservoir

        # Main PySedSim simulation loop. Loops through objects in order at each time step. Simulation begins at time t = 0, though time t=0
        # values for storage-type variables are loaded during instantiation of classes (reservoirs, reaches, etc. have initial/time zero
        # values loaded there).
        for t in range(0, T):
            for i in SystemObjects["Ordered Simulation List"]:
                # Before simulating next object, locate data that is to be shared among elements, stored in each
                # element's self.elem_xfer_output_dict dictionary. Then store in destination element's input dictionary.
                for j in SystemObjects["Ordered Simulation List"]:
                    if j in SystemObjects["Ordered Simulation List"][0:SystemObjects["Ordered Simulation List"].index(i) + 1]:
                        # If element j has been simulated in time step t already.
                        try:
                            # See if object j has data to transfer to object i. Transfer it if so. Because an element that needs to
                            # transfer data may be downstream and hence not yet simulated, this is checked again later.
                            SystemObjects[i].elem_xfer_input_dict[j] = SystemObjects[j].elem_xfer_output_dict[i]
                        except KeyError:
                            pass
                Flow_in = 0
                Sed_in = 0
                for item in SystemObjects[i].Element_Sub_Dict["Inflow Elements"]:
                    # Determine daily element inflows, use it in master caller.  Locate the outflows from all the upstream elements,
                    # whose names are contained in Element_Dict.
                    Flow_in += SystemObjects[item].Element_Sub_Dict["Daily Water Outflows"][i]
                    Sed_in += SystemObjects[item].Element_Sub_Dict["Daily Sediment Outflows"][i]
                    if SystemObjects[item].Element_Sub_Dict["Type"] == "Bypass Structure":
                        # If "i" is a junction for which "item" is a bypass upstream, set water and sediment inflow rates for
                        # reservoir/bypass
                        SystemObjects[i].Qbypass = SystemObjects[item].Q_bypass[t]
                        SystemObjects[i].SSWbypass = SystemObjects[item].SS_W_Bypass[t]
                        SystemObjects[i].QReservoir = SystemObjects[item].Res_Q_in[t]
                        SystemObjects[i].SedReservoir = SystemObjects[item].Res_SS_W_in[t]
                        SystemObjects[i].VinReservoir = SystemObjects[item].Res_V_in[t]
                if SystemObjects[i].Element_Sub_Dict["Type"] == "Junction":
                    # If junction splits to 2+ downstream elements, if any are reservoirs, send back reservoir water level to junction
                    # for purposes of computing distribution of flow as a function of flow and downstream reservoir water level.
                    for item in SystemObjects[i].Element_Sub_Dict["Outflow Elements"]:
                        try:
                            SystemObjects[i].DS_res_WSE = SystemObjects[item].Element_Sub_Dict["Natural Bypass Water Level"][i]
                        except KeyError:
                            pass
                    Flow_in += SystemObjects[i].Q_incremental[t]  # Add incremental flows only if element is a junction
                    Sed_in += SystemObjects[i].Incremental_Sed_Load_Junction[t]
                if SystemObjects[i].Element_Sub_Dict["Type"] == "Reservoir":
                    # Execute all main routines for time t for reservoir from here.
                    SystemObjects[i].Master_Method_Caller(t, Flow_in, Sed_in, Flushing_Group_Dict)
                    Flushing_Group_Dict = SystemObjects[i].Flushing_Dictionary  # Update Flushing Group Dict [t] in case reservoirs are grouped.
                else:
                    # For all other elements, execute all primary routines of the element for time t from here.
                    SystemObjects[i].Master_Method_Caller(t, Flow_in, Sed_in)

                # Now that element's sediment mass balance has been simulated, if there's any dredged sediment, send it to destination
                # element's BS_W
                try:
                    SystemObjects[SystemObjects[i].Element_Sub_Dict["Dredging Outflow Element"]].BS_W[t + 1] += \
                    SystemObjects[i].Operating_Policy["Dredging"].Sediment_Load_Removed_Daily[t]
                except KeyError:
                    pass  # no dredging exists for reservoir i

            for i in SystemObjects["Ordered Simulation List"]:
                for j in SystemObjects["Ordered Simulation List"]:
                    try:
                        # See if object j has data to transfer to object i. Transfer it if so. Because an element that needs to
                        # transfer data may be downstream and hence not yet simulated, this is checked again later.
                        SystemObjects[i].elem_xfer_input_dict[j] = SystemObjects[j].elem_xfer_output_dict[i]
                        # Before proceeding to next time step, for all reservoirs, re-run energy calculations to account for flow
                        # at downstream junction impacting tailwater, if this reservoir is one that has a downstream junction
                        # with multiple inflow elements.
                        if SystemObjects[i].Element_Sub_Dict["Type"] == "Reservoir":
                            if SystemObjects[i].re_calc_energy == 1:
                                SystemObjects[i].Import_External_Element_State(t)
                                SystemObjects[i].Hydropower_Calculations(t)
                    except KeyError:
                        pass

        # If this is the first simulation, can now initialize an output dictionary of dataframes.
        if rz == 0:
            num_states = {}  # Stores num. state variables that are stored for each system element. Each element type has diff. number.
            state_list = {}  # Stores list of state var names for each object instance. Can only be done upon completion of simulation
            state_list_excel = {}  # Stores list of state var names for each object instance, truncated so name will fit into a worksheet name.
            if element_export_list is None:
                # User did not indicate for which elements to export output, so export all elements.
                element_export_list = SystemObjects["Ordered Simulation List"]
            else:
                # User did indicate for which elements to export output. To avoid misspellings, only export those
                # elements that were spelled correctly (according to names defined in Network Connectivity sheet).
                element_export_list = list(set(element_export_list) & set(SystemObjects["Ordered Simulation List"]))
            for i in element_export_list:
                outlets_list = []
                if var_sub_list is None:
                    # User did not indicate for which elements to export output, so export all time series state
                    # variables that apply to each element.
                    user_export_indicated = 0  # User did not indicate what to export.
                    var_list = SystemObjects[i].__dict__.keys()
                else:
                    # User did indicate for which variables to export output. To avoid misspellings, only store in
                    # Time_Series_Output_Dictionary those variables that were spelled correctly.
                    user_export_indicated = 1  # User did indicate what variables to export.
                    var_list = list(set(SystemObjects[i].__dict__.keys()) & set(var_sub_list))
                    # Account for additional time series variables user may want to export not stored in __dict__,
                    # and add back into var list.
                    try:
                        outlets_list = list(set(SystemObjects[i].Orifices.__dict__.keys()) & set(var_sub_list))
                        var_list += outlets_list
                    except AttributeError:
                        pass  # Element has no orifice attribute.
                num_states[i] = 0  # Initialize before counting how many time series variables exist.
                state_list[i] = []  # Initialize empty list to store state variables for each object
                state_list_excel[i] = []  # Initialize empty list to store state variables for each object
                for item in range(len(var_list)):
                    # Use correct object to export time series (orifices require opening up a reservoir object to
                    # grab underlying time series)
                    if var_list[item] not in outlets_list:
                        object_switcher = SystemObjects[i]
                    else:
                        object_switcher = SystemObjects[i].Orifices

                    if type(getattr(object_switcher, var_list[item])) == np.ndarray:
                        if user_export_indicated == 1:
                            # Assume user has correctly specified a time series style variable's name.
                            num_states[i] += 1
                            state_list[i].append(var_list[item])  # Add variable name to time series state list
                            state_list_excel[i].append(var_list[item][0:30])  # Only keep first 30 characters, as the name will be exported to Excel.
                        else:
                            # PySedSim is internally selecting time series variables for export. Need to make sure
                            # those variables selected are actually time series, and not just numpy arrays.
                            if (len(getattr(object_switcher, var_list[item])) == Sim_Dur or len(
                                    getattr(object_switcher, var_list[item])) == Sim_Dur + 1):
                                num_states[i] += 1
                                state_list[i].append(var_list[item])  # Add variable name to time series state list
                                state_list_excel[i].append(var_list[item][0:30])  # Only keep first 30 characters, as the name will be exported to Excel.
                Time_Series_Output_Dictionary[i] = pd.Panel(np.zeros((num_states[i], Sim_Dur, Num_Realizations)), items=state_list[i],
                                                            major_axis=simulation_dates, minor_axis=Col_Names)

        # Simulation of length Sim_Dur is now complete for Realization_i. Store data in output dict/dataframe.
        # Loop through SystemObjects and states/variables to store for this realization.
        if Simulation_mode == 'debug':
            Output_Object_Dict[rz] = {}  # Initialize sub-dict for each realization, if in debug mode.
        # Loop through each object's attributes, identify time series arrays, and store them in the Time Series Output Dictionary.
        outlets_list = ['Q_downstream', 'Q_overflow', 'Q_diversion', 'Q_low_level_outlet', 'Q_controlled',
                        'Q_turbines', 'Q_mid_level_outlet']
        for element in element_export_list:
            for i in state_list[element]:
                # Use correct object to export time series (orifices require opening up a reservoir object to
                # grab underlying time series)
                if i not in outlets_list:
                    object_switcher = SystemObjects[element]
                else:
                    object_switcher = SystemObjects[element].Orifices

                if len(getattr(object_switcher, i)) == Sim_Dur:
                    # Attribute is an array of length Sim_Dur, so handle it accordingly.
                    Time_Series_Output_Dictionary[element][i][Col_Names[rz]] = getattr(object_switcher, i)[0:Sim_Dur]
                elif len(getattr(object_switcher, i)) == (Sim_Dur + 1):
                    # Attribute is an array of length Sim_Dur + 1, so handle it accordingly.
                    Time_Series_Output_Dictionary[element][i][Col_Names[rz]] = getattr(object_switcher, i)[1:Sim_Dur+1]
                else:
                    pass  # User does not want to store this array in output dictionary.

            # Store SystemObjects output in a dictionary, if in debug mode
            if Simulation_mode == 'debug':
                Output_Object_Dict[rz][element] = SystemObjects[element]

        if optimization == 0:
            # Only print info for each simulation if not operating in coupled simulation-optimization mode
            logging.info("Simulation Realization {0} Complete.".format((rz+1)))

    return state_list_excel, Time_Series_Output_Dictionary, Output_Object_Dict
Example #11
def read_gome2_l3(filepath, lat=None, lon=None, dist_tolerance=56e3):
    """
    Read GOME-2 SIF Level 3 data (whole file or single point).

    Parameters
    ----------
    @TODO: to be completed

    Return
    ------
    panel_sif : pandas.Panel
        A 3D panel structure, the first dimension is the variable name
        (also known as 'item'), the second one is the latitude index, and
        the third one is the longitude.
        Note that 4D and N-dimensional panels have been deprecated. For
        concatenating multiple panel data objects to form a time series,
        please use the `xarray` package.

    df_sif : pandas.DataFrame
        If point extraction

    Raise
    -----
    RuntimeError
        If the file is not found.

    """
    # from filename, parse the date string and the satellite name
    filename = os.path.basename(filepath)
    if 'MOB' in filename:
        satellite = 'MetOp-B'
    else:
        satellite = 'MetOp-A'

    date_str = filename.split('_')[-2][0:8]
    date_str = '-'.join([date_str[0:4], date_str[4:6], date_str[6:8]])

    # read the data and create `pandas.DataFrame` for reformatting
    nc_fid = netCDF4.Dataset(filepath, 'r')

    variable_names = [
        key for key in nc_fid.variables
        if key not in ['latitude', 'longitude']
    ]
    n_lat, n_lon = nc_fid.variables['SIF_740'][:].shape
    latitude = nc_fid.variables['latitude'][:]
    longitude = nc_fid.variables['longitude'][:]

    # cast latitude and longitude as 2D arrays
    grid_lat = np.repeat(np.array([latitude]).T, n_lon, axis=1)
    grid_lon = np.repeat(np.array([longitude]), n_lat, axis=0)

    if lat is None or lon is None:
        # won't do the point extraction and will return the whole dataset
        panel_sif = pd.Panel(items=variable_names,
                             major_axis=np.arange(n_lat),
                             minor_axis=np.arange(n_lon))

        for var in variable_names:
            panel_sif[var] = nc_fid.variables[var][:]

        panel_sif.rename(items={
            'cos(SZA)': 'cos_SZA',
            'Par_normalized_SIF_740': 'PAR_normalized_SIF_740',
            'Par_normalized_SIF_740_std': 'PAR_normalized_SIF_740_std',
            'Counts': 'counts'
        },
                         inplace=True)

        panel_sif['latitude'] = grid_lat
        panel_sif['longitude'] = grid_lon

        # 'counts' should be integers
        panel_sif['counts'] = panel_sif['counts'].astype(np.int64)
        # use proper NaN notation for missing data
        panel_sif.replace(-999., np.nan, inplace=True)
        # store some meta data information
        panel_sif._metadata = {
            'date': date_str,
            'filename': filename,
            'satellite': satellite,
            'level': 3
        }

        return panel_sif
    else:
        df_sif = pd.DataFrame(columns=variable_names)
        great_arc_dist = great_arc(
            [lat, lon],
            np.vstack([grid_lat.flatten(),
                       grid_lon.flatten()]).T)
        min_dist = np.nanmin(great_arc_dist)

        if (min_dist > dist_tolerance) or np.isnan(min_dist):
            # @TODO: add warning information
            # will return a blank dataframe
            df_sif['latitude'] = []
            df_sif['longitude'] = []
        else:
            nearest_point_index = np.nanargmin(great_arc_dist)
            nearest_point_indices = (nearest_point_index // n_lon,
                                     nearest_point_index % n_lon)
            df_sif = df_sif.set_value(0, 'latitude',
                                      grid_lat[nearest_point_indices])
            df_sif = df_sif.set_value(0, 'longitude',
                                      grid_lon[nearest_point_indices])

            for var in variable_names:
                df_sif = df_sif.set_value(
                    0, var, nc_fid.variables[var][:][nearest_point_indices])

            df_sif.rename(columns={
                'cos(SZA)': 'cos_SZA',
                'Par_normalized_SIF_740': 'PAR_normalized_SIF_740',
                'Par_normalized_SIF_740_std': 'PAR_normalized_SIF_740_std',
                'Counts': 'counts'
            },
                          inplace=True)

        # 'counts' should be integers
        df_sif['counts'] = df_sif['counts'].astype(np.int64)
        # use proper NaN notation for missing data
        df_sif.replace(-999., np.nan, inplace=True)
        # store some meta data information
        df_sif._metadata = {
            'date': date_str,
            'filename': filename,
            'satellite': satellite,
            'level': 3
        }

        return df_sif
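
A hedged example of consuming the returned panel (the file name is a placeholder):

panel_sif = read_gome2_l3('GOME2A_SIF_L3_20130701.nc')   # placeholder file name
sif_740 = panel_sif['SIF_740']    # 2-D DataFrame: rows = latitude index, columns = longitude index
print(sif_740.shape)
print(panel_sif._metadata['date'])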
Example #12
 def set_datasource(self, panel_hisdat, dict_fenshi, panel_fiveminhisdat):
     self.panel_hisdat = panel_hisdat
     if 0: self.panel_hisdat = pd.Panel()   # dead branch, kept only as a type hint for IDEs
     self.dict_fenshi = dict_fenshi
     if 0: self.dict_fenshi = dict()        # dead branch, kept only as a type hint for IDEs
     self.panel_fiveminhisdat = panel_fiveminhisdat
Example #13
def doload(patient, incl_test, downsample):
    dir = 'clips/' + patient + '/'
    dict = {}

    #load files in numerical order
    files = os.listdir(dir)
    files2 = []

    for i in range(len(files)):
        qp = files[i].rfind('_') + 1
        files2.append(files[i][0:qp] +
                      (10 - len(files[i][files[i].rfind('_') + 1:])) * '0' +
                      files[i][qp:])

    print('Reading from', dir, len(files), len(files2))
    t = {key: value for key, value in zip(files2, files)}
    files2 = [f for f in t.keys()]
    files2.sort()
    f = [t[i] for i in files2]

    j = 0
    for i in f:

        if not 'test' in i or incl_test:
            seg = i[i.rfind('_') + 1:i.find('.mat')]
            segtype = i[i[0:i.find('_segment')].rfind('_') +
                        1:i.find('_segment')]
            d = scipy.io.loadmat(dir + i)
            if j == 0:
                #print(d['channels'][0,0])
                cols = list(
                    range(len(d['channels'][0, 0]))
                )  #list(d['channels'][0,0])#list(range(len(d['channels'][0,0])))
                cols = cols + ['time']

            if 'inter' in i or 'test' in i:
                l = -3600.0  #np.nan
            else:
                #print i
                l = d['latency'][0]

            df = pd.DataFrame(np.append(
                d['data'].T,
                l + np.array([range(len(d['data'][1]))]).T / d['freq'][0], 1),
                              index=range(len(d['data'][1])),
                              columns=cols)

            if downsample:
                if np.round(d['freq'][0]) == 5000:
                    df = df.groupby(lambda x: int(np.floor(x / 20.0))).mean()
                if np.round(d['freq'][0]) == 500:
                    df = df.groupby(lambda x: int(np.floor(x / 2.0))).mean()
                if np.round(d['freq'][0]) == 400:
                    df = df.groupby(lambda x: int(np.floor(x / 2.0))).mean()

                df['time'] = df['time'] - (df['time'][0] - np.floor(
                    df['time'][0])) * (df['time'][0] > 0)

            dict.update({segtype + '_' + seg: df})

            j = j + 1

    data = pd.Panel(dict)

    return data
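
A hedged call, assuming the clips/<patient>/ folder layout the function expects (the patient name is a placeholder):

data = doload('Dog_1', incl_test=False, downsample=True)   # 'Dog_1' is a placeholder sub-folder of clips/
print(data.items)   # one panel item per segment, e.g. 'ictal_1', 'interictal_1', ...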
Example #14
def AL(data, labels, test_size, n_labeled, num_experiments):
    experiments = {}
    for i in range(num_experiments):
        result = active_learning(data, labels, test_size, n_labeled)
        experiments["Experiment" + str(i)] = result
    return pd.Panel(experiments)
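
A minimal usage sketch, assuming active_learning is defined elsewhere and returns one DataFrame per experiment (X and y are hypothetical inputs):

results = AL(X, y, test_size=0.3, n_labeled=10, num_experiments=5)
print(results['Experiment0'])   # result of the first experiment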
Example #15
def get_data_cube(symbol,
                  field,
                  start,
                  end=None,
                  freq='1d',
                  style='sat',
                  adj=None,
                  **kwargs):
    '''
        :param symbol: [list or str] Supports stocks, funds, futures (including continuous, dominant and
                       sub-dominant contracts), options, indexes and stock funds.
        :param field: [list or str] Supports market data, 424-factor data and crowd-sourced factors.
        :param start: [str or datetime] Start date of the returned data; accepts strings such as "2017-02-02"
                      as well as Python date/datetime objects.
        :param end: [str or datetime] Defaults to today; end date of the returned data, same accepted formats as start.
        :param freq: [str] Defaults to 'd'. Supports daily bars (1d) and minute bars at several frequencies (1m, 5m, 15m, 30m, 60m).
        :param adj: [str] Price adjustment; defaults to None, 'pre' means forward-adjusted prices.
        :param style: [str] Layout of the returned data, default 'sat'. Supports ast/sat/tas; minute bars do not support 'tas'.
                      Here 'a' stands for attribute, 's' for symbol and 't' for time.
                      For example, 'ast' means the returned Panel is keyed by attribute, and each value is a
                      DataFrame whose columns are symbols and whose rows are timestamps.

    '''
    entry_time = time.time()
    uid = str(uuid.uuid1())
    # permission check
    if not (is_pro_user() or is_enterprise_user()):
        print(u"该API仅限优矿专业版、企业版使用,请联系优矿客服 [email protected],谢谢!")
        return

    # format symbol
    symbol = __format_str_to_arr(symbol, "symbol")
    if not symbol:
        return

    # format field
    field = __format_str_to_arr(field, "field")
    if not field:
        return

    # validate freq
    if isinstance(freq, basestring):
        if freq not in ['1d', '1m', 'd', 'm', '5m', '15m', '30m', '60m']:
            print(u"目前只支持'1d','1m','5m', '15m', '30m', '60m'六个频率类型,暂不支持:%s。" %
                  freq)
            return
    else:
        print(u"参数freq的值支持str类型,请校验后重新输入。")

    # validate adj
    if adj is not None and adj != 'pre':
        print(u"目前adj参数只支持'pre',表示行情前复权,无法识别传入的adj参数值,默认返回未复权行情数据。")

    # parse date
    start_date = uniform_date(start)
    if start_date is None:
        return

    stocks = AssetService.findout_stocks(symbol)
    funds = AssetService.findout_funds(symbol)
    index = AssetService.findout_indexes(symbol)
    futures = AssetService.findout_futures(symbol)

    # minute-bar restrictions
    if freq in ['1m', '5m', '15m', '30m', '60m', 'm']:
        if (stocks or funds
                or index) and start_date < datetime.datetime(2009, 1, 1):
            print(
                u"使用分钟线:目前股票、基金、指数支持的最早start为2009-01-01,期货支持的最早start为2010-01-01,自动修改为2009-01-01。"
            )
            start_date = datetime.datetime(2009, 1, 1)
        elif start_date < datetime.datetime(2010, 1, 1):
            print(u"使用分钟线:期货支持的最早start为2010-01-01,自动修改为2010-01-01。")
            start_date = datetime.datetime(2010, 1, 1)

    # validate field
    mkt_daily_fields = set(field) & MKT_DAILY_FIELDS
    mkt_minute_fields = (set(field) & MKT_INTRADAY_FIELDS)
    mkt_fields = mkt_daily_fields | mkt_minute_fields
    if set(field) - set(mkt_fields):
        factors = translate_factors(list(set(field) - set(mkt_fields)),
                                    start_date.strftime('%Y%m%d'), **kwargs)
    else:
        factors = []
    fs_fields = list(set(field) & set(FS_FIELD))
    rest = set(field) - set(mkt_fields) - set(factors) - set(fs_fields)
    if rest:
        print(u"传入的参数fields中有 %s无法识别,请校验后重试。" % list(rest))

    lastest_stock_tdate = TODAY
    lastest_future_tdate = TODAY

    # parse end date
    # Used to decide whether today's quotes have arrived yet; if not, today's row is omitted.
    if end is None:
        data_due_date = os.getenv('data_due_date')
        if data_due_date:
            end_date = uniform_date(data_due_date)
        else:
            end_date = uniform_date(TODAY)
    else:
        end_date = uniform_date(end)
    if end_date is None:
        return

    if end_date >= uniform_date(TODAY) and get_trading_days(
            TODAY_STR, TODAY_STR):

        if stocks or funds or index:
            if freq in ['d', '1d']:
                lastest_stock_tdate = get_updated_date('daily_adj')
            elif freq in ['1m', '5m', '15m', '30m', '60m', 'm']:
                lastest_stock_tdate = get_updated_date('min_data')
        if futures:
            lastest_future_tdate = get_updated_date('future')
        if min([
                lastest_stock_tdate.strftime('%Y%m%d'),
                lastest_future_tdate.strftime('%Y%m%d')
        ]) < TODAY_STR:
            print(u"今日行情还没有入库,默认获取截止到上一个交易日的数据。")
            end_date = uniform_date(YESTERDAY)

    if end_date < start_date:
        print(u"传入的start晚于end时间,请修正后重试。")
        return
    trading_days = [
        datetime.datetime.strptime(x, '%Y%m%d') for x in get_td_from_cache(
            start_date.strftime('%Y%m%d'), end_date.strftime('%Y%m%d'))
    ]
    if len(trading_days) == 0:
        print(u"传入的日期区间 start:%s, end:%s 不包含交易日,无数据返回" % (start, end))
        return
    start_date_str = start_date.strftime('%Y%m%d')
    end_date_str = end_date.strftime('%Y%m%d')

    # validate symbol
    stock_fund_tdates_dict = AssetService.get_stock_fund_tdates(stocks + funds)
    filtered_stock_fund = []
    # Drop stocks and funds that have no trading dates inside the requested window
    for univ in stocks + funds:
        if univ in stock_fund_tdates_dict:
            for one_tuple in stock_fund_tdates_dict[univ]:
                if str(one_tuple[0]) <= end_date_str and \
                        (one_tuple[1] is None or str(one_tuple[1]) > start_date_str):
                    filtered_stock_fund.append(univ)
                    break
    # For stocks and funds, a missing record means the instrument simply could not be traded
    #         else:
    #             filtered_stock_fund.append(univ)
    if len(set(stocks) - set(filtered_stock_fund)) > 0:
        print(u"传入的股票列表中:%s在给定的时间段内没有交易,数据不会出现在结果中。" %
              list(set(stocks) - set(filtered_stock_fund)))
    if len(set(funds) - set(filtered_stock_fund)) > 0:
        print(u"传入的基金列表中:%s在给定的时间段内没有交易,数据不会出现在结果中。" %
              list(set(funds) - set(filtered_stock_fund)))

    filtered_futures = []
    if futures:
        future_tdates_dict = AssetService.get_future_tdates(futures)
        # Drop futures that have no trading dates inside the requested window

        for future in futures:
            if future in future_tdates_dict:
                if str(future_tdates_dict[future][0]) < end_date_str and \
                        str(future_tdates_dict[future][1]) > start_date_str:
                    filtered_futures.append(future)
            else:
                # e.g. continuous contracts cannot be looked up yet
                filtered_futures.append(future)

        if len(set(futures) - set(filtered_futures)) > 0:
            print("传入的期货列表中:%s在给定的时间段内没有交易,数据不会出现在结果中。" %
                  list(set(futures) - set(filtered_futures)))

    # Drop invalid symbols
    invalid_symbol = list(
        set(symbol) - (set(stocks) | set(futures) | set(index) | set(funds)))
    if invalid_symbol:
        print("无法识别传入的symbol参数中的%s,请检查后重试。" % invalid_symbol)

    # Assemble the final list of valid symbols
    symbol = list(
        set(filtered_stock_fund) | set(filtered_futures) | set(index))
    if len(symbol) == 0:
        return pd.Panel()

    HelloUser({
        "symbol": symbol,
        'field': field,
        'start': start_date.strftime('%Y%m%d'),
        'end': end_date.strftime('%Y%m%d'),
        'freq': freq,
        'style': style,
        'adj': adj,
        'uid': uid
    })
    if freq in ['d', '1d']:

        market_service = MarketService.create_with_service(
            symbol,
            mkt_daily_fields,
            mkt_minute_fields,
            factors,
            fs_fields,
            adj=True if adj == 'pre' else False,
            **kwargs)
        market_service.batch_load_daily_data(trading_days)
        data = market_service.slice(symbols=symbol,
                                    fields=field,
                                    end_date=end_date,
                                    freq='d',
                                    start_date=start_date,
                                    style=style,
                                    rtype='frame',
                                    time_range=None)
        HelloUser({
            "symbol": symbol,
            'field': field,
            'start': start_date.strftime('%Y%m%d'),
            'end': end_date.strftime('%Y%m%d'),
            'freq': freq,
            'style': style,
            'adj': adj,
            'uid': uid,
            'cost': time.time() - entry_time
        })
        return pd.Panel(data).replace([None], np.nan)
    if freq in ['1m', '5m', '15m', '30m', '60m', 'm']:
        if 'IFZ0' in symbol:
            print("IFZ0暂不支持分钟线数据。")
            symbol.remove('IFZ0')

        mkt_daily_fields = list(set(mkt_daily_fields) - set(mkt_minute_fields))

        if mkt_daily_fields:
            print("传入字段:%s为日线字段,暂时没有分钟相关的数据。" % mkt_daily_fields)
        market_service = MarketService.create_with_service(
            symbol, [],
            mkt_minute_fields,
            factors,
            fs_fields,
            adj=True if adj == 'pre' else False,
            **kwargs)
        market_service.batch_load_daily_data(trading_days)
        market_service.rolling_load_minute_data(trading_days, None, freq)
        tick_roller = TickRoller(market_service)
        data_min = tick_roller.slice(prepare_dates=trading_days,
                                     end_time=MAX_END_DATE,
                                     time_range=MAX_MINUTE_LENGTH,
                                     fields=field,
                                     symbols=symbol,
                                     style=style,
                                     rtype='frame')
        HelloUser({
            "symbol": symbol,
            'field': field,
            'start': start_date.strftime('%Y%m%d'),
            'end': end_date.strftime('%Y%m%d'),
            'freq': freq,
            'style': style,
            'adj': adj,
            'uid': uid,
            'cost': time.time() - entry_time
        })

        return pd.Panel(data_min).replace([None], np.nan)
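
A hedged call sketch; the symbols appear elsewhere in this collection, but the field names and the exact return layout are assumptions based on the docstring's description of style='ast':

cube = get_data_cube(['000001.XSHE', '000300.XSHG'],          # placeholder symbols
                     ['closePrice', 'turnoverVol'],           # placeholder fields
                     start='2017-01-03', end='2017-02-03',
                     freq='1d', style='ast')
print(cube['closePrice'])   # with 'ast': one item per attribute, rows = time, columns = symbol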


Example #16

plt.scatter(x_1, y)
A = 0.281661 + 0.282083 * x_1 + -0.432326 * 5.142249 # i.e: alpha_mean + beta_1_mean * x_1 + beta_2_mean * mean(x_2)
zx = np.exp(A)/(1+np.exp(A)) # Inverse logit of A
plt.scatter(x_1, zx, c = 'red') # draws a curve based on prediction from logistic regression model

plt.scatter(x_2, y)
B = 0.281661 + 0.282083 * 4.98559 + -0.432326 * x_2 # i.e: alpha_mean + beta_1_mean * mean(x_1) + beta_2_mean * x_2
# Values were obtained from the pandas print-out shown below
zy = np.exp(B)/(1+np.exp(B)) # Inverse logit of B
plt.scatter(x_2, zy, c = 'red')

# Use pandas three dimensional Panel to represent the trace:
trace = pd.Panel({k: v.squeeze(0) for k, v in samples.items()})
trace.axes[0].name = 'Variable'
trace.axes[1].name = 'Iteration'
trace.axes[2].name = 'Chain'
 
# Point estimates:
print("Mean:")
print(trace.to_frame().mean())
 

# Bayesian equal-tailed 95% credible intervals:
print("Credible Intervals:")
print(trace.to_frame().quantile([0.05, 0.95]))
 

def plot(trace, var):
Example #17
 def as_panel(self):
     return pd.Panel(self.response_map)
Example #18
def backtest_olmar(STOCKS):
    df_dict = OrderedDict()
    try:
        print("Using Yahoo Finance API......")
        for ticker in STOCKS:
            ticker_df = yf.Ticker(ticker)
            df = ticker_df.history(period="5y")
            df.columns = df.columns.str.strip().str.lower().str.replace(
                ' ', '_')
            df_dict[ticker] = df
        data = pd.Panel(df_dict)
    except:
        print("Error in implimenting Yahoo Finance API")
    try:
        print("*****Using Local Data File******")
        data_dir = "/home/gontse/zipline_dash/data/price_data.csv"
        df_prices = pd.read_csv(data_dir)
        df_prices = df_prices.set_index(['ticker', 'date'])
        for ticker in STOCKS:
            df = df_prices.xs(ticker, level="ticker")
            df.index = pd.to_datetime(df.index, utc=True)
            #df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
            df_dict[ticker] = df
        data = pd.Panel(df_dict)
    except:
        print("Could not retrieve local data file")

    def initialize(algo, eps=1, window_length=5):
        algo.stocks = STOCKS
        algo.tickers = [symbol(ticker) for ticker in algo.stocks]
        algo.m = len(algo.stocks)
        algo.price = {}
        algo.b_t = np.ones(algo.m) / algo.m
        algo.eps = eps
        algo.window_length = window_length

        algo.set_commission(commission.PerShare(cost=0))
        algo.set_slippage(slippage.FixedSlippage(spread=0))

    def handle_data(algo, data):
        m = algo.m
        x_tilde = np.zeros(m)
        b = np.zeros(m)

        #moving average price for each asset
        mavgs = data.history(algo.tickers, 'price', algo.window_length,
                             '1d').mean()
        for i, ticker in enumerate(algo.tickers):
            price = data.current(symbol(ticker.symbol), "price")
            x_tilde[i] = mavgs[ticker] / price

        #olmar algo 2
        x_bar = x_tilde.mean()
        #market relative deviation
        mar_rel_dev = x_tilde - x_bar
        #expected return with current portfolio
        exp_return = np.dot(algo.b_t, x_tilde)
        weight = algo.eps - exp_return
        variability = (np.linalg.norm(mar_rel_dev))**2

        if variability == 0.0:
            step_size = 0
        else:
            step_size = max(0, weight / variability)

        b = algo.b_t + step_size * mar_rel_dev
        b_norm = simplex_projection(b)
        np.testing.assert_almost_equal(b_norm.sum(), 1)

        rebalance_portfolio(algo, data, b_norm)
        algo.b_t = b_norm

    def rebalance_portfolio(algo, data, desired_port):
        #rebalance portfolio
        for i, ticker in enumerate(algo.tickers):
            algo.order_target_percent(symbol(ticker.symbol), desired_port[i])

    def simplex_projection(v, b=1):
        v = np.asarray(v)
        p = len(v)
        v = (v > 0) * v
        u = np.sort(v)[::-1]
        sv = np.cumsum(u)

        rho = np.where(u > (sv - b) / np.arange(1, p + 1))[0][-1]
        theta = np.max([0, (sv[rho] - b) / (rho + 1)])
        w = (v - theta)
        w[w < 0] = 0
        return w

    portfolio = zipline.run_algorithm(data=data,
                                      start=datetime(2016, 1, 15, 0, 0, 0, 0,
                                                     pytz.utc),
                                      end=datetime(2017, 1, 2, 0, 0, 0, 0,
                                                   pytz.utc),
                                      initialize=initialize,
                                      capital_base=100000.0,
                                      handle_data=handle_data)
    return portfolio
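
# A small self-contained check of the simplex projection used above
# (hypothetical numbers; the helper is re-declared here because
# simplex_projection is local to backtest_olmar): the projected weights should
# be non-negative and sum to 1, which is what the rebalancing step relies on.
import numpy as np

def _simplex_projection_demo(v, b=1):
    v = np.asarray(v, dtype=float)
    v = (v > 0) * v                      # clip negatives, as above
    u = np.sort(v)[::-1]
    sv = np.cumsum(u)
    rho = np.where(u > (sv - b) / np.arange(1, len(v) + 1))[0][-1]
    theta = max(0.0, (sv[rho] - b) / (rho + 1))
    return np.maximum(v - theta, 0)

w_demo = _simplex_projection_demo([0.6, 0.3, 0.3, -0.1])
print(w_demo, w_demo.sum())              # ~[0.53, 0.23, 0.23, 0.0], sums to 1.0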
Ejemplo n.º 19
0
dv_props = {
    'start_date': start,
    'end_date': end,
    'symbol': ','.join(stock_symbol),
    'fields': check_factor,
    'freq': 1,
    "prepare_fields": True
}

dv.init_from_config(dv_props, data_api=ds)
dv.prepare_data()

# Read all dayu factors and store them in FactorList
df = pd.read_excel('Factors_Data.xlsx', sheetname=None)
df = pd.Panel(df)
FactorList = set({})
for i in range(df.loc[:, :, 'name'].T.values.shape[0]):
    FactorList |= set(df.loc[:, :, 'name'].T.values[i])
FactorList = list(FactorList)

i = 0
for name in FactorList:
    dv.add_field(name)
    i = i + 1
    print('Field import progress: %s' % (i / len(FactorList)))
    print('Overall progress: %s' % (i / (3 * len(FactorList))))

alpha_signal = list(set(dv.fields) & set(FactorList))
dv.add_field('sw1')
Ejemplo n.º 20
0
def pd_3darray_to_csv(input_array, filename_output):
    stacked = pd.Panel(input_array.swapaxes(
        1, 2)).to_frame().stack().reset_index()
    stacked.columns = ['x', 'y', 'z', 'value']
    stacked.to_csv(filename_output, index=False)
    print("Generated CSV for filter weights")
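
# A Panel-free variant (a sketch added here; the x/y/z column naming is only
# illustrative and may not match the Panel-based ordering above exactly):
# enumerate every cell's indices with numpy and write the same long format.
import numpy as np
import pandas as pd

def pd_3darray_to_csv_no_panel(input_array, filename_output):
    arr = np.asarray(input_array)
    i, j, k = np.meshgrid(*[np.arange(s) for s in arr.shape], indexing='ij')
    stacked = pd.DataFrame({'x': i.ravel(), 'y': j.ravel(),
                            'z': k.ravel(), 'value': arr.ravel()})
    stacked.to_csv(filename_output, index=False)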
Ejemplo n.º 21
0
                if algorithm == 'ips':
                    for sh in sheets:
                        Matrix_results = pd.read_excel(Matrix_results_name,
                                                       sheet_name=sh)
                        df_rolled = Matrix_results.iloc[:180, :]
                        df_rolled = np.roll(
                            df_rolled, -2 * ref_angle,
                            0)  # roll to 0 if the preferred angle is 45
                        df_rolled = pd.DataFrame(df_rolled)
                        df_rolled[df_rolled < 0] = 0
                        dfs_ips[SUBJECT_USE_ANALYSIS + '_' + sh] = df_rolled

        #####
        #####
        panel_v = pd.Panel(dfs_visual)
        df_visual = panel_v.mean(axis=0)
        df_visual.columns = [
            float(df_visual.columns[i]) * 2
            for i in range(0, len(df_visual.columns))
        ]

        panel_i = pd.Panel(dfs_ips)
        df_ips = panel_i.mean(axis=0)
        df_ips.columns = [
            float(df_ips.columns[i]) * 2
            for i in range(0, len(df_ips.columns))
        ]

        df_heatmaps = {}
        df_heatmaps['ips'] = df_ips
Ejemplo n.º 22
0
import pandas as pd
import numpy as np
#putting both files into dataframes
df1 = pd.read_excel('employee-number-1.xlsx', 'Sheet1', na_values=['NA'])
df2 = pd.read_excel('employee-number-2.xlsx', 'Sheet1', na_values=['NA'])
#order by employee number
df1 = df1.sort_values(by="employee number")
df1 = df1.reindex()
df2 = df2.sort_values(by="employee number")
df2 = df2.reindex()


#Diff function to show what changes are
def report_diff(x):
    return x[0] if x[0] == x[1] else '{} ---> {}'.format(*x)


#merge two datasets into a Panel
diff_panel = pd.Panel(dict(df1=df1, df2=df2))
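
#a sketch of the usual next step (not necessarily what the original script
#did): apply report_diff across the panel's items axis so every cell shows
#"old ---> new" where the two files differ; the output filename is illustrative
diff_output = diff_panel.apply(report_diff, axis=0)
diff_output.to_excel('employee-diff.xlsx')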
Ejemplo n.º 23
0
def format_data():

    if not os.path.isfile('zipline_panel.pickle'):

        if not os.path.isfile('FTSE100_tickers.pickle'):

            tickers = get_data.save_FTSE100_tickers()
            data_dict = get_data.get_yahoo_data()

        else:

            with open('FTSE100_tickers.pickle', 'rb') as handle:
                tickers = pickle.load(handle)

            with open('FTSE100_data_dict.pickle', 'rb') as handle:
                data_dict = pickle.load(handle)

        data = OrderedDict()

        for ticker in tickers:
            data_dict[ticker] = data_dict[ticker][[
                'Open', 'High', 'Low', 'Close', 'Volume'
            ]]

            counter = 1

            while counter != len(data_dict[ticker].index):

                if data_dict[ticker].index[counter] != data_dict[ticker].index[
                        counter - 1] + timedelta(days=1) or data_dict[
                            ticker].at[data_dict[ticker].index[counter],
                                       'Volume'] == 0.0:

                    new_row = [
                        data_dict[ticker].at[data_dict[ticker].index[counter -
                                                                     1],
                                             'Close'],
                        data_dict[ticker].at[data_dict[ticker].index[counter -
                                                                     1],
                                             'Close'],
                        data_dict[ticker].at[data_dict[ticker].index[counter -
                                                                     1],
                                             'Close'],
                        data_dict[ticker].at[data_dict[ticker].index[counter -
                                                                     1],
                                             'Close'], 0
                    ]

                    data_dict[ticker].loc[data_dict[ticker].index[counter - 1]
                                          + timedelta(days=1)] = new_row

                    data_dict[ticker].sort_index(inplace=True)

                counter += 1

            data[ticker] = data_dict[ticker]

        with open('zipline_panel.pickle', 'wb') as handle:
            pickle.dump(data, handle)

    else:

        if not os.path.isfile('FTSE100_tickers.pickle'):
            tickers = get_data.save_FTSE100_tickers()
        else:
            with open('FTSE100_tickers.pickle', 'rb') as handle:
                tickers = pickle.load(handle)

        with open('zipline_panel.pickle', 'rb') as handle:
            data = pickle.load(handle)

    panel = pd.Panel(data)
    panel.minor_axis = ['open', 'high', 'low', 'close', 'volume']
    panel.major_axis = panel.major_axis.tz_localize(pytz.utc)

    print('Data formatted for zipline...')

    return tickers, panel
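
# A rough, loop-free sketch of the same gap-filling idea (not an exact
# equivalent of the while-loop above; fill_calendar_gaps is a name introduced
# here): reindex each frame to a full daily calendar, fill the OHLC of the
# inserted days with the previous close, and set their volume to 0.
def fill_calendar_gaps(df):
    full_index = pd.date_range(df.index.min(), df.index.max(), freq='D')
    out = df.reindex(full_index)
    inserted = out['Close'].isna()
    prev_close = out['Close'].ffill()
    for col in ('Open', 'High', 'Low', 'Close'):
        out.loc[inserted, col] = prev_close[inserted]
    out.loc[inserted, 'Volume'] = 0
    return out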
Ejemplo n.º 24
0
# coding=utf-8
import numpy as np
import pandas as pd
from numpy import log
import os

path = "../Data"
files = os.listdir(path)
store = pd.Panel()


# minimum of x over the past d days
def ts_min(df, window=10):
    return df.rolling(window).min()


# maximum of x over the past d days
def ts_max(df, window=10):
    return df.rolling(window).max()


# latest value of x minus the value of x d days ago
def delta(df, period=1):
    return df.diff(period)


# cross-sectional ascending rank of x for each stock, normalized to the closed interval [0, 1]
def rank(df):
    return df.rank(axis=1, pct=True)
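
# A tiny usage sketch of the helpers above (made-up prices for two tickers):
# the rolling helpers work column by column, while rank() ranks across a row.
demo = pd.DataFrame({'AAA': [1.0, 2.0, 3.0, 4.0, 5.0],
                     'BBB': [5.0, 4.0, 3.0, 2.0, 1.0]})
print(ts_min(demo, window=3))   # 3-day rolling minimum per column
print(delta(demo, period=1))    # day-over-day change
print(rank(demo))               # cross-sectional percentile rank per row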

Ejemplo n.º 25
0
def rolling_cov_pairwise(df, *args, **kwargs):
    d = {}
    for c in df.columns:
        d[c] = pd.rolling_cov(df[c], df, *args, **kwargs)
    p = pd.Panel(d)
    return p.transpose(1, 0, 2)
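
# pd.rolling_cov was removed in later pandas versions; a rough modern
# equivalent (a sketch, not part of the original snippet) uses the rolling
# accessor with pairwise=True, which stacks the per-window covariance matrices
# into a MultiIndex DataFrame instead of a Panel.
def rolling_cov_pairwise_modern(df, window):
    return df.rolling(window).cov(pairwise=True)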
Ejemplo n.º 26
0
            V = day_data['V'].resample('1min').sum().dropna()
            # print V.shape
            V_dist = V / np.sum(V)
            MI = day_data['dMI'].resample('1min').sum().dropna()
            prc_ind = day_data['P'].resample('1min').apply(prcInd).dropna()
            his_df = pd.DataFrame({
                'V': V,
                'V_dist': V_dist,
                'MI': MI,
                'prc_ind': prc_ind
            })
            his_df.index = his_df.index.time
            his_dict[date] = his_df

        panel = pd.Panel(his_dict)
        his_dist = panel.minor_xs('V_dist').apply(np.nanmean, axis=1).values
        if len(his_dist) == 237:
            his_dist = np.insert(his_dist, 120, 0)

        panel_v = panel.values[:, :, [0, 1, 3]]  #day*minute*[MI,V,prc_ind]
        min_v = np.zeros(panel_v.shape[2])
        len_v = np.zeros(panel_v.shape[2])
        for i in range(panel_v.shape[2]):
            min_v[i] = np.nanmin(panel_v[:, :, i])
            len_v[i] = np.nanmax(panel_v[:, :, i]) - np.nanmin(panel_v[:, :,
                                                                       i])
            panel_v[:, :, i] = (panel_v[:, :, i] - min_v[i]) / len_v[i]

        x = np.apply_along_axis(lambda x: 0.7 * x[0] + 0.1 * x[1] + 0.2 * x[2],
                                2, panel_v)
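        # Equivalent to the apply_along_axis call above (a sketch; x_vec is a
        # name introduced here): a matrix product over the feature axis gives
        # the same 0.7/0.1/0.2 weighted combination for every day and minute.
        x_vec = panel_v @ np.array([0.7, 0.1, 0.2])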
Ejemplo n.º 27
0
    def get_global_panel(self, start, end, period=300, features=('close', )):
        """
        :param start/end: linux timestamp in seconds
        :param period: time interval of each data access point
        :param features: tuple or list of the feature names
        :return a panel, [feature, coin, time]
        """
        start = int(start - (start % period))
        end = int(end - (end % period))
        coins = self.select_coins(start=end - self.__volume_forward -
                                  self.__volume_average_days * DAY,
                                  end=end - self.__volume_forward)
        self.__coins = coins
        for coin in coins:
            self.update_data(start, end, coin)

        if len(coins) != self._coin_number:
            raise ValueError("the number of selected coins %d does not match "
                             "the requested coin number %d" %
                             (len(coins), self._coin_number))

        logging.info("feature type list is %s" % str(features))
        self.__checkperiod(period)

        time_index = pd.to_datetime(list(range(start, end + 1, period)),
                                    unit='s')
        panel = pd.Panel(items=features,
                         major_axis=coins,
                         minor_axis=time_index,
                         dtype=np.float32)

        connection = sqlite3.connect(DATABASE_DIR)
        try:
            for row_number, coin in enumerate(coins):
                for feature in features:
                    # NOTE: transform the start date to end date
                    if feature == "close":
                        sql = (
                            "SELECT date+300 AS date_norm, close FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and date_norm%{period}=0 and coin=\"{coin}\"".
                            format(start=start,
                                   end=end,
                                   period=period,
                                   coin=coin))
                    elif feature == "open":
                        sql = (
                            "SELECT date+{period} AS date_norm, open FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and date_norm%{period}=0 and coin=\"{coin}\"".
                            format(start=start,
                                   end=end,
                                   period=period,
                                   coin=coin))
                    elif feature == "volume":
                        sql = (
                            "SELECT date_norm, SUM(volume)" +
                            " FROM (SELECT date+{period}-(date%{period}) "
                            "AS date_norm, volume, coin FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))
                    elif feature == "high":
                        sql = (
                            "SELECT date_norm, MAX(high)" +
                            " FROM (SELECT date+{period}-(date%{period})"
                            " AS date_norm, high, coin FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))
                    elif feature == "low":
                        sql = (
                            "SELECT date_norm, MIN(low)" +
                            " FROM (SELECT date+{period}-(date%{period})"
                            " AS date_norm, low, coin FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))
                    else:
                        msg = ("The feature %s is not supported" % feature)
                        logging.error(msg)
                        raise ValueError(msg)
                    serial_data = pd.read_sql_query(sql,
                                                    con=connection,
                                                    parse_dates=["date_norm"],
                                                    index_col="date_norm")
                    panel.loc[feature, coin,
                              serial_data.index] = serial_data.squeeze()
                    panel = panel_fillna(panel, "both")
        finally:
            connection.commit()
            connection.close()
        return panel
    obj = cal_obj(signal, name, period, quantile)
    plt.show()


def signal_data(signal, name, period=5, quantile=5):
    obj = cal_obj(signal, name, period, quantile)
    return obj.signal_data


signals_dict = {
    a: signal_data(neutralize_dict[a], a, 20)
    for a in alpha_signal
}

ic_pn = pd.Panel(
    {a: analysis.ic_stats(signals_dict[a])
     for a in signals_dict.keys()})

alpha_performance = round(ic_pn.minor_xs('return_ic'), 2)
print(alpha_performance)

alpha_IR = alpha_performance.loc["Ann. IR"]
alpha_IC = alpha_performance.loc["IC Mean"]

good_alpha = alpha_IC[(alpha_IC >= 0.03) & (alpha_IR >= 0.25)]

good_alpha_dict = {g: float('%.2f' % good_alpha[g]) for g in good_alpha.index}

good_alpha_dict

# inspect factor characteristics by industry (optimal period)
					# run it
					all_data.update( get_metrics( base_path, variable, model, scenario, begin, end, mask, domain_name, ncpus ) )

		# write it out to disk
		if not os.path.exists( output_path ):
			os.makedirs( output_path )
		
		prefix = '_'.join([ variable, project, 'decadal', 'summaries', str(begin_out), str(end_out) ])

		# it's LONG-FORMAT output with all data in rows for a single var/metric
		output_filename = os.path.join( output_path, prefix + '.json' )
		with open( output_filename, 'w' ) as out_json:
			json.dump( all_data, out_json )

		# now some panel-y stuff with the output JSON
		panel = pd.Panel( deepcopy( all_data ) ).copy()
		metrics = ['mean','max','min','stdev']
		for metric in metrics:	
			df = panel[ :, metric, : ].T
			df = df[ [ str(i) for i in range(1,12+1) ] ] # sort the months
			# sort the model combos
			# df = df.reindex_axis([ '_'.join([s,v,m]) for v,m,s in itertools.product(scenarios, variables, models) ], 0)
			# strip variable and underscore
			# df.index = [ ' '.join(i.split('_')[:-1]) for i in df.index ]
			output_filename = os.path.join( output_path, prefix + '_' + metric +'.csv' )
			df.to_csv( output_filename, sep=',' )


# def get_metrics( ):
# 		# mod
# 		modeled_files = glob.glob( os.path.join( base_path, model, scenario, variable, '*.tif' ) )
Ejemplo n.º 30
0
    def read_curves_file(self, fname=None):
        if fname is None:
            fname = self.fname
        if fname is None:
            raise RuntimeError("No file defined for analysis!")
        f = open(fname, "r")
        category = ""
        countA = 0
        countB = 0
        countC = 0
        countD = 0
        countE = 0

        for line in f:
            if re.search(r"\(A\)", line):
                category = "A"
                countA += 1
            elif re.search(r"\(B\)", line):
                category = "B"
                countB += 1
            elif re.search(r"\(C\)", line):
                category = "C"
                countC += 1
            elif re.search(r"\(D\)", line):
                category = "D"
                countD += 1
            elif re.search(r"\(E\)", line):
                category = "E"
                countE += 1

            if category == "A" and self.is_data(line):
                group_label = 'groupA'
                co_keys = range(1, 4)
                float_data = range(4, 9)
                str_data = []
                splitter = self.line_prep(line, co_keys, float_data, str_data)
                self.add_data(splitter, group_label, co_keys, float_data,
                              str_data)
            elif category == "B" and self.is_data(line):
                group_label = 'groupB'
                co_keys = range(1, 4)
                float_data = range(4, 10)
                str_data = []
                splitter = self.line_prep(line, co_keys, float_data, str_data)
                self.add_data(splitter, group_label, co_keys, float_data,
                              str_data)
            elif category == "C" and self.is_data(line):
                group_label = 'groupC'
                co_keys = range(1, 4)
                float_data = range(4, 12)
                str_data = []
                splitter = self.line_prep(line, co_keys, float_data, str_data)
                self.add_data(splitter, group_label, co_keys, float_data,
                              str_data)
            elif category == "E" and self.is_data(line):
                group_label = 'groupE'
                co_keys = [1, 2]
                float_data = range(3, 7)
                str_data = []
                splitter = self.line_prep(line, co_keys, float_data, str_data)
                key = make_key(splitter)
                if key - math.floor(key) > 0.01:  # we have an x.5 value
                    splitter.insert(1, '---')
                    splitter.insert(1, '---')
                self.add_data(splitter, group_label, co_keys, float_data,
                              str_data)

        for label in self.group_labels:
            dfs = {}
            for key, name in self.setup[label].items():
                dfs[name] = pd.DataFrame(self.prep_data[label][key])
            self.panels[label] = pd.Panel(dfs)

        return (countA, countB, countC, countD, countE)