def _get_value(self): """Parse values based on "m$n", ";" and ")" delimiters""" RV = [] # first value, between "m$n" and ";" # add +4 because "m$n" delimiter has 3 chars, plus 1 white space value_start = self.row[0].find("m$n") + 4 value_end = self.row[0].find(";") # get substring, convert to float and append str_value = self.row[0][value_start:value_end] float_value = convert_to_float(str_value) RV.append(float_value) # second value, between last "m$n" and ")" # add +4 because "m$n" delimiter has 3 chars, plus 1 white space value_start = self.row[0].rfind("m$n") + 4 value_end = self.row[0].find(")") # get substring, convert to float and append str_value = self.row[0][value_start:value_end] float_value = convert_to_float(str_value) RV.append(float_value) return RV
def get_rights_issue(ticker, start_date, end_date):
    '''
    Fetch the implemented rights-issue data for a stock.

    Parameters
    ----------
    ticker      '600340'
    start_date  '20100101'
    end_date    '20150101'

    Returns
    -------
    DataFrame
        index    ex_rights_date
        columns  'ex_rights_date', 'rights_issue_per_stock',
                 'rights_issue_price', 'transfer_rights_issue_per_stock',
                 'transfer_price'
        (ex-rights date, rights issue per share, rights-issue price,
         transferred rights per share, transfer price per share)
    '''
    sql_select = '''
        SELECT [除权日]
              ,[配股比例分子]
              ,[配股价格]
              ,[转配比例分子]
              ,[转让费]
        FROM [BasicData].[dbo].[Yi_RightsIssue]
        WHERE [除权日] is not null
          AND [stockcode] = '%s'
    ''' % (ticker)
    cur.execute(sql_select)
    data = cur.fetchall()
    columns = [
        'ex_rights_date', 'rights_issue_numerator', 'rights_issue_price',
        'transfer_rights_issue_numerator', 'transfer_rights_issue_fee'
    ]
    df = pd.DataFrame(data, columns=columns)
    df = df.fillna(0)
    df['ex_rights_date'] = matlab_time_convert(df['ex_rights_date'])
    df['rights_issue_price'] = convert_to_float(df['rights_issue_price'])
    # Numerators are quoted per 10 shares held, so divide by 10 to get
    # per-share figures.
    df['rights_issue_per_stock'] = \
        convert_to_float(df['rights_issue_numerator']) / 10.0
    df['transfer_rights_issue_per_stock'] = \
        convert_to_float(df['transfer_rights_issue_numerator']) / 10.0
    df['transfer_price'] = \
        df['rights_issue_price'] + convert_to_float(df['transfer_rights_issue_fee'])
    df = df[[
        'ex_rights_date', 'rights_issue_per_stock', 'rights_issue_price',
        'transfer_rights_issue_per_stock', 'transfer_price'
    ]]
    df = df.set_index('ex_rights_date', drop=False)
    df = df.sort_index()
    df = df[start_date:end_date]
    return df
def update_rover(Rover, data):
    # Retrieve current kinematic and control values.
    Rover.vel = convert_to_float(data["speed"])  # meters/sec
    Rover.pos = [
        convert_to_float(pos.strip()) for pos in data["position"].split(';')
    ]
    Rover.yaw = convert_to_float(data["yaw"])
    Rover.pitch = convert_to_float(data["pitch"])
    Rover.roll = convert_to_float(data["roll"])
    Rover.throttle = convert_to_float(data["throttle"])
    Rover.steer = convert_to_float(data["steering_angle"])

    # Initialize start time and sample positions on the first update.
    if Rover.start_time is None:
        Rover.start_time = time.time()
        Rover.total_time = 0
        samples_xpos = np.int_([
            convert_to_float(pos.strip())
            for pos in data["samples_x"].split(';')
        ])
        samples_ypos = np.int_([
            convert_to_float(pos.strip())
            for pos in data["samples_y"].split(';')
        ])
        Rover.samples_pos = (samples_xpos, samples_ypos)
        Rover.samples_to_find = int(data['sample_count'])
        Rover.homePos = (Rover.pos[0], Rover.pos[1])
        Log("Home=" + str(Rover.homePos))
    else:
        tot_time = time.time() - Rover.start_time
        if np.isfinite(tot_time):
            Rover.total_time = tot_time

    # Near-sample flag.
    Rover.near_sample = int(data["near_sample"])
    # Picking-up flag.
    Rover.picking_up = int(data["picking_up"])
    # Update the number of rocks collected.
    Rover.samples_collected = Rover.samples_to_find - int(data["sample_count"])

    # Get the current image from the rover's center camera.
    imgString = data["image"]
    image = Image.open(BytesIO(base64.b64decode(imgString)))
    Rover.img = np.asarray(image)

    # Create a dict container for various processed images.
    Rover.procImage = {}
    Rover.procImage['POVRaw'] = Rover.img

    #LogRoverState(Rover, data)

    # Return the updated Rover and a separate image for optional saving.
    return Rover, image
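# A small, self-contained illustration of the semicolon-separated
# telemetry fields parsed above (the "position" key follows the code;
# the coordinate values here are made up):
data = {"position": "99.7;85.6"}
pos = [float(p.strip()) for p in data["position"].split(';')]
assert pos == [99.7, 85.6]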
def get_dividend(ticker, start_date, end_date):
    '''
    Fetch the dividend / bonus-share / share-transfer data implemented
    for a stock within a time period.

    Parameters
    ----------
    ticker      '600340'
    start_date  '20100101'
    end_date    '20150101'

    Returns
    -------
    DataFrame
        index    XD_date
        columns  XD_date, dividend_per_share, multiplier
        (ex-dividend date, dividend per share, per-share multiplier
         after the distribution)
    '''
    sql_select = '''
        SELECT [stockcode]
              ,[除权除息日]
              ,[送股比例分子]
              ,[转增比例分子]
              ,[派息比例分子_税后]
        FROM BasicData.dbo.Yi_Dividend
        WHERE stockcode = '%s'
          AND [numtime] is not Null
    ''' % (ticker)
    cur.execute(sql_select)
    data = cur.fetchall()
    columns = [
        'ticker', 'XD_date', 'stock_dividend_numerator',
        'transfer_numerator', 'cash_dividend_numerator_after_tax'
    ]
    df = pd.DataFrame(data, columns=columns)
    df = df.fillna(0)
    # Numerators are quoted per 10 shares held.
    df['dividend_per_share'] = convert_to_float(
        df[u'cash_dividend_numerator_after_tax']) / 10.0
    df['multiplier'] = 1 + (convert_to_float(df[u'stock_dividend_numerator']) +
                            convert_to_float(df[u'transfer_numerator'])) / 10.0
    df['XD_date'] = matlab_time_convert(df['XD_date'])
    df = df[['XD_date', 'dividend_per_share',
             'multiplier']].set_index('XD_date', drop=False)
    df = df.sort_index()
    df = df[start_date:end_date]
    return df
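# A worked example of the multiplier formula above, with made-up
# numerators: a "send 2, transfer 3 per 10 shares" plan turns every
# share into 1 + (2 + 3) / 10 = 1.5 shares.
stock_dividend_numerator = 2.0
transfer_numerator = 3.0
multiplier = 1 + (stock_dividend_numerator + transfer_numerator) / 10.0
assert multiplier == 1.5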
def __init__(self, elem_lst):
    self.cons_no = elem_lst[1]  # consumer number (association key)
    self.cons_id = elem_lst[0]  # consumer ID
    self.cons_sort_code = elem_lst[10]  # consumer category
    if elem_lst[10] not in sort_dict:
        self.cons_sort_code = "others"
    self.contract_cap = utils.convert_to_float(elem_lst[5])  # contracted capacity
    self.elec_addr = elem_lst[2]  # electricity-use address
    self.elec_type_code = elem_lst[4]  # electricity-use type
    self.hec_industry_code = elem_lst[7]  # energy-intensive industry category
    if len(elem_lst[7]) == 0:
        self.hec_industry_code = "null"
    self.load_attr_code = elem_lst[6]  # load attribute
    if len(elem_lst[6]) == 0:
        self.load_attr_code = "null"
    self.org_no = elem_lst[9]  # power-supply organization number
    self.status_code = elem_lst[8]  # consumer status
    if len(elem_lst[8]) == 0:
        self.status_code = "null"
    self.trade_code = elem_lst[3]  # industry classification
    if elem_lst[3] not in trade_dict:
        self.trade_code = "others"
    self.urban_rurl_flag = elem_lst[11]  # urban/rural flag
    if len(elem_lst[11]) == 0:
        self.urban_rurl_flag = "null"
def read_stage_length(stage_soup):
    # Strip the parentheses and the 'k' unit marker from the length
    # span before converting to a float.
    stage_info = stage_soup.find('div', class_='entryHeader').find('h2')
    stage_length = convert_to_float(
        stage_info.find_all('span')[2].text.replace('(', '').replace(
            ')', '').replace('k', ''))
    return stage_length
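# A self-contained check of the cleanup performed above, using a
# hypothetical header span (the real page markup isn't shown here):
text = "(187.5k)"
cleaned = text.replace('(', '').replace(')', '').replace('k', '')
assert float(cleaned) == 187.5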
def main():
    dirname = os.path.dirname(__file__)
    output_dirname = os.path.join(dirname, 'results')
    os.makedirs(output_dirname, exist_ok=True)

    file_name = sys.argv[1]
    file_name = os.path.join(dirname, file_name)
    d = DataSet(file_name)
    d.loadDataSet()

    # Remove useless features (not numeric + bad regressors).
    to_remove = [
        d.data_set[0].index('Index'),
        d.data_set[0].index('First Name'),
        d.data_set[0].index('Last Name'),
        d.data_set[0].index('Birthday'),
        d.data_set[0].index('Best Hand'),
        d.data_set[0].index('Hogwarts House'),
        # Tests 7/10/18
        d.data_set[0].index('Arithmancy'),
        d.data_set[0].index('Defense Against the Dark Arts'),
        d.data_set[0].index('Divination'),
        d.data_set[0].index('Muggle Studies'),
        d.data_set[0].index('History of Magic'),
        d.data_set[0].index('Transfiguration'),
        d.data_set[0].index('Potions'),
        d.data_set[0].index('Care of Magical Creatures'),
        d.data_set[0].index('Charms'),
        d.data_set[0].index('Flying'),
    ]
    X = np.array([[
        d.data_set[i][j] for j in range(len(d.data_set[0]))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    #features = X[0,:]
    X = convert_to_float(X[1:, :])
    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # Impute missing values.
    m = MeanImputation(X)
    m.train()
    m.transform()

    # Scale the variables.
    sc = Scaling(X)
    sc.train()
    sc.transform()

    # Train a logistic regression model.
    l = LogisticRegression(X=X, y=y)
    l.train()
def _get_value(self): """Parse values from tbl_row, last two cells in row.""" value_1945 = None value_1946 = None # if cant convert to float, is missing try: value_1945 = convert_to_float(self.row[3]) except: pass # if cant convert to float, is missing try: value_1946 = convert_to_float(self.row[4]) except: pass return [value_1945, value_1946]
def _get_quantity(self):
    """Parse quantities from tbl_row, first two cells after the country name."""
    quantity_1945 = None
    quantity_1946 = None
    # If the cell can't be converted to a float, treat it as missing.
    try:
        quantity_1945 = convert_to_float(self.row[1])
    except (TypeError, ValueError):
        pass
    try:
        quantity_1946 = convert_to_float(self.row[2])
    except (TypeError, ValueError):
        pass
    return [quantity_1945, quantity_1946]
def get_trial_metadata_dictionaries_partial(self, experiment_number,
                                            experimentdict):
    '''Create a dictionary of dictionaries organized as
    {Trial-#: {'evid': 12345, 'magnitude': 4.02, 'distance': 124}}.
    This makes the stored information much easier to parse, using a
    double index to get a specific piece of information about a
    specific trial.'''
    experimentId = experiment_id[experiment_number]
    cache_evid_dict = {}
    cache_ml_dict = {}
    cache_distance_dict = {}
    for trial in experimentdict:
        requestDict = self.make_request(
            "GET", "/REST/Project/353/Experiment/%s/Trial/%s" %
            (experimentId, experimentdict[trial]))
        data = requestDict['data']
        evid = utils.parse_description('evid:', data)
        magnitude = utils.parse_description('ml:', data)
        distance = utils.parse_description('distance:', data)
        cache_evid_dict[trial] = utils.convert_to_long(evid)
        cache_ml_dict[trial] = utils.convert_to_float(magnitude)
        cache_distance_dict[trial] = utils.convert_to_float(distance)
        nees_logging.log_cache_invalid_cache_variables(
            trial, cache_evid_dict[trial], cache_ml_dict[trial],
            cache_distance_dict[trial])
    return cache_evid_dict, cache_ml_dict, cache_distance_dict
def __init__(self, elem_lst):
    self.cons_no = elem_lst[0]  # consumer number
    # Receivable year-month.
    self.ymrcvbl_ym = utils.convert_to_date_YM('09_ARC_A_RCVBL_FLOW.TSV',
                                               elem_lst[1])
    self.org_no = elem_lst[2]  # power-supply organization number
    self.pay_code = elem_lst[3]  # payment method
    self.t_pq = utils.convert_to_float(elem_lst[4])  # total power quantity
    self.rcvbl_amt = utils.convert_to_float(elem_lst[5])  # receivable amount
    self.rcved_amt = utils.convert_to_float(elem_lst[6])  # received amount
    self.status_code = elem_lst[7]  # fee status
    self.rcvbl_penalty = utils.convert_to_float(elem_lst[8])  # receivable penalty
    self.rcved_penalty = utils.convert_to_float(elem_lst[9])  # received penalty
    self.risk_level_code = elem_lst[10]  # risk level
    self.owe_amt = utils.convert_to_float(elem_lst[11])  # owed electricity fee
    self.cons_sort_code = elem_lst[12]  # consumer category
    self.elec_type_code = elem_lst[13]  # electricity-use type
    self.ctl_mode = elem_lst[14]  # fee-control mode
def get_trial_metadata_dictionaries_partial(project_id, experiment_id,
                                            experimentdict):
    '''Create a dictionary of dictionaries organized as
    {Trial-#: {'evid': 12345, 'magnitude': 4.02, 'distance': 124}}.
    This makes the stored information much easier to parse, using a
    double index to get a specific piece of information about a
    specific trial.'''
    cache_evid_dict = {}
    cache_ml_dict = {}
    cache_distance_dict = {}
    for trial in experimentdict:
        request = "%s%s/Experiment/%s/Trial/%s" % (
            neeshub_project_path, project_id, experiment_id,
            experimentdict[trial])
        authentic_request = utils.authenticate_request(request)
        requestDict = conn.request('GET', authentic_request)
        data = requestDict['data']
        evid = utils.parse_description('evid:', data)
        magnitude = utils.parse_description('ml:', data)
        distance = utils.parse_description('distance:', data)
        cache_evid_dict[trial] = utils.convert_to_long(evid)
        cache_ml_dict[trial] = utils.convert_to_float(magnitude)
        cache_distance_dict[trial] = utils.convert_to_float(distance)
        nees_logging.log_cache_invalid_cache_variables(
            trial, cache_evid_dict[trial], cache_ml_dict[trial],
            cache_distance_dict[trial])
    return cache_evid_dict, cache_ml_dict, cache_distance_dict
def standardize_value(self, cleansed_dict, ingredient_texts):
    uom_indxs = [
        i for i, v in enumerate(cleansed_dict['tokens'])
        if v['type'] == 'unit_of_measure'
    ]
    for indx, uom_indx in enumerate(uom_indxs):
        try:
            uom = cleansed_dict['tokens'][uom_indx]['standard_token']
            ing = ingredient_texts[indx]
            # The unit of measure mostly follows its value, so take the
            # uom_value index as 1 less than the uom index.
            uom_value_indx = uom_indx - 1
            value = cleansed_dict['tokens'][uom_value_indx]['standard_token']
            vtype = cleansed_dict['tokens'][uom_value_indx]['type']
            if vtype != 'value':
                # Fall back to the token before that if the current
                # token is not of type "value".
                uom_value_indx = uom_value_indx - 1
                if cleansed_dict['tokens'][uom_value_indx]['type'] == 'value':
                    value = cleansed_dict['tokens'][uom_value_indx][
                        'standard_token']
                else:
                    continue
            if not value:
                continue
            sku = ing + '_' + uom
            std_values_dict = self.valid_skus.get(sku, {})
            std_values = sorted(std_values_dict.keys())
            float_value = convert_to_float(value)
            if not float_value:
                continue
            if float_value not in std_values:
                for i, v in enumerate(std_values):
                    if float_value < v:
                        v = std_values_dict[v]
                        uom_dict = cleansed_dict['tokens'][uom_value_indx]
                        uom_dict["standard_token"] = v
                        if indx == 0:
                            cleansed_dict['unit_of_measure'] = uom
                            cleansed_dict['unit_of_measure_value'] = v
                        break
        except IndexError:
            continue
    return cleansed_dict
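# A minimal sketch of the "snap up to the next standard value" rule the
# inner loop above implements, with hypothetical standard values. The
# real code maps the result through std_values_dict; this only shows
# the selection logic.
def snap_up(value, std_values):
    for v in sorted(std_values):
        if value < v:
            return v
    return value

assert snap_up(1.3, [0.5, 1.0, 1.5, 2.0]) == 1.5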
def standardize_value_old(self, cleansed_dict, ingredient_texts):
    ingredient = cleansed_dict['ingredient']
    uom = cleansed_dict['unit_of_measure']
    value = cleansed_dict['unit_of_measure_value']
    if not value:
        return cleansed_dict
    sku = ingredient + '_' + uom
    std_values_dict = self.valid_skus.get(sku, {})
    std_values = sorted(std_values_dict.keys())
    float_value = convert_to_float(value)
    if float_value not in std_values:
        for i, v in enumerate(std_values):
            if float_value < v:
                value = std_values_dict[v]
                break
    if cleansed_dict['unit_of_measure_value'] != value:
        uom_indxs = [
            i for i, v in enumerate(cleansed_dict['tokens'])
            if v['type'] == 'unit_of_measure'
        ]
        for i, uom_indx in enumerate(uom_indxs):
            try:
                uom = cleansed_dict['tokens'][uom_indx]['standard_token']
                ing = ingredient_texts[i]
                if sku == (ing + '_' + uom):
                    uom_value_indx = uom_indx - 1
                    uom_dict = cleansed_dict['tokens'][uom_value_indx]
                    uom_dict["standard_token"] = value
                    cleansed_dict['tokens'][uom_value_indx] = uom_dict
                    if i == 0:
                        cleansed_dict['unit_of_measure_value'] = value
            except IndexError:
                break
    return cleansed_dict
def _make_profit_statements(self, code, years):
    self.profits = ts.get_profit_statement(code)
    self.profit_statements = {
        year: utils.convert_to_float(self.profits[year])
        for year in years
    }
def main():
    '''
    Use this script to run experiments and fine-tune the algorithms.
    '''
    # Load the dataset.
    file_name = sys.argv[1]
    dirname = os.path.dirname(__file__)
    file_name = os.path.join(dirname, file_name)
    d = DataSet(file_name)
    d.loadDataSet()

    # Remove useless features (not numeric + bad regressors).
    to_remove = [
        d.data_set[0].index('Index'),
        d.data_set[0].index('First Name'),
        d.data_set[0].index('Last Name'),
        d.data_set[0].index('Birthday'),
        d.data_set[0].index('Best Hand'),
        d.data_set[0].index('Hogwarts House'),
        # Tests 7/10/18
        d.data_set[0].index('Arithmancy'),
        d.data_set[0].index('Defense Against the Dark Arts'),
        d.data_set[0].index('Divination'),
        d.data_set[0].index('Muggle Studies'),
        d.data_set[0].index('History of Magic'),
        d.data_set[0].index('Transfiguration'),
        d.data_set[0].index('Potions'),
        d.data_set[0].index('Care of Magical Creatures'),
        d.data_set[0].index('Charms'),
        d.data_set[0].index('Flying'),
    ]
    X = np.array([[
        d.data_set[i][j] for j in range(len(d.data_set[0]))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    X = convert_to_float(X[1:, :])
    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # Impute missing values.
    m = MeanImputation(X)
    m.train()
    m.transform()

    # Scale the variables.
    sc = Scaling(X)
    sc.train()
    sc.transform()

    # Split the dataset into a training and a testing set.
    sp = SplitTrainTest(X, y)
    sp.Split()
    X_train = sp.X_train
    y_train = sp.y_train
    X_test = sp.X_test
    y_test = sp.y_test

    # Train a logistic regression model.
    l = LogisticRegression(X=X_train, y=y_train)
    l.train()

    # Compute the confusion matrix over the training set.
    y_predicted = l.predict()
    cm1 = ConfusionMatrix(y_train, y_predicted)
    cm1.getMatrix()
    print('\n\n')
    print('**************** Confusion Matrix on the training set ****************')
    print('\n')
    cm1.Print()

    # Compute the confusion matrix over the testing set.
    y_predicted = l.predict(X_test)
    cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels)
    cm2.getMatrix()
    print('\n\n')
    print('**************** Confusion Matrix on the testing set ****************')
    print('\n')
    cm2.Print()
def main():
    file_name = sys.argv[1]
    dirname = os.path.dirname(__file__)
    file_name = os.path.join(dirname, file_name)
    d = DataSet(file_name)
    d.loadDataSet()

    to_remove = [
        d.data_set[0].index('Index'),
        d.data_set[0].index('First Name'),
        d.data_set[0].index('Last Name'),
        d.data_set[0].index('Birthday'),
        d.data_set[0].index('Best Hand'),
        d.data_set[0].index('Hogwarts House'),
        # Tests 7/10/18
        d.data_set[0].index('Arithmancy'),
        d.data_set[0].index('Defense Against the Dark Arts'),
        d.data_set[0].index('Divination'),
        d.data_set[0].index('Muggle Studies'),
        d.data_set[0].index('History of Magic'),
        d.data_set[0].index('Transfiguration'),
        d.data_set[0].index('Potions'),
        d.data_set[0].index('Care of Magical Creatures'),
        d.data_set[0].index('Charms'),
        d.data_set[0].index('Flying'),
    ]
    X = np.array([[
        d.data_set[i][j] for j in range(len(d.data_set[0]))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    X = convert_to_float(X[1:, :])
    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    m = MeanImputation(X)
    m.train()
    m.transform()

    sc = Scaling(X)
    sc.train()
    sc.transform()

    sp = SplitTrainTest(X, y)
    sp.Split()
    X_train = sp.X_train
    y_train = sp.y_train
    X_test = sp.X_test
    y_test = sp.y_test

    l = LogisticRegression(X=X_train,
                           y=y_train,
                           optimizer='sgd',
                           optimizer_params={
                               'alpha': 0.5,
                               'n': 5,
                               'batch_size': 16
                           })
    l.train()

    y_predicted = l.predict()
    cm1 = ConfusionMatrix(y_train, y_predicted)
    cm1.getMatrix()
    print('\n\n')
    print('**************** Confusion Matrix on the training set ****************')
    print('\n')
    cm1.Print()

    y_predicted = l.predict(X_test)
    cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels)
    cm2.getMatrix()
    print('\n\n')
    print('**************** Confusion Matrix on the testing set ****************')
    print('\n')
    cm2.Print()
def latest_equity(self):
    bs = utils.convert_to_float(self.balance_sheet.balances.iloc[:, 1])
    return bs[bsheet.balance_sheet_index['equity']]
import pandas as pd
import numpy as np
from utils import convert_ids, convert_to_float, to_json

movies_metadata_df1 = pd.read_csv(
    '../data/movies_metadata.csv',
    converters={
        'id': lambda x: convert_ids(x),
        'imdb_id': lambda x: convert_ids(x),
        'popularity': lambda x: convert_to_float(x),
        'genres': lambda x: to_json(x)
    },
    usecols=[
        'id', 'original_title', 'genres',  # 'homepage'
        'overview', 'popularity', 'poster_path',
        'release_date', 'revenue', 'runtime',
        'spoken_languages', 'title',
        'vote_average', 'vote_count'
    ],
    dtype={'popularity': np.float64},
    parse_dates=True,
    low_memory=False)

movies_lookup_df = pd.read_csv(
    '../data/movies_metadata.csv',
    converters={
        'id': lambda x: convert_ids(x),
        'imdb_id': lambda x: convert_ids(x)
    },
    usecols=['id', 'title'],
    low_memory=False)

#####################################
## SVD DATA SET
movies_df = pd.read_csv(
    '../data/movies_metadata.csv',
    converters={
        'id': lambda x: convert_ids(x),
        'imdb_id': lambda x: convert_ids(x)
    },
    usecols=['id', 'original_title', 'belongs_to_collection',
             'budget', 'genres', 'homepage',
             'imdb_id', 'overview', 'popularity', 'poster_path',
             'production_companies', 'release_date', 'revenue',
             'runtime', 'spoken_languages', 'status', 'tagline',
             'title', 'video',
def test_convert_to_float():
    assert utils.convert_to_float("23.45") == 23.45
    assert utils.convert_to_float("0") == 0.0
    assert utils.convert_to_float("-0") == 0.0
    assert utils.convert_to_float("-23") == -23.0
    assert utils.convert_to_float(245) == 245.0
    assert utils.convert_to_float(-245) == -245.0
    assert utils.convert_to_float(-245.8457) == -245.8457
    assert utils.convert_to_float(0) == 0.0
    assert utils.convert_to_float("43.53") == 43.53
    assert utils.convert_to_float("25sn") is None
    assert utils.convert_to_float("s45") is None
    assert utils.convert_to_float("string") is None
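# A minimal sketch of a convert_to_float that satisfies the assertions
# above (the project's actual utils implementation may differ): return
# the parsed float on success, and None for any unparsable input.
def convert_to_float(value):
    try:
        return float(value)
    except (TypeError, ValueError):
        return None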
def main():
    dirname = os.path.dirname(__file__)
    dirname_prediction = os.path.join(dirname, 'results')
    file_name = sys.argv[1]
    file_name = os.path.join(dirname, file_name)
    d = DataSet(file_name)
    d.loadDataSet()

    to_remove = [
        d.data_set[0].index('Index'),
        d.data_set[0].index('First Name'),
        d.data_set[0].index('Last Name'),
        d.data_set[0].index('Birthday'),
        d.data_set[0].index('Best Hand'),
        d.data_set[0].index('Hogwarts House'),
        # Tests 7/10/18
        d.data_set[0].index('Arithmancy'),
        d.data_set[0].index('Defense Against the Dark Arts'),
        d.data_set[0].index('Divination'),
        d.data_set[0].index('Muggle Studies'),
        d.data_set[0].index('History of Magic'),
        d.data_set[0].index('Transfiguration'),
        d.data_set[0].index('Potions'),
        d.data_set[0].index('Care of Magical Creatures'),
        d.data_set[0].index('Charms'),
        d.data_set[0].index('Flying'),
    ]
    index_position = d.data_set[0].index('Index')
    indexes = np.array(
        [d.data_set[i][index_position] for i in range(len(d.data_set))])[1:]
    X = np.array([[
        d.data_set[i][j] for j in range(len(d.data_set[0]))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    #features = X[0,:]
    X = convert_to_float(X[1:, :])

    # Apply the imputation, scaling and regression parameters saved
    # during training.
    m = MeanImputation(
        X,
        path_to_mean_imputation=os.path.join(dirname_prediction,
                                             'mean_imputation.json'))
    m.transform()
    sc = Scaling(
        X, path_to_scaling=os.path.join(dirname_prediction, 'scaling.json'))
    sc.transform()
    l = LogisticRegression(
        X=X, path_to_beta=os.path.join(dirname_prediction, 'beta.json'))
    predictions = l.predict()

    # Write the predictions to resources/houses.csv.
    file_name = os.path.join(dirname, 'resources/houses.csv')
    with open(file_name, 'w+') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        writer.writerow(['Index', 'Hogwarts House'])
        for i in range(len(indexes)):
            writer.writerow([indexes[i], predictions[i]])
def _make_cash_flows(self, code, years):
    cash_flows = ts.get_cash_flow(code)
    self.cash_flows = {
        year: utils.convert_to_float(cash_flows[year])
        for year in years
    }
def _make_balance_sheets(self, code, years):
    self.balances = ts.get_balance_sheet(code)
    self.balance_sheets = {
        year: utils.convert_to_float(self.balances[year])
        for year in years
    }