def __init__(self):
    # Empty placeholders; the data-loading methods fill these in later
    self.train_df = pd.DataFrame()
    self.train_data = np.empty(0)
    self.train_label = np.empty(0)
    self.validation_df = pd.DataFrame()
    self.validation_data = np.empty(0)
    self.validation_label = np.empty(0)
    self.test_df = pd.DataFrame()
    self.test_data = np.empty(0)
    self.test_label = np.empty(0)

def ROE(self, datatype: Universes, ticker):
    if datatype is Universes.SNP:
        NI_data = pd.DataFrame(self.all_data[datatype][f"{ticker} EARN_FOR_COMMON"])
        Mcap_data = pd.DataFrame(self.all_data[datatype][f"{ticker} TOTAL_EQUITY"])
        ROE = NI_data / Mcap_data
        return ROE.ffill()
    else:
        print("An ETF can't have an ROE. I am in src.Datarepository.ROE")
        raise KeyError

def leverage(self, datatype: Universes, ticker):
    if datatype is Universes.SNP:
        TA_data = pd.DataFrame(self.all_data[datatype][f"{ticker} TOTAL_ASSETS"])
        TE_data = pd.DataFrame(self.all_data[datatype][f"{ticker} TOTAL_EQUITY"])
        leverage = TA_data / TE_data
        return leverage.ffill()
    else:
        print("An ETF can't have a Leverage Ratio. I am in src.Datarepository.leverage")
        raise KeyError

def file_mappings(nctc_config):
    logger.info("Mapping files in %s to successful assemblies" % nctc_config.ftp_root_dir)
    root_dir = nctc_config.ftp_root_dir
    all_paths = _get_all_paths(root_dir)
    automatic_gffs = _parse_automatic_gffs(all_paths,
                                           nctc_config.automatic_gffs_dir,
                                           nctc_config.automatic_gffs_url)
    manual_embls = _parse_manual_embls(all_paths,
                                       nctc_config.automatic_embls_dir,
                                       nctc_config.automatic_embls_url)
    manual_gffs = _parse_manual_gffs(all_paths,
                                     nctc_config.automatic_gffs_dir,
                                     nctc_config.automatic_gffs_url)
    return (pd.DataFrame(automatic_gffs),
            pd.DataFrame(manual_embls),
            pd.DataFrame(manual_gffs))

def __init__(self, all_actions, state, alpha=0.01, gamma=0.9, eplison=0.8):
    self.all_actions = all_actions  # action set
    self.gamma = gamma
    self.alpha = alpha  # learning rate, default 0.01
    self.eplison = eplison  # epsilon-greedy rate, default 0.8
    self.q_table = pd.DataFrame(columns=self.all_actions, dtype=np.float64)  # the Q-table

def house_predict():
    test_json = request.get_json()
    if test_json:  # there is data
        if isinstance(test_json, dict):  # single example
            test_raw = pd.DataFrame(test_json, index=[0])
        else:  # multiple examples
            test_raw = pd.DataFrame(test_json, columns=test_json[0].keys())
        # instantiate the House pipeline
        pipeline = House()
        # data cleaning
        df1 = pipeline.data_cleaning(test_raw)
        # feature engineering
        df2 = pipeline.feature_engineer(df1)
        # data preparation
        df3 = pipeline.data_preparation(df1, df2)
        # prediction
        df_response = pipeline.get_prediction(model, test_raw, df3)
        return df_response
    else:
        return Response('{}', status=200, mimetype='application/json')

def npy_to_csv(path):
    """
    **Convert a single .npy file into a .csv file**

    this function creates a new .csv file from the given .npy file and saves
    it at the same location with the same name as the .npy file

    :param path: the path of the .npy file
    :return: this method returns 1 on success
    """
    os.chdir(path + '/../')
    np_array = np.load(path, 'r')
    filename = ntpath.basename(path)
    pd.DataFrame(np_array).to_csv(filename[:filename.rfind('.')] + '.csv',
                                  index=False, header=True)
    return 1


# this is for testing purposes
def switchoption(n, path):
    # map the option number to a converter and call only the selected one
    switcher = {
        1: zip_to_csv,
        2: zip_to_npy,
        3: sql_to_csv,
        4: sql_to_npy,
        5: csv_to_sql,
        6: csv_to_npy,
    }
    func = switcher.get(n)
    if func is None:
        return str(n) + " is an invalid option"
    return func(path)

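# Hypothetical usage sketch for switchoption (the option number and file path below are
# made-up examples, not taken from the original project): option 6 maps to csv_to_npy in
# the switcher above, so this call would convert the given .csv file into a .npy file.
switchoption(6, "/tmp/measurements.csv")
# An unrecognised option number falls through to the "... is an invalid option" message.
print(switchoption(9, "/tmp/measurements.csv"))
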
def trainRandomForest(gridsearch=False):
    if gridsearch:
        random_forest = RandomForestClassifier(class_weight="balanced", verbose=1)
        parameters = {
            # "n_estimators": (50, 75, 100, 150),
            # "max_depth": (10, 15, 20),
            "n_estimators": (150, 175, 200),
            "max_depth": (20, 25, 30),
            # "min_samples_split": (1, 2, 4),
            # "max_features": (None, "auto", "sqrt", "log2"),
            # "class_weight": ("balanced", "balanced_subsample", None),
        }
        gs = GridSearchCV(estimator=random_forest, param_grid=parameters,
                          scoring="f1", verbose=3)
        gs.fit(X_TRAIN, Y_TRAIN)
        print(f"Best parameters : {gs.best_params_}")
        print(f"Best f1 score : {gs.best_score_}")
        df = pd.DataFrame(gs.cv_results_)
        print(df)
        df.to_csv("gridsearch_results_RandomForest.csv", sep="\t")
        model = gs.best_estimator_
    else:
        model = RandomForestClassifier(n_estimators=75, max_depth=10,
                                       class_weight="balanced")
        model.fit(X_TRAIN, Y_TRAIN)
    return model

def trainSVM(gridsearch=False):
    """Train SVM model

    Returns trained model."""
    if gridsearch:
        pipeGS = Pipeline(steps=[
            ("scaler", StandardScaler(with_mean=False)),
            ("svc", SVC(verbose=True, max_iter=10)),
        ])
        parameters = {
            "svc__C": [0.1, 1, 10, 100],
            "svc__gamma": [1, 0.1, 0.01, 0.001],
            "svc__kernel": ["rbf", "poly", "sigmoid"],
        }
        gs = GridSearchCV(estimator=pipeGS, param_grid=parameters,
                          scoring="f1", verbose=3)
        gs.fit(X_TRAIN, Y_TRAIN)
        print(f"Best parameters : {gs.best_params_}")
        print(f"Best f1 score : {gs.best_score_}")
        df = pd.DataFrame(gs.cv_results_)
        print(df)
        df.to_csv("gridsearch_results_SVM.csv", sep="\t")
        pipe = gs.best_estimator_
    else:
        pipe = Pipeline(steps=[
            ("scaler", StandardScaler(with_mean=False)),
            ("svc", SVC(verbose=True, max_iter=10000)),
        ])
        print("TRAINING SVM MODEL")
        pipe.fit(X_TRAIN, Y_TRAIN)
    return pipe

def webscrape(self):
    # make the main soup
    soup = self.set_main_soup()
    # get the urls from the page
    self.get_page_urls()
    formatter_ = data_formatting(self.url, self.curr_page.content)
    # progress
    print("------------------------------")
    print(len(self.page_urls), "subpages found")
    # loop through each url and parse the data
    data__list = []
    for i in range(len(self.page_urls)):
        self.side_soup = self.get_page_content(self.page_urls[i])
        formatter_ = data_formatting(self.page_urls[i], self.side_soup)
        page_data = formatter_.parse_html()
        # self.page_data_dict.append(page_data)
        data__list.append(page_data)
    # collect the parsed pages into a DataFrame and return it
    # json_file = self.make_json()
    return pd.DataFrame(data__list)

def create_dataset_as_supervised(table, sensorId, timesteps=3, limit=True, df_to_csv=False):
    if df_to_csv:
        df_all = csv_to_df(sensorId, limit=limit)
    else:
        df_all = create_dataset(table, sensorId, limit=limit)
    y = df_all['value'][:14]
    df = df_all.drop(['date', 'sensortgId', 'id'], axis=1)
    yt = df['value'].tolist()
    decide_input(yt, 16, 1)
    data = series_to_supervised(df, timesteps)
    data["date"] = df_all["date"]
    X = np.array(data.iloc[:, :timesteps])
    y = np.array(data.iloc[:, timesteps])
    X = pd.DataFrame(X, columns=["var(t-3)", "var(t-2)", "var(t-1)"])
    X["date"] = np.array(data["date"])
    y = pd.DataFrame(y, columns=["value"])
    y["date"] = np.array(data["date"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    return X_train, X_test, y_train, y_test

def parse_by_tag(self, logfile):
    df = self.__read_csv(self.target_path + "/" + logfile)
    for row in range(len(df)):
        uri = df.loc[row, 'uri']
        tags = {
            "..%2F": 'slash',
            "%3C": 'left_bracket',
            "%3B": 'semicolon',
            "%3E": 'right_bracket'
        }
        for tag in tags:
            try:
                if tag in uri.lower():
                    path = "tag/" + tags[tag] + ".csv"
                    series = df.loc[row].T
                    self.save_to_csv(pd.DataFrame(series).transpose(), path)
            except TypeError:
                print("TypeError", uri)
                continue
            except AttributeError:
                print("attribute:", uri)
                continue

def prepare():
    np.random.seed(813)
    print('Loading images from ' + data_folder + '...')
    input_count = 0
    for filename in os.listdir(data_folder):
        if (not filename.startswith('00')) and filename.endswith('.mat'):
            input_count += 1
            curr_input = spio.loadmat(data_folder + '/' + filename, struct_as_record=True)
            img_idata = curr_input['img_idata']
            img_ref = curr_input['img_ref']
            inputs = {}
            for x in range(12):
                inputs['in_ch_' + str(x)] = img_idata[:, :, :, x]
            in_df = pd.DataFrame(inputs)
            outputs = {}
            outputs['water'] = img_ref[:, :, :, 0]
            outputs['fat'] = img_ref[:, :, :, 1]
            outputs['r2'] = img_ref[:, :, :, 2]
            out_df = pd.DataFrame(outputs)
        else:
            continue

def dataframe_results(name, session, result_id, ormclass):
    """
    Function to get pandas DataFrames by the result_id

    Parameters
    ----------
    session : :sqlalchemy:`sqlalchemy.orm.session.Session<orm/session_basics.html>`
        SQLAlchemy session to the OEDB
    """
    query = session.query(ormclass).filter(ormclass.result_id == result_id)
    if name == 'Transformer':
        name = 'Trafo'
    df = pd.read_sql(query.statement, session.bind, index_col=name.lower() + '_id')
    if name == 'Link':
        df['bus0'] = df.bus0.astype(int)
        df['bus1'] = df.bus1.astype(int)
    if 'source' in df:
        source_orm = Source
        source_query = session.query(source_orm)
        df.source = df.source.map(id_to_source(source_query))
    if str(ormclass)[:-2].endswith('T'):
        df = pd.DataFrame()
    return df

def data():
    # @TODO: Use a database query to fetch the results and send
    # the data to your plot
    # class Bigfoot(db.Model):
    #     __tablename__ = 'Bigfoot'
    #     id = db.Column(db.Integer, primary_key=True)
    #     number = db.Column(db.Integer)
    #     title = db.Column(db.Text)
    #     classification = db.Column(db.Text)
    #     timestamp = db.Column(db.Text)
    #     latitude = db.Column(db.Float)
    #     longtitude = db.Column(db.Float)
    #     def __repr__(self):
    #         return '<Bigfoot %r>' % (self.name)
    sel = [func.strftime("%Y", Bigfoot.timestamp), func.count(Bigfoot.timestamp)]
    results = session.query(*sel).\
        group_by(func.strftime("%Y", Bigfoot.timestamp)).all()
    df = pd.DataFrame(results, columns=['months', 'sighting'])
    return jsonify(df.to_dict(orient='records'))

def write_to_xlsxfile(values, filename, sheet_name, headers):
    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    if isinstance(values, dict):
        if not headers:
            the_dataframe = pd.DataFrame.from_dict(values, orient='index')
        else:
            the_dataframe = pd.DataFrame.from_dict(values, orient='index', columns=headers)
    elif isinstance(values, list):
        if not headers:
            the_dataframe = pd.DataFrame(values)
        else:
            the_dataframe = pd.DataFrame(values, columns=headers)
    else:
        return None
    the_dataframe.to_excel(writer, sheet_name=sheet_name)
    writer.close()

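# Hypothetical usage sketch for write_to_xlsxfile (file names and data are made up):
# a dict is written with its keys as the row index, a list is written row by row, and
# the optional headers become the sheet's column names. Requires the xlsxwriter engine.
scores = {"alice": [12, 15], "bob": [9, 11]}
write_to_xlsxfile(scores, "scores.xlsx", "rounds", headers=["round_1", "round_2"])
runs = [[1, "ok"], [2, "failed"]]
write_to_xlsxfile(runs, "runs.xlsx", "run_log", headers=["run_id", "status"])
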
def evaluateMonteCarlo(modelInputs, scenarioXLSX, scenarioName, temoa_path,
                       project_path, solver, cases, caseNum):
    # Unique filename
    model_filename = scenarioName + '_MC_' + str(caseNum)

    # Prepare monte carlo inputs
    cols = ['type', 'variable', 'tech', caseNum]
    MCinputs = cases.loc[:, cols]
    MCinputs = MCinputs.rename(columns={caseNum: 'value'})

    # Build Model
    tt.build(modelInputs, scenarioXLSX, scenarioName, model_filename,
             MCinputs=MCinputs, path=project_path, mc_type='values')

    # Run Model
    error = tt.run(model_filename, saveEXCEL=False, temoa_path=temoa_path,
                   debug=True, solver=solver)

    # Analyze Model (w/ default monte carlo analysis function)
    if not error:
        folder = os.path.join(project_path, 'databases')
        db = model_filename + '.sqlite'
        results = tt.analyze_db(folder, db, scenario=scenarioName, iteration=caseNum,
                                switch='tech', tod_analysis=True, debug=False)
    else:
        results = pd.DataFrame()
    return results

def split_dates(self, date_series):
    """Splits a date string into day, month and year columns"""
    df = pd.DataFrame()
    df['date'] = date_series
    df[["day", "month", "year"]] = df['date'].str.split("/", expand=True)
    df.drop(columns='date', inplace=True)
    return df

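# Hypothetical usage sketch for split_dates: the dates and the instance name `cleaner`
# are made up for illustration; the method expects day/month/year strings joined by "/".
dates = pd.Series(["01/02/2020", "15/07/2021"])
parts = cleaner.split_dates(dates)  # DataFrame with string columns: day, month, year
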
def shuffle(self):
    cols = ['chr', 'bStart', 'bEnd', 'name', 'read_count', 'sumCov']
    dfTotalShuf = pd.DataFrame(columns=cols)
    with NamedTemporaryFile('w', dir='/tmp', delete=False) as f:
        for i in range(1):
            self.dfPickCol.to_csv(f.name, sep='\t', header=False, index=False)
            if self.cell == 'HEK':
                region_file = '../raw_data/hg19.chromSize.noChrY.chr.sorted.bed'
            elif self.cell == 'U2OS':
                region_file = '../raw_data/hg19.chromSize.noChrY.num.sorted.bed'
            cmd = 'bedtools shuffle -seed %s -i %s -g %s' % (i, f.name, region_file)
            print(cmd)
            check_call(shlex.split(cmd), stdout=f)
            dfShuf = pd.read_csv(f.name, header=None, sep='\t')
            dfShuf.columns = cols
            dfShuf['bStart'] = dfShuf.bStart.astype('int')
            dfShuf['bEnd'] = dfShuf.bEnd.astype('int')
            print(dfShuf.head())
            sumCov = self.dnaseDepth(dfShuf, self.bam_file, f.name)
            dfShuf['sumCov'] = sumCov
            dfShuf.to_csv(f.name, sep='\t', header=False, index=False)
            dfTotalShuf = pd.concat([dfTotalShuf, dfShuf], ignore_index=True)
    return dfTotalShuf

def write_csv(self, filename):
    # collect one (family_id, assigned_day) row per family, then write the CSV once
    rows = [[idx + 1, np.argmax(fam_density)]
            for idx, fam_density in enumerate(self.family_density_matrix)]
    df_out = pd.DataFrame(rows, columns=['family_id', 'assigned_day'])
    df_out.to_csv(path_or_buf=filename, index=False)

def _ensure_data_frame(obj, name):
    """
    obj a python object to be converted to a DataFrame

    take an object and make sure that it's a pandas data frame
    """
    # we accept pandas DataFrames, and also dictionaries, lists and tuples,
    # which we convert to a pandas DataFrame
    if isinstance(obj, pd.DataFrame):
        df = obj
    elif isinstance(obj, dict):
        # dictionary case: keys become column names
        df = pd.DataFrame(obj)
    elif isinstance(obj, (tuple, list)):
        # tuple and list case
        if len(obj) == 0:
            return pd.DataFrame()
        firstrow = obj[0]
        if isinstance(firstrow, (tuple, list)):
            # multiple columns
            colnames = ["c%d" % i for i in range(len(firstrow))]
            df = pd.DataFrame(obj, columns=colnames)
        else:
            # single column
            df = pd.DataFrame(obj, columns=["c0"])
    else:
        raise Exception("%s is not a DataFrame, tuple, list, nor dictionary" % name)
    for col in df:
        if df[col].dtype == np.int64:
            df[col] = df[col].astype(np.float64)
    return df

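# Hypothetical usage sketch for _ensure_data_frame (inputs are made-up examples):
# plain Python containers are normalised into DataFrames and integer columns are
# promoted to floats.
_ensure_data_frame([(1, 2), (3, 4)], "pairs")            # two columns named c0 and c1
_ensure_data_frame([10, 20, 30], "values")               # a single column named c0
_ensure_data_frame({"a": [1, 2], "b": [3, 4]}, "table")  # dict keys become column names
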
def getDividends(stock, country):
    try:
        dividends = investpy.stocks.get_stock_dividends(stock, country)
    except Exception:
        raise Exception(f"Couldn't get dividends for {stock} in {country}")
    return dividends

def plant_biomass(g, duration=12, sky=None, df=None, ghi=1, Pm=1):
    if df is None:
        cs, raw, agg = illuminate(g, sky=sky)
        df = pandas.DataFrame(agg)
    ei = ghi * df.Ei
    p = light_response(ei, Pm=Pm)
    return (df.area * p * duration).sum()

def match_affil(affiliation: str, k: int = 3):
    """
    Match an affiliation string to the GRID dataset.

    Returns the top-k candidate institutions as a list of records (dicts).
    """
    parsed_affil = parse_affil(affiliation)
    df = pd.DataFrame([parsed_affil])
    indexer = recordlinkage.Index()
    indexer.add(Full())
    candidate_links = indexer.index(df, grid_df)

    # recordlinkage comparer
    compare = recordlinkage.Compare()
    compare.exact("institution", "institution")
    compare.string("location", "location", method="jarowinkler")
    compare.string("country", "country", method="jarowinkler")
    features_df = compare.compute(candidate_links, df, grid_df)
    features_df["score"] = np.average(features_df, axis=1, weights=[0.6, 0.2, 0.2])

    topk_df = features_df[["score"]].reset_index().sort_values(
        "score", ascending=False).head(k)
    topk_df = topk_df.merge(grid_df.reset_index(), left_on="level_1", right_on="index").\
        drop(labels=["level_0", "level_1", "location"], axis=1)
    return topk_df.to_dict(orient="records")

def preprocess_text_input(data):
    # text input
    # "sex": sSex,
    # "embarked": "S",
    # "class": sClass,
    # "who": sWho,
    # "adult_male": sAdultMale,
    # "deck": null,
    # "embark_town": "Southhampton",
    # "alone": sAlone,
    # "pclass": iPClass,
    # "age": iAge,
    # "sibsp": iSibsp,
    # "parch": iParch,
    # "fare": 1
    #
    # convert to integer columns: pclass sex age sibsp fare embarked who adult_male alone
    df = pd.DataFrame(data, columns=['sex', 'embarked', 'class', 'who', 'adult_male',
                                     'deck', 'embark_town', 'alone', 'pclass', 'age',
                                     'sibsp', 'parch', 'fare'])
    # remove unused columns
    df = df.drop(['class', 'deck', 'embark_town', 'parch'], axis=1)

    # convert non-numeric data
    genders = {"male": 0, "female": 1}
    df['sex'] = df['sex'].map(genders)
    ports = {"S": 0, "C": 1, "Q": 2}
    df['embarked'] = df['embarked'].map(ports)
    # 'alive' is not part of the input columns above; only map it if it is present
    if 'alive' in df:
        alive = {"yes": 1, "no": 0}
        df['alive'] = df['alive'].map(alive)
    who = {"man": 1, "woman": 2, "child": 0}
    df['who'] = df['who'].map(who)
    boool = {True: 1, False: 0}
    df['alone'] = df['alone'].map(boool)
    df['adult_male'] = df['adult_male'].map(boool)

    # bin ages
    df['age'] = df['age'].astype(int)
    df.loc[df['age'] <= 11, 'age'] = 0
    df.loc[(df['age'] > 11) & (df['age'] <= 18), 'age'] = 1
    df.loc[(df['age'] > 18) & (df['age'] <= 22), 'age'] = 2
    df.loc[(df['age'] > 22) & (df['age'] <= 27), 'age'] = 3
    df.loc[(df['age'] > 27) & (df['age'] <= 33), 'age'] = 4
    df.loc[(df['age'] > 33) & (df['age'] <= 40), 'age'] = 5
    df.loc[(df['age'] > 40) & (df['age'] <= 66), 'age'] = 6
    df.loc[df['age'] > 66, 'age'] = 6

    # bin fares
    df['fare'] = df['fare'].astype(int)
    df.loc[df['fare'] <= 7.91, 'fare'] = 0
    df.loc[(df['fare'] > 7.91) & (df['fare'] <= 14.454), 'fare'] = 1
    df.loc[(df['fare'] > 14.454) & (df['fare'] <= 31), 'fare'] = 2
    df.loc[(df['fare'] > 31) & (df['fare'] <= 99), 'fare'] = 3
    df.loc[(df['fare'] > 99) & (df['fare'] <= 250), 'fare'] = 4
    df.loc[df['fare'] > 250, 'fare'] = 5

    # reorder columns to match NN input
    df = df[['pclass', 'sex', 'age', 'sibsp', 'fare', 'embarked', 'who',
             'adult_male', 'alone']]
    return df

def reduceOversample(df_train):
    # Downsample over-represented PAX classes: any class with more than 1000 rows
    # is reduced by 500 samples, then the per-class subsets are recombined.
    new_df_train = pd.DataFrame()
    for pax in range(0, 8):
        df_pax = df_train.loc[df_train['PAX'] == pax]
        if len(df_pax) > 1000:
            df_pax = df_pax.sample(len(df_pax) - 500)
        new_df_train = pd.concat([new_df_train, df_pax], ignore_index=True)
    return new_df_train

def __init__(self, train=None, test=None, features=None):
    if features is None:
        self.features = []
    else:
        self.features = features
    if train is None:
        self.train = pandas.DataFrame()
    else:
        self.train = train[features]
    if test is None:
        self.test = pandas.DataFrame()
    else:
        self.test = test[features]
    self.y = train['Survived']

def create_sims_table():
    sims_draw = sim_london(True)
    sims_play = sim_london(False)
    sims_tot = sims_draw + sims_play
    dfs = pd.DataFrame(sims_tot, columns=['handsize', 'play_draw', 'pred'])
    dfs.to_csv('london_sims_RandomForest.csv', index=False)

def _load_northwest_territories(start_date=datetime(2020, 1, 1),
                                end_date=datetime.today(), verbose=True):
    """
    Parameters:
    - `start_date` datetime object, the date of the earliest news release to be
      retrieved. By default, only releases published on or after Jan 1 2020 are retrieved
    - `end_date` datetime object, the date of the latest news release to be retrieved.
      By default, this is set to the current date
    - `verbose` boolean, whether or not the function should print updates

    Returns: a DataFrame containing news releases from the government of the
    Northwest Territories.
    """
    region = 'Northwest Territories'
    sub_region = ''
    url_base = "https://www.gov.nt.ca/"
    page = 0
    rows = []
    while True:
        url = url_base + "en/newsroom?page=" + str(page)
        if verbose:
            print("Searching page {}".format(page + 1))
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # regex accounts for inconsistent `div` class names
        ar_boxes = soup.find_all('div', class_=re.compile('views-row'))
        if not ar_boxes:
            return pd.DataFrame(rows, columns=_columns)
        for box in ar_boxes:
            boxed_soup = BeautifulSoup(str(box), 'html.parser')  # parse each div
            date_str = boxed_soup.find('span').text
            ar_date = datetime.strptime(date_str, "%B %d, %Y")
            if ar_date < start_date:
                if verbose:
                    print("Stopping search at date {}".format(ar_date))
                return pd.DataFrame(rows, columns=_columns)
            if ar_date > end_date:
                # Articles that follow the `end_date` parameter are ignored
                continue
            title_a = boxed_soup.find('a')
            title = title_a.text
            relative_link = title_a['href']
            link = url_base + relative_link
            ar_res = requests.get(link)
            ar_soup = BeautifulSoup(ar_res.content, 'html.parser')
            body = ar_soup.find('div', class_="field-item even").text
            row = [ar_date, _country, region, sub_region, link, _src_cat, title, body]
            rows.append(row)
        page += 1

def test_shape(self):
    df1 = pd.DataFrame(np.random.rand(1000, 200))
    df2 = pd.DataFrame(np.random.rand(1000, 2000))
    df3 = pd.DataFrame(np.random.rand(1000, 4))
    df4 = pd.DataFrame(np.random.rand(1000, 230))
    s1 = 1
    s2 = 100
    s3 = 2
    s4 = 23
    d1 = DatasetAutoencoderLSTM(df=df1, subsamble_coef=s1)
    d2 = DatasetAutoencoderLSTM(df=df2, subsamble_coef=s2)
    d3 = DatasetAutoencoderLSTM(df=df3, subsamble_coef=s3)
    d4 = DatasetAutoencoderLSTM(df=df4, subsamble_coef=s4)
    for i in range(1000):
        assert d1[i].shape[0] == 1 and d1[i].shape[1] == df1.shape[1] // s1
        assert d2[i].shape[0] == 1 and d2[i].shape[1] == df2.shape[1] // s2
        assert d3[i].shape[0] == 1 and d3[i].shape[1] == df3.shape[1] // s3
        assert d4[i].shape[0] == 1 and d4[i].shape[1] == df4.shape[1] // s4
