def _save_obj_ver_edge(obj_ver_key, obj_key):
    """Save the ws_version_of edge."""
    # The _from is a version of the _to
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_object/' + obj_key
    logger.debug(f'Saving ws_version_of edge from {from_id} to {to_id}')
    save('ws_version_of', [{'_from': from_id, '_to': to_id}])
def craw_stat(self):
    for c in self.sources:
        logger.debug(c["city"] + ": " + c["home"])
        html = download(c["home"], charset="utf-8")
        soup = BeautifulSoup(html, "html.parser")
        deal_div = soup.find("div", {"class": "deal-price"})
        deal_price = 0.0
        list_price = 0.0
        rate = 0.0
        vol = 0
        if deal_div and deal_div.find("label", {"class": "dataAuto"}):
            deal_price = deal_div.find("label", {"class": "dataAuto"}).text.strip().encode("utf-8")
        list_div = soup.find("div", {"class": "listing-price"})
        if list_div and list_div.find("label", {"class": "dataAuto"}):
            list_price = list_div.find("label", {"class": "dataAuto"}).text.strip().encode("utf-8")
        ul = soup.find("div", {"class": "main"}).findAll("li")
        for li in ul:
            if li.find("p").text:
                if re.findall("客房比", li.find("p").text.encode("utf-8")):
                    rate = li.find("label").text.strip().encode("utf-8")
                if re.findall("成交", li.find("p").text.encode("utf-8")):
                    vol = li.find("label").text.strip().encode("utf-8")
        item = {"price": deal_price, "rate": rate, "city": c["city"], "vol": vol, "date": self.date}
        logger.debug(item)
        if float(deal_price) <= 0.0 or float(rate) <= 0.0:
            continue
        if not self.dao.has_stat(c["city"], self.date):
            self.dao.insert_stat(item)
def _save_referral_edge(obj_ver_key, obj):
    """Save the ws_refers_to edge."""
    from_id = 'ws_object_version/' + obj_ver_key
    for upa in obj.get('refs', []):
        to_id = 'ws_object_version/' + upa.replace('/', ':')
        logger.debug(f'Saving ws_refers_to edge from {from_id} to {to_id}')
        save('ws_refers_to', [{'_from': from_id, '_to': to_id}])
def collate(batches):
    logger.debug('Batches: {}'.format(batches))
    batch_list = []
    for batch in batches:
        pair = np.array(batch[0])
        negs = np.array(batch[1])
        negs = np.vstack((pair[0].repeat(negs.shape[0]), negs)).T
        # Create arrays
        pair_arr = np.ones((pair.shape[0]), dtype=int)  # This sets label to 1
        # TODO: Leave label as continuous
        pair_arr[:-1] = pair[:-1]
        negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int)
        negs_arr[:, :-1] = negs
        all_arr = np.vstack((pair_arr, negs_arr))
        batch_list.append(all_arr)
    batch_array = np.vstack(batch_list)
    # Return item1, item2, label
    return (torch.LongTensor(batch_array[:, 0]),
            torch.LongTensor(batch_array[:, 1]),
            torch.FloatTensor(batch_array[:, 2]))
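# Hedged usage sketch (not from the source): `collate` above is shaped like a
# DataLoader collate_fn whose samples are (pair, negatives) tuples. The sample
# values below are illustrative assumptions; torch is assumed to be imported as
# in the surrounding code.
def _example_collate_call():
    import numpy as np
    # Two samples: each pair is (item1, item2, label-slot); negatives are item ids.
    samples = [(np.array([3, 7, 1]), np.array([11, 12])),
               (np.array([4, 9, 1]), np.array([13, 14]))]
    item1, item2, label = collate(samples)
    return item1, item2, label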
def _save_owner_edge(obj_ver_key, info_tup):
    """Save the ws_owner_of edge."""
    username = info_tup[5]
    from_id = 'ws_user/' + sanitize_arangodb_key(username)
    to_id = 'ws_object_version/' + obj_ver_key
    logger.debug(f'Saving ws_owner_of edge from {from_id} to {to_id}')
    save('ws_owner_of', [{'_from': from_id, '_to': to_id}])
def get_history(self):
    today = datetime.now().strftime('%y-%m-%d')
    has = self.dao.has_item("bj", today)
    if has[0]:
        logger.debug("found history.")
        return
    html = download("http://www.fangchanzixun.com/volume", charset="utf-8")
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={'class': 'table'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        if len(cols) != 5:
            continue
        col_date = cols[0]
        total = cols[1].encode("utf-8")
        zhuzai = cols[3].encode("utf-8")
        date = datetime.strptime(col_date, "%Y-%m-%d").date()
        info = {"city": "北京", "district": "bj", "total": total, "zhuzai": zhuzai, "date": date}
        logger.debug(info)
        has = self.dao.has_item("bj", date)
        if not has[0]:
            self.dao.insert_item(info)
def fit(self, features: np.array, labels: np.array) -> None:
    """Builds a random forest of decision trees.

    Args:
        features: Numpy array of features with shape (row x col)
        labels: Numpy array of labels

    Returns:
        None
    """
    n_rows, n_cols = features.shape
    for i in range(self.num_trees):
        logger.debug('{} training tree: {}'.format(self.__class__.__name__, i + 1))
        shuffled_row_idx = np.random.permutation(n_rows)
        shuffled_col_idx = np.random.permutation(n_cols)
        row_idx = np.random.choice(shuffled_row_idx,
                                   int(self.row_subsampling * n_rows),
                                   replace=False)
        col_idx = np.random.choice(shuffled_col_idx,
                                   int(self.col_subsampling * n_cols),
                                   replace=False)
        self.col_idxs.append(col_idx)
        features_subsampled = features[np.ix_(row_idx, col_idx)]
        labels_subsampled = labels[row_idx]
        self.trees.append(self.__build_tree__(features_subsampled, labels_subsampled))
def _save_ws_contains_edge(obj_key, info_tup):
    """Save the ws_workspace_contains_obj edge."""
    from_id = 'ws_workspace/' + str(info_tup[6])
    to_id = 'ws_object/' + obj_key
    logger.debug(f'Saving ws_workspace_contains_obj edge from {from_id} to {to_id}')
    save('ws_workspace_contains_obj', {'_from': from_id, '_to': to_id})
def clean_dfs(self) -> [pd.DataFrame]:
    training_df = self.original_training_df
    testing_df = self.original_testing_df

    # One-hot encode columns whose representation is a dict
    columns_one_hot_encoding_dict = {'belongs_to_collection': ['id', 'collection'],
                                     'genres': ['id', 'genre'],
                                     'production_countries': ['iso_3166_1', 'prod_count'],
                                     'spoken_languages': ['iso_639_1', 'spoken_lang']}
    for col, list_specific_col in columns_one_hot_encoding_dict.items():
        logger.debug(f'{col} will be one hot encoded as dict')
        training_df, testing_df = self.__one_hot_encode_representing_as_dict(
            training_df, testing_df, col, list_specific_col)

    # Special one-hot encoding for columns that hold a multitude of names
    columns_one_hot_encoding_dict = {'production_companies': ['id', 'prod_comp', True, 15],
                                     'Keywords': ['id', 'k', False, 25]}
    for col, list_specific_col in columns_one_hot_encoding_dict.items():
        logger.debug(f'{col} will be one hot encoded')
        training_df, testing_df = self.__one_hot_encode_famous_names(
            training_df, testing_df, col, list_specific_col)

    # One-hot encode columns with information about the movies' characters
    logger.debug('crew will be one hot encoded as item')
    training_df, testing_df = self.__one_hot_encode_characters(training_df, testing_df, 'crew')

    # One-hot encode columns whose rows contain a single value
    logger.debug('original_language will be one hot encoded as item')
    training_df, testing_df = self.__one_hot_encode_representing_as_item(
        training_df, testing_df, 'original_language')

    # Extract date information
    logger.debug('date information will be extracted')
    training_df, testing_df = (self.__extract_date_information(training_df, 'training'),
                               self.__extract_date_information(testing_df, 'testing'))

    return training_df.fillna(0), testing_df.fillna(0)
def _save_inst_of_type_edge(obj_ver_key, info_tup):
    """Save the ws_obj_instance_of_type edge."""
    from_id = 'ws_object_version/' + obj_ver_key
    obj_type = info_tup[2]
    to_id = 'ws_type_version/' + obj_type
    logger.debug(f'Saving ws_obj_instance_of_type edge from {from_id} to {to_id}')
    save('ws_obj_instance_of_type', [{'_from': from_id, '_to': to_id}])
def search_workspace(params, meta):
    start = time.time()
    params = convert_params.search_workspace(params, meta)
    result = search(params, meta)
    result = convert_result.search_workspace(result, params, meta)
    logger.debug(f"Finished 'search_workspace' method in {time.time() - start}s")
    return result
def search_types(params, meta):
    if isinstance(params, list) and len(params) == 1:
        params = params[0]
    start = time.time()
    query = convert_params.search_types(params)
    search_result = trap_error(lambda: search(query, meta))
    result = convert_result.search_types(search_result)
    logger.debug(f'Finished search_types in {time.time() - start}s')
    return [result]
def main(obj_data, ws_info, obj_data_v1, conf):
    """
    Index a narrative object on save.
    We index the latest narratives for:
        - title and author
        - cell content
        - object names and types
        - created and updated dates
        - total number of cells
    """
    # Reference for the workspace info type:
    #   https://kbase.us/services/ws/docs/Workspace.html#typedefWorkspace.workspace_info
    # Reference for the object info type:
    #   https://kbase.us/services/ws/docs/Workspace.html#typedefWorkspace.object_info
    obj_info = obj_data['info']
    obj_id = obj_info[0]
    obj_metadata = obj_info[-1]
    if not obj_metadata:
        raise RuntimeError(
            f"Cannot index narrative: no metadata for the narrative object. Obj info: {obj_info}")
    [ws_id, _, owner, moddate, _, _, _, _, ws_metadata] = ws_info
    if not ws_metadata:
        raise RuntimeError(
            f"Cannot index narrative: no metadata for the workspace. WS info: {ws_info}")
    if ws_metadata.get('is_temporary') == 'true':
        logger.debug("Skipping narrative indexing because it is temporary")
        return
    is_narratorial = _narrative_is_narratorial(ws_metadata)
    narrative_title = obj_metadata.get('name')
    creator = obj_data['creator']
    # Get all the types and names of objects in the narrative's workspace.
    narrative_data_objects = _fetch_objects_in_workspace(ws_id)
    # Extract all the data we want to index from the notebook cells
    raw_cells = obj_data['data'].get('cells', [])
    index_cells = _extract_cells(raw_cells, ws_id)
    result = {
        '_action': 'index',
        'doc': {
            'narrative_title': narrative_title,
            'is_narratorial': is_narratorial,
            'data_objects': narrative_data_objects,
            'owner': owner,
            'modified_at': ts_to_epoch(moddate),
            'cells': index_cells,
            'creator': creator,
            'total_cells': len(raw_cells),
            'static_narrative_saved': ws_metadata.get('static_narrative_saved'),
            'static_narrative_ref': ws_metadata.get('static_narrative'),
        },
        'index': conf['index_name'],
        'id': f"{conf['namespace']}::{ws_id}:{obj_id}",
    }
    yield result
def get_objects(params, meta):
    # KBase convention is to wrap params in an array
    if isinstance(params, list) and len(params) == 1:
        params = params[0]
    start = time.time()
    query = convert_params.get_objects(params)
    search_result = trap_error(lambda: search(query, meta))
    result = convert_result.get_objects(params, search_result, meta)
    logger.debug(f'Finished get_objects in {time.time() - start}s')
    return [result]
def _save_created_with_module_edge(obj_ver_key, prov):
    """Save the ws_obj_created_with_module edge."""
    if not prov or not prov[0] or not prov[0].get('service'):
        return
    module_key = get_module_key_from_prov(prov)
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_module_version/' + module_key
    logger.debug(f'Saving ws_obj_created_with_module edge from {from_id} to {to_id}')
    save('ws_obj_created_with_module', [{'_from': from_id, '_to': to_id}])
def _save_prov_desc_edge(obj_ver_key, obj):
    """Save the ws_prov_descendant_of edge."""
    prov = obj.get('provenance')
    if not prov:
        return
    input_objs = prov[0].get('input_ws_objects', [])
    from_id = 'ws_object_version/' + obj_ver_key
    for upa in input_objs:
        to_id = 'ws_object_version/' + upa.replace('/', ':')
        logger.debug(f'Saving ws_prov_descendant_of edge from {from_id} to {to_id}')
        save('ws_prov_descendant_of', [{'_from': from_id, '_to': to_id}])
def _save_copy_edge(obj_ver_key, obj):
    """Save ws_copied_from document."""
    copy_ref = obj.get('copied')
    if not copy_ref:
        logger.debug('Not a copied object.')
        return
    copied_key = copy_ref.replace('/', ':')
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_object_version/' + copied_key
    logger.debug(f'Saving ws_copied_from edge from {from_id} to {to_id}')
    # The _from object is a copy of the _to object
    save('ws_copied_from', [{'_from': from_id, '_to': to_id}])
def _save_ws_object(obj_info, ws_info):
    """Save the ws_object vertex given object and workspace info tuples."""
    wsid = obj_info[6]
    objid = obj_info[0]
    key = f"{wsid}:{objid}"
    logger.debug(f'Saving ws_object with key {key}')
    save('ws_object', [{
        '_key': key,
        'workspace_id': wsid,
        'object_id': objid,
        'is_public': ws_info[6] == 'r',
        'deleted': False
    }])
def _reindex_narrative(obj, ws_info: dict) -> None:
    obj_type = obj['info'][2]
    if 'Narrative' in obj_type:
        # The saved object is itself a narrative; skip.
        return
    meta = ws_info[-1]
    if not isinstance(meta, dict) or meta.get('narrative') != '1':
        logger.debug("This workspace is not a narrative")
        return
    wsid = ws_info[0]
    narr_info = config()['ws_client'].find_narrative(wsid, admin=True)
    objid = narr_info[0]
    # Publish an event to reindex the narrative
    ev = {'evtype': 'REINDEX', 'wsid': wsid, 'objid': objid}
    kafka.produce(ev, callback=_delivery_report)
def _wait_for_service(url, name, start_time, timeout, params=None):
    while True:
        try:
            logger.info(f'Waiting for {name} service...')
            requests.get(url, params=params).raise_for_status()
            logger.info(f'{name} is up!')
            break
        except Exception as err:
            logger.debug(f'Unable to connect to {name}: {err}')
            time.sleep(5)
            if (int(time.time()) - start_time) > timeout:
                raise RuntimeError(
                    f"Failed to connect to all services in {timeout}s. Timed out on {name}.")
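# Hedged usage sketch (not from the source): how _wait_for_service above might be
# called to block until a dependency answers with a 2xx status or the timeout
# elapses. The URL, service name, and timeout are illustrative assumptions, not
# project configuration.
def _example_wait_for_elasticsearch():
    import time
    start = int(time.time())
    # Polls every 5 seconds; raises RuntimeError after 180s of failed attempts.
    _wait_for_service('http://localhost:9200', 'Elasticsearch', start, timeout=180)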
def get_embedding(self, nodes):
    embs = []
    emb_weight = self.emb_weights(nodes[:, 0])
    emb_weight_norm = self.emb_weights_softmax(emb_weight)
    for i in range(nodes.shape[1]):
        logger.debug('center i: {}'.format(i))
        embs.append(self.center_embeddings[i](nodes[:, i]))
    emb_stack = torch.stack(embs)
    embs_weighted = emb_stack * emb_weight_norm.T.unsqueeze(2).expand_as(emb_stack)
    emb = torch.sum(embs_weighted, axis=0)
    return emb
def _save_created_with_method_edge(obj_ver_key, prov):
    """Save the ws_obj_created_with_method edge."""
    if not prov or not prov[0] or not prov[0].get('service'):
        return
    method_key = get_method_key_from_prov(prov)
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_method_version/' + method_key
    params = prov[0].get('method_params')
    logger.debug(f'Saving ws_obj_created_with_method edge from {from_id} to {to_id}')
    save('ws_obj_created_with_method', [{
        '_from': from_id,
        '_to': to_id,
        'method_params': params
    }])
def __build_tree__(self, features: np.array, labels: np.array, depth: int = 0) -> Node:
    """Build decision tree that learns split functions.

    Args:
        features: Numpy array of features with shape (row x col)
        labels: Numpy array of labels
        depth: Current depth of the tree

    Returns:
        Decision tree root node
    """
    n_percentiles = max(2, int(len(labels) / 10))
    if self.check_stopping_condition(labels, depth):
        prob = self.get_probability(labels)
        return Node(None, None, None, prob)  # type: ignore
    else:
        logger.debug('Features: {}'.format(features))
        logger.debug('Labels: {}'.format(labels))
        splits = np.percentile(features, self.get_percentile_list(n_percentiles), axis=0)

        best_split = None
        best_split_feat_idx = None
        best_split_gini_gain = float('-inf')

        for feat_idx, feat_col in enumerate(features.T):  # Transpose to loop through columns
            logger.debug('Col index: {}'.format(feat_idx))
            for split in splits[:, feat_idx]:
                labels_left = labels[np.where(feat_col < split)]
                labels_right = labels[np.where(feat_col >= split)]
                gain = gini_gain(labels, [labels_left, labels_right])
                if gain > best_split_gini_gain:
                    best_split_gini_gain, best_split, best_split_feat_idx = gain, split, feat_idx

        split_left = np.where(features[:, best_split_feat_idx] < best_split)
        split_right = np.where(features[:, best_split_feat_idx] >= best_split)
        logger.debug('Split left: {} | right: {}'.format(split_left, split_right))
        features_left, features_right = features[split_left], features[split_right]
        labels_left, labels_right = labels[split_left], labels[split_right]

        # If either node is empty after splitting
        if len(labels_left) == 0:
            node_left = Node(None, None, None, self.get_probability(labels))  # type: ignore
            node_right = self.__build_tree__(features_right, labels_right, depth + 1)
        elif len(labels_right) == 0:  # pragma: no cover
            node_left = self.__build_tree__(features_left, labels_left, depth + 1)
            node_right = Node(None, None, None, self.get_probability(labels))  # type: ignore
        else:
            node_left = self.__build_tree__(features_left, labels_left, depth + 1)
            node_right = self.__build_tree__(features_right, labels_right, depth + 1)

        return Node(node_left, node_right,
                    lambda feature: feature[best_split_feat_idx] < best_split)
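# Hedged sketch (not from the source): __build_tree__ above relies on a gini_gain
# helper that is not shown here. A minimal implementation, assuming it returns the
# parent's Gini impurity minus the size-weighted impurity of the child partitions;
# the names and signature below are assumptions chosen to match the call site.
import numpy as np

def _gini_impurity_sketch(labels: np.array) -> float:
    """Gini impurity of a label array (0.0 for an empty or pure node)."""
    if len(labels) == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / len(labels)
    return 1.0 - float(np.sum(probs ** 2))

def gini_gain_sketch(labels: np.array, partitions: list) -> float:
    """Impurity reduction achieved by splitting `labels` into `partitions`."""
    weighted_children = sum(
        (len(part) / len(labels)) * _gini_impurity_sketch(part) for part in partitions)
    return _gini_impurity_sketch(labels) - weighted_children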
def predict(self, features: np.array) -> np.array:
    """Returns labels given a set of features.

    Args:
        features: Numpy array of features with shape (row x col)

    Returns:
        Predicted labels
    """
    labels_list = []
    for tree, col_idx in zip(self.trees, self.col_idxs):
        logger.debug('Col index: {}'.format(col_idx))
        features_subsampled = features[:, col_idx]
        labels_list.append([tree.split(row) for row in features_subsampled])
    labels = np.array(labels_list).mean(axis=0)
    return labels
def craw_open(self):
    for c in self.sources:
        logger.debug(c["city"] + ": " + c["url"])
        arr = get_json(c["url"])
        if arr and arr["data"]:
            for item in arr["data"]:
                if not item["avg_unit_price"]:
                    item["avg_unit_price"] = -1.0
                if not item["name"] or not item["house_count"]:
                    logger.debug(item)
                    continue
                row = {"city": c["city"],
                       "district": item["name"].encode("utf-8"),
                       "total": item["house_count"],
                       "price": item["avg_unit_price"],
                       "date": self.date}
                old = self.dao.get_item(row["city"], row["district"], row["date"])
                if not old:
                    self.dao.insert_item(row)
                elif old[3] < row["total"]:
                    self.dao.update_item(row["city"], row["district"], row["total"],
                                         row["price"], row["date"])
def _save_obj_version(key, ver, info_tup, ws_info):
    objid = info_tup[0]
    wsid = info_tup[6]
    obj_name = info_tup[1]
    hsh = info_tup[8]
    size = info_tup[9]
    epoch = ts_to_epoch(info_tup[3])
    logger.debug(f"Saving ws_object_version with key {key}")
    save('ws_object_version', [{
        '_key': key,
        'workspace_id': wsid,
        'object_id': objid,
        'version': ver,
        'name': obj_name,
        'hash': hsh,
        'size': size,
        'epoch': epoch,
        'deleted': False,
        'is_public': ws_info[6] == 'r',
    }])
def _save_workspace(ws_info):
    """Save the ws_workspace vertex given a workspace info tuple."""
    wsid = ws_info[0]
    # Workspace info tuple is as follows:
    #    0    1     2      3        4         5          6          7        8
    #   [id, name, owner, moddate, maxobjid, user_perms, globalread, lockstat, metadata]
    metadata = ws_info[-1]
    logger.debug(f'Saving workspace vertex {wsid}')
    save('ws_workspace', {
        '_key': str(wsid),
        'narr_name': metadata.get('narrative_nice_name', ''),
        'owner': ws_info[2],
        'max_obj_id': ws_info[4],
        'lock_status': ws_info[7],
        'name': ws_info[1],
        'mod_epoch': ts_to_epoch(ws_info[3]),
        'is_public': ws_info[6] == 'r',
        'is_deleted': False,
        'metadata': metadata
    })
def get_wangqian(self):
    html = download(self.url, charset="utf-8")
    soup = BeautifulSoup(html, "html.parser")
    total_div = soup.find("span", {"id": "ess_ctr5112_FDCJY_SignOnlineStatistics_totalCount4"})
    zhuzai_div = soup.find("span", {"id": "ess_ctr5112_FDCJY_SignOnlineStatistics_residenceCount4"})
    date_div = soup.find("span", {"id": "ess_ctr5115_FDCJY_HouseTransactionStatist_timeMark4"})
    if total_div and zhuzai_div:
        total = total_div.text.encode("utf-8")
        zhuzai = zhuzai_div.text.encode("utf-8")
        date = date_div.text.strip().encode("utf-8")
        tmp = date.split("-")
        if len(tmp) == 3:
            date = datetime.today().replace(year=int(tmp[0]), month=int(tmp[1]), day=int(tmp[2]))
        else:
            logger.error("beijing gov get_wangqian: unexpected date format.")
        row = {"city": "北京", "district": "bj", "zhuzai": zhuzai, "total": total, "date": date}
        has = self.dao.has_item("bj", date)
        if not has[0]:
            logger.debug(row)
            self.dao.insert_item(row)
def _save_type_vertices(obj_info):
    """Save associated vertices for an object type."""
    obj_type = sanitize_arangodb_key(obj_info[2])
    (type_module, type_name, type_ver) = get_type_pieces(obj_type)
    (maj_ver, min_ver) = [int(v) for v in type_ver.split('.')]
    logger.debug(f'Saving ws_type_version, ws_type, and ws_type_module for {obj_type}')
    save('ws_type_version', {
        '_key': obj_type,
        'type_name': type_name,
        'module_name': type_module,
        'maj_ver': maj_ver,
        'min_ver': min_ver
    })
    save('ws_type', {
        '_key': f'{type_module}.{type_name}',
        'type_name': type_name,
        'module_name': type_module
    })
    save('ws_type_module', {'_key': type_module})
def forward(self, centers, contexts, neg_contexts):
    """
    Args:
        centers: List of center words
        contexts: List of context words
        neg_contexts: List of lists of negative context words

    Returns:
        Mean skip-gram negative-sampling loss over the batch
    """
    # Calculate positive score
    emb_centers = []
    for i in range(centers.shape[1]):
        logger.debug('center i: {}'.format(i))
        emb_centers.append(self.center_embeddings[i](centers[:, i]))
    emb_center = torch.mean(torch.stack(emb_centers), axis=0)

    emb_contexts = []
    for i in range(contexts.shape[1]):
        logger.debug('context i: {}'.format(i))
        emb_contexts.append(self.context_embeddings[i](contexts[:, i]))
    emb_context = torch.mean(torch.stack(emb_contexts), axis=0)

    emb_neg_contexts = []
    neg_contexts = neg_contexts.view(-1, len(self.context_embeddings))
    for i in range(neg_contexts.shape[1]):
        logger.debug('neg context i: {}, {}'.format(i, neg_contexts[:, i]))
        emb_neg_contexts.append(self.context_embeddings[i](neg_contexts[:, i]))
    emb_neg_context = torch.mean(torch.stack(emb_neg_contexts), axis=0)

    # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch
    score = torch.mul(emb_center, emb_context)  # Get dot product (part 1)
    score = torch.sum(score, dim=1)  # Get dot product (part 2)
    score = torch.clamp(score, max=10, min=-10)
    score = -F.logsigmoid(score)  # Get score for the positive pairs

    # Calculate negative score (for negative samples)
    neg_score = torch.bmm(
        emb_neg_context.view(emb_center.shape[0], -1, emb_center.shape[1]),
        emb_center.unsqueeze(2)).squeeze()  # Get dot product
    neg_score = torch.clamp(neg_score, max=10, min=-10)
    neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

    # Return combined score
    return torch.mean(score + neg_score)
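# Hedged usage sketch (not from the source): expected tensor shapes for the forward
# pass above, inferred from its indexing. Each row carries one id per embedding
# facet; negatives are reshaped internally to (-1, n_facets), so the total negative
# count must equal batch_size * n_neg. `model` is a hypothetical instance whose
# center_embeddings/context_embeddings hold n_facets embedding tables.
def _example_sgns_step(model, n_facets=2, vocab_size=1000, batch_size=32, n_neg=5):
    import torch
    centers = torch.randint(0, vocab_size, (batch_size, n_facets))
    contexts = torch.randint(0, vocab_size, (batch_size, n_facets))
    neg_contexts = torch.randint(0, vocab_size, (batch_size, n_neg, n_facets))
    loss = model(centers, contexts, neg_contexts)  # scalar SGNS loss
    loss.backward()
    return loss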
def index_obj(obj_data, ws_info, msg_data):
    """
    For a newly created object, generate the index document for it and push to
    the elasticsearch topic on Kafka.
    Args:
        obj_data - in-memory parsed data from the workspace object
        msg_data - json event data received from the kafka workspace events stream.
            Must have keys for `wsid` and `objid`
    """
    obj_type = obj_data['info'][2]
    (type_module, type_name, type_version) = ws_utils.get_type_pieces(obj_type)
    if (type_module + '.' + type_name) in config()['global']['ws_type_blacklist']:
        # Blacklisted type, so we don't index it
        return
    # Check if this particular object has the tag "noindex"
    metadata = ws_info[-1]
    # If the workspace metadata's searchtags contain "noindex", skip it
    if metadata.get('searchtags'):
        if 'noindex' in metadata['searchtags']:
            return
    # Get the info of the first object version to get the creation date of the object.
    upa = get_upa_from_msg_data(msg_data)
    try:
        obj_data_v1 = config()['ws_client'].admin_req('getObjects', {
            'objects': [{'ref': upa + '/1'}],
            'no_data': 1
        })
    except WorkspaceResponseError as err:
        ws_utils.log_error(err)
        raise err
    obj_data_v1 = obj_data_v1['data'][0]
    # Dispatch to a specific type handler to produce the search document
    (indexer, conf) = _find_indexer(type_module, type_name, type_version)
    # All indexers are generators that yield document data for ES.
    defaults = indexer_utils.default_fields(obj_data, ws_info, obj_data_v1)
    for indexer_ret in indexer(obj_data, ws_info, obj_data_v1, conf):
        if indexer_ret['_action'] == 'index':
            allow_indices = config()['allow_indices']
            skip_indices = config()['skip_indices']
            if allow_indices is not None and indexer_ret.get('index') not in allow_indices:
                # This index name is not in the indexing whitelist from the config, so we skip it
                logger.debug(f"Index '{indexer_ret['index']}' is not in ALLOW_INDICES, skipping")
                continue
            if skip_indices is not None and indexer_ret.get('index') in skip_indices:
                # This index name is in the indexing blacklist in the config, so we skip it
                logger.debug(f"Index '{indexer_ret['index']}' is in SKIP_INDICES, skipping")
                continue
            if '_no_defaults' not in indexer_ret:
                # Inject all default fields into the index document.
                indexer_ret = indexer_utils.merge_default_fields(indexer_ret, defaults)
        yield indexer_ret
def search_objects(params, meta):
    start = time.time()
    result = search(params, meta)
    logger.debug(f"Finished 'search_objects' method in {time.time() - start}s")
    return result
def static_file(path):
    logger.debug(path)
    return message_app.send_static_file(path)
if __name__ == '__main__':
    # EXTRACTING
    label_column = 'revenue'
    original_training_df = tools.get_df_from_csv(constants.path_training_file)[
        [constants.label_column] + constants.columns_to_process]
    original_testing_df = tools.get_df_from_csv(constants.path_testing_file)[
        constants.columns_to_process]

    # TRANSFORMING
    pipeline_transforming = PipelineTransforming(original_training_df, original_testing_df)
    training_df, testing_df = pipeline_transforming.clean_dfs()
    logger.debug(f'Training shape: {training_df.shape}')
    logger.debug(f'Testing shape: {testing_df.shape}')
    logger.debug(f'Training columns: {training_df.columns}')

    # TRANSFORMING AND LOADING
    if mode == 'cross_validate_model':
        rf = RandomForestRegressor(**parameters_rf)  # model
        pipeline_loading.cross_validate_model(training_df, rf)
    elif mode == 'tune_hyperparameters_grid_search_cv':
        pipeline_loading.tune_hyperparameters_by_grid_search_cv(training_df)
    elif mode == 'tune_hyperparameters_randomized_search_cv':
        pipeline_loading.tune_hyperparameters_by_randomized_search_cv(training_df)
    elif mode == 'produce_submission_result':
        rf = RandomForestRegressor(**parameters_rf)  # model
        pipeline_loading.produce_submission_result(training_df, testing_df, rf)