def get_train_and_val(df, val_prop: float):
    """
    Splits into training and validation set, where validation set has 50% negative edges

    Args:
        df: DataFrame of edges ('product1', 'product2', 'weight')
        val_prop: Proportion of rows to hold out for validation

    Returns:
        Tuple of (train, val) DataFrames
    """
    n_val_samples = int(val_prop * df.shape[0])
    logger.info('Eventual required val samples (proportion: {}): {:,}'.format(val_prop, n_val_samples))

    train, val = train_val_split(df, n_val_samples)
    logger.info('Ratio of train to val: {:,}:{:,} ({:.2f})'.format(
        train.shape[0], val.shape[0], val.shape[0] / (train.shape[0] + val.shape[0])))

    neg_samples = create_negative_edges(df, val, n_val_samples)
    val = combine_val_and_neg_edges(val, neg_samples)

    train = train[['product1', 'product2', 'weight']].copy()
    return train, val
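train_val_split, create_negative_edges, and combine_val_and_neg_edges are defined elsewhere. As a rough, hypothetical sketch (the helper name, column names, and label convention are assumptions, not the original implementation), negative edges could be sampled as random product pairs that never occur in the positive edge list:

import numpy as np
import pandas as pd

def create_negative_edges_sketch(df: pd.DataFrame, n_samples: int, seed: int = 42) -> pd.DataFrame:
    """Sample product pairs that do not occur as positive edges (hypothetical sketch)."""
    rng = np.random.default_rng(seed)
    products = pd.unique(df[['product1', 'product2']].values.ravel())
    existing = set(zip(df['product1'], df['product2'])) | set(zip(df['product2'], df['product1']))

    neg = []
    while len(neg) < n_samples:
        p1, p2 = rng.choice(products, size=2, replace=False)
        if (p1, p2) not in existing:
            neg.append((p1, p2, 0))  # 0 marks a negative (non-existent) edge
    return pd.DataFrame(neg, columns=['product1', 'product2', 'label'])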
def remove_account():
    config_obj = read_config_file()
    if len(config_obj) == 0:
        logger.info("No config found.")
    else:
        available_servers = []
        for i in range(0, len(config_obj)):
            if 'Host' in config_obj[i]:
                available_servers.append({
                    'key': i,
                    'name': config_obj[i]['Host'],
                    'value': i,
                })
        host_ques = [{
            'type': 'list',
            'name': 'ssh-hosts',
            'message': 'Select host to remove:',
            'choices': available_servers + retrieve_questions('default_exits')
        }]
        selected_host = prompt(host_ques, style=style)
        default_menu_or_exit(selected_host['ssh-hosts'])
        confirm_removal = prompt(retrieve_questions('confirm_remove'), style=style)
        if confirm_removal['remove_confirmation']:
            config_obj.pop(selected_host['ssh-hosts'])
            write_config_to_file(config_obj)
            logger.info("Finished removing config.")
        else:
            init()
def run(self):
    try:
        logger.info("start wuhan vol crawler.")
        self.parse()
        logger.info("end wuhan vol crawler.")
    except Exception as e:
        logger.error("error: {}".format(e))
def warm_up(self, scaler, model, dataloader, cfg, prefix='train'):
    optimizer = build_optimizer(cfg, model)
    model.train()

    cur_iter = 0
    while cur_iter < cfg.WARMUP.ITERS:
        for i, sample in enumerate(dataloader):
            cur_iter += 1
            if cur_iter >= cfg.WARMUP.ITERS:
                break

            lr = get_warmup_lr(cur_iter, cfg)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            losses = self.run_step(scaler, model, sample, optimizer, None, None, prefix)

            if self.cfg.local_rank == 0:
                template = "[iter {}/{}, lr {}] Total train loss: {:.4f} \n" "{}"
                logger.info(
                    template.format(
                        cur_iter,
                        cfg.WARMUP.ITERS,
                        round(get_current_lr(optimizer), 6),
                        losses["loss"].item(),
                        "\n".join(
                            ["{}: {:.4f}".format(n, l.item()) for n, l in losses.items() if n != "loss"]),
                    )
                )
    del optimizer
def process_sample_set(obj_ver_key: str, obj_data: dict) -> None:
    """
    obj_ver_key: object version key
    obj_data: object data
    """
    # term_bank dictionary for storing arango document information about
    # already encountered terms. mapping of ontology_term -> arango "_id" field
    term_bank: Dict[str, str] = {}
    edges: List[dict] = []
    # iterate per sample
    for sample_info in obj_data['data']['samples']:
        # retrieve the sample metadata
        sample = _get_sample(sample_info)
        sample_version_uuid = _get_sample_version_uuid(sample)
        # term_bank object and edges list passed by reference
        # find terms we know are ontology terms
        _generate_link_information(sample, sample_version_uuid, edges, term_bank)

    # add creation timestamp for edge link (same for all edges).
    created_timestamp = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_timestamp

    logger.info(f'Writing {len(edges)} sample -> ontology edges '
                f'for samples in SampleSet {obj_ver_key}')
    # save links in a bulk operation
    _save(SAMPLE_ONTOLOGY_COLL, edges)
def get_edges(df):
    """
    Returns a dataframe of products and the weights of the edges between them.

    Args:
        df: DataFrame of product metadata with 'asin', 'related', and 'relationship' columns

    Returns:
        DataFrame with 'product1', 'product2', 'weight', and 'product_pair' columns
    """
    logger.info('Relationship distribution: \n{}'.format(df['relationship'].value_counts()))

    df = create_product_pair(df, col_list=['asin', 'related'])
    logger.info('Product pairs created')

    df = get_relationship_weights(df, relationship_weights)
    logger.info('Relationship weights updated')

    # Aggregate to remove duplicates
    logger.info('Original no. of edges: {:,}'.format(df.shape[0]))
    df = df.groupby('product_pair').agg({'weight': 'sum'}).reset_index()
    logger.info('Deduplicated no. of edges: {:,}'.format(df.shape[0]))

    # Save edge list
    df['product1'], df['product2'] = zip(*df['product_pair'].apply(split_product_pair))
    df = df[['product1', 'product2', 'weight', 'product_pair']]
    return df
def remove_account():
    config_obj = read_config_file()
    if len(config_obj) == 0:
        logger.info("No config found.")
    else:
        host_ques = [{
            'type': 'list',
            'name': 'git-hosts',
            'message': 'Select host to remove:',
            'choices': list(config_obj.keys()) + retrieve_questions('default_exits')
        }]
        selected_host = prompt(host_ques, style=style)
        default_menu_or_exit(selected_host['git-hosts'])
        confirm_removal = prompt(retrieve_questions('confirm_remove'), style=style)
        if confirm_removal['remove_confirmation']:
            del config_obj[selected_host['git-hosts']]
            write_config_to_file(config_obj)
            logger.info("Account removal successful.")
        else:
            init()
def delete_obj(msg):
    """
    Checks that the received object is deleted, since the workspace object delete
    event can refer to either delete or undelete state changes.
    """
    wsid = msg['wsid']
    objid = msg['objid']
    if not check_object_deleted(wsid, objid):
        # Object is not deleted
        logger.info(f'Object {objid} in workspace {wsid} is not deleted')
        return
    # Perform the deletion
    query = {
        'bool': {
            'must': [
                {'term': {'access_group': wsid}},
                {'term': {'obj_id': objid}}
            ]
        }
    }
    json_body = json.dumps({'query': query})
    # Perform the delete_by_query using the elasticsearch http api.
    resp = requests.post(
        f"{_ES_URL}/{_IDX}/_delete_by_query",
        params={'conflicts': 'proceed'},
        data=json_body,
        headers={"Content-Type": "application/json"}
    )
    if not resp.ok:
        # Unsuccessful request to elasticsearch.
        raise RuntimeError(f"Error deleting object on elasticsearch:\n{resp.text}")
    logger.info(f"Deleted elasticsearch documents associated with obj {wsid}/{objid}")
def _generate_features(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        logger.info(f'Genome {obj_ver_key} has no features')
        return

    verts = []
    edges = []
    wsid = obj_data['info'][6]
    objid = obj_data['info'][0]
    ver = obj_data['info'][4]

    # might want to do this in smaller batches if memory pressure is an issue
    for f in d['features']:
        feature_key = _clean_key(f'{obj_ver_key}_{f["id"]}')
        verts.append({
            '_key': feature_key,
            'workspace_id': wsid,
            'object_id': objid,
            'version': ver,
            'feature_id': f['id']
        })
        edges.append({
            '_key': f'{feature_key}',  # make a unique key so overwrites work
            '_from': f'{_OBJ_VER_COLL}/{obj_ver_key}',
            '_to': f'{_WS_FEAT_COLL}/{feature_key}'
        })

    logger.info(f'Saving {len(verts)} features for genome {obj_ver_key}')
    # hmm, this could leave the db in a corrupt state... options are 1) rollback 2) retry 3) leave
    # rollback is kind of impossible as an error here implies the re api isn't reachable
    # retry is doable, but should probably be implemented much higher in the stack
    # So 3 for now
    # reindexing will overwrite and fix
    _save(_WS_FEAT_COLL, verts)
    _save(_WS_FEAT_EDGE_COLL, edges)
def __init__(self, edge_path: str, val_path: str, power: float = 0.75):
    """
    Initializes an Edges object for use in a Dataset.

    Args:
        edge_path: Path to csv of edges, where each row is a product pair and its weight
        val_path: Path to csv of the validation set
        power: Negative sampling parameter; suggested 0.75
    """
    self.power = power
    self.negative_idx = 0
    self.n_unique_tokens = 0

    self.edges = pd.read_csv(edge_path)
    self.n_edges = len(self.edges)
    logger.info('Edges loaded (length = {:,})'.format(self.n_edges))

    self.val = pd.read_csv(val_path)
    logger.info('Validation set loaded: {}'.format(self.val.shape))

    self.product_set = self.get_product_set()
    self.word2id, self.id2word = self.get_mapping_dicts()
    self.get_product_id_func = np.vectorize(self.get_product_id)
    self.n_unique_tokens = len(self.word2id)
    logger.info('No. of unique tokens: {}'.format(self.n_unique_tokens))

    save_model(self.word2id, '{}/word2id_edge'.format(MODEL_PATH))
    save_model(self.id2word, '{}/id2word_edge'.format(MODEL_PATH))
    logger.info('Word2Id and Id2Word created and saved')

    # Convert product ID strings to integers
    self.edges = self.prep_edges()
    logger.info('Edges prepared')

    # Prepare negative sampling table
    self.word_freq = self.get_word_freq(self.edges[:, :2])
    self.neg_table = self.get_negative_sample_table(self.power)
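get_negative_sample_table is defined elsewhere; below is a minimal sketch of the usual word2vec-style table it likely resembles, assuming word_freq maps token id to count (the helper name and table size are arbitrary assumptions):

import numpy as np

def build_negative_sample_table(word_freq: dict, power: float = 0.75, table_size: int = 10_000_000) -> np.ndarray:
    """Build a table where each token id appears proportionally to freq ** power (sketch)."""
    ids = np.array(list(word_freq.keys()))
    freqs = np.array(list(word_freq.values()), dtype=np.float64)
    probs = freqs ** power
    probs /= probs.sum()
    counts = np.round(probs * table_size).astype(int)
    # Drawing a uniform random index from this table samples negatives
    # from the smoothed (power = 0.75) unigram distribution.
    return np.repeat(ids, counts)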
def train_embeddings(sequences, workers, dimension=128, window=5, min_count=1, negative=5, epochs=3, seed=42):
    # Logging specific to gensim training
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Initialize model
    model = Word2Vec(sequences, workers=workers, size=dimension, window=window,
                     min_count=min_count, negative=negative, seed=seed)
    logger.info('Model initialized')

    # Train model (No need to retrain model as initialization includes training)
    # model.train(sequences, total_examples=len(sequences), epochs=epochs)
    # logger.info('Model trained!')

    return model
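A hypothetical usage sketch (the product IDs and output path below are made up): gensim expects an iterable of token lists, and the `size` keyword above implies gensim < 4.0, where the parameter was later renamed `vector_size`:

walks = [['B001', 'B017', 'B042'], ['B017', 'B003', 'B099']]  # e.g. token lists from random walks
model = train_embeddings(walks, workers=4, dimension=128)
model.wv.save_word2vec_format('product_embeddings.txt')  # persist the learned vectors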
def _generate_GO_links(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        # no features; already logged in _generate_features
        return

    f_to_go = {}
    for f in d['features']:
        # this works for Genome-8.2 to 10.0 in production
        if _ONTOLOGY_TERMS in f and _ONTOLOGY_GO_KEY in f[_ONTOLOGY_TERMS]:
            f_to_go[f['id']] = f[_ONTOLOGY_TERMS][_ONTOLOGY_GO_KEY].keys()

    terms_set = {i for items in f_to_go.values() for i in items}  # flatten
    query_time = _now_epoch_ms()
    # might want to do this in smaller batches if memory pressure is an issue
    resolved_terms = _resolve_GO_terms(terms_set, query_time)

    edges = []
    for f in f_to_go:
        for g in f_to_go[f]:
            if g not in resolved_terms:
                logger.info(f"Couldn't resolve GO term {g} in Genome {obj_ver_key} feature {f}")
            else:
                featurekey = _clean_key(f'{obj_ver_key}_{f}')
                edges.append({
                    '_key': f'{featurekey}::{resolved_terms[g]}::kbase_RE_indexer',
                    '_from': f'{_WS_FEAT_COLL}/{featurekey}',
                    '_to': f'{_GO_TERM_COLL}/{resolved_terms[g]}',
                    'source': 'kbase_RE_indexer',
                    'expired': _MAX_ADB_INTEGER
                })

    created_time = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_time

    logger.info(f'Writing {len(edges)} feature -> GO edges for genome {obj_ver_key}')
    _save(_WS_FEAT_TO_GO_COLL, edges, on_duplicate='ignore')
def __init__(self, emb_sizes, emb_dim):
    super().__init__()
    self.emb_sizes = emb_sizes
    self.emb_dim = emb_dim

    # Create embedding layers
    self.center_embeddings = nn.ModuleList()
    for k, v in self.emb_sizes.items():
        self.center_embeddings.append(nn.Embedding(v, emb_dim, sparse=True))

    self.context_embeddings = nn.ModuleList()
    for k, v in self.emb_sizes.items():
        self.context_embeddings.append(nn.Embedding(v, emb_dim, sparse=True))

    # Create embedding weighting layer
    self.emb_weights = nn.Embedding(emb_sizes['product'], len(emb_sizes),
                                    sparse=True)  # emb_sizes['product'] is total number of products
    self.emb_weights_softmax = nn.Softmax(dim=1)

    self.init_emb()
    logger.info('Model initialized: {}'.format(self))
def get_raw_data(path_raw_data, group_columns, date_column='WorkingDate', target_column=config.TARGET):
    logger.info(f"Downloading raw data from {path_raw_data}")
    all_columns = group_columns.copy()
    all_columns.append(target_column)
    all_columns.append(date_column)

    raw_data = pd.read_csv(Path(path_raw_data), sep=";")
    raw_data[date_column] = pd.to_datetime(raw_data[date_column])
    raw_data = raw_data[all_columns]

    # Reindex each group onto a daily date range and fill gaps with 0
    filled_raw_data = (raw_data.set_index(date_column)
                       .groupby(group_columns)
                       .apply(lambda d: d.reindex(pd.date_range(min(raw_data[date_column]),
                                                                max(raw_data[date_column]),
                                                                freq='D')))
                       .drop(group_columns, axis=1)
                       .reset_index(group_columns)
                       .fillna(0))
    filled_raw_data = filled_raw_data.reset_index()
    filled_raw_data = filled_raw_data.rename(columns={'index': date_column})
    return filled_raw_data
def create_random_walk_samples(node_dict, transition_dict, samples_per_node=10, sequence_len=10):
    random.seed(42)
    n_nodes = len(node_dict)

    sample_array = np.zeros((n_nodes * samples_per_node, sequence_len), dtype=int)
    logger.info('Sample array shape: {}'.format(sample_array.shape))

    # For each node
    for node_idx in range(n_nodes):
        if node_idx % 100000 == 0:
            logger.info('Getting samples for node: {:,}/{:,}'.format(node_idx, n_nodes))

        # For each sample
        for sample_idx in range(samples_per_node):
            node = node_idx

            # For each event in sequence
            for seq_idx in range(sequence_len):
                sample_array[node_idx * samples_per_node + sample_idx, seq_idx] = node
                node = random.choices(population=transition_dict[node]['product'],
                                      weights=transition_dict[node]['probability'],
                                      k=1)[0]

    return sample_array
def run_importer(obj, ws_info, msg):
    start = time.time()
    type_, _ = obj['info'][2].split('-')  # 2nd var is version
    if type_ in config()['global']['ws_type_blacklist']:
        logger.info(f'Skipped RE import of blacklisted type {type_}')
    else:
        import_object(obj, ws_info)
        logger.info(f"Imported an object into RE in {time.time() - start}s.")
def get_forecast_calendar(path_forecast_calendar, date_column='WorkingDate'):
    logger.info(f"Downloading forecast calendar from {path_forecast_calendar}")
    forecast_calendar = pd.read_excel(Path(path_forecast_calendar))
    forecast_calendar[date_column] = pd.to_datetime(forecast_calendar[date_column])
    return forecast_calendar
def _fetch_global_config(config_url):
    """
    Fetch the index_runner_spec configuration file from a URL to a yaml file.
    """
    logger.info(f'Fetching config from url: {config_url}')
    # Fetch the config directly from config_url
    with urllib.request.urlopen(config_url) as res:  # nosec
        return yaml.safe_load(res.read())
def get_categories(df: pd.DataFrame) -> pd.DataFrame:
    df['category_lvl_1'] = df['categories'].apply(get_category_lvl, args=(0,))
    df['category_lvl_2'] = df['categories'].apply(get_category_lvl, args=(1,))
    df['category_lvl_3'] = df['categories'].apply(get_category_lvl, args=(2,))
    df['category_lvl_4'] = df['categories'].apply(get_category_lvl, args=(3,))
    logger.info('Categories lvl 1 - 4 prepared')
    return df
def run(self):
    try:
        logger.info("start bj gov crawler.")
        self.get_wangqian()
        self.get_history()
        logger.info("end bj gov crawler.")
    except Exception as e:
        logger.error("error: {}".format(e))
def cross_validate_model(training_df: pd.DataFrame, rf: RandomForestRegressor) -> None:
    # cross validation
    scores = cross_val_score(rf,
                             training_df.drop(['revenue'], axis=1),
                             training_df['revenue'],
                             cv=5,
                             scoring='neg_mean_squared_log_error')
    logger.info(scores)
def start_consumers(self):
    logger.info('Starting Consumers...')
    try:
        self._channel.start_consuming()
    except KeyboardInterrupt:
        self._channel.stop_consuming()
        rbmq.conn.close()
def run(self):
    try:
        logger.info("start lianjia crawler")
        self.craw_stat()
        self.craw_open()
        self.crawPriceTrends()
        logger.info("end lianjia crawler")
    except Exception as e:
        logger.error("error: {}".format(e))
def __init__(self, depth_limit: int = 99):
    """Initializes a decision tree with depth limit

    Args:
        depth_limit: Maximum depth to build the tree
    """
    self.root = None
    self.depth_limit = depth_limit
    logger.info('{} initialized with depth limit: {}'.format(self.__class__.__name__, depth_limit))
def load_network(edgelist_path):
    graph = networkx.read_weighted_edgelist(edgelist_path)
    logger.info('No of nodes ({:,}) and edges ({:,})'.format(
        graph.number_of_nodes(), graph.number_of_edges()))

    # Get dictionary mapping of integer to nodes
    node_dict = {i: key for i, key in enumerate(graph.nodes.keys())}

    return graph, node_dict
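create_random_walk_samples above expects transition_dict[node_id] to hold neighbour ids under 'product' and matching probabilities under 'probability'. One plausible way to derive it from the weighted graph and node_dict returned here; the helper below is an assumption, not part of the original code:

import networkx

def build_transition_dict(graph: networkx.Graph, node_dict: dict) -> dict:
    """Turn edge weights into per-node transition probabilities (hypothetical sketch)."""
    node_to_id = {node: idx for idx, node in node_dict.items()}  # invert integer -> node mapping
    transition_dict = {}
    for idx, node in node_dict.items():
        # Every node read from a weighted edge list has at least one neighbour
        neighbours = list(graph[node])
        weights = [graph[node][nbr].get('weight', 1.0) for nbr in neighbours]
        total = sum(weights)
        transition_dict[idx] = {
            'product': [node_to_id[nbr] for nbr in neighbours],
            'probability': [w / total for w in weights],
        }
    return transition_dict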
def wrapper(*args, **kwargs):
    if 'Api-Key' not in request.headers:
        return {'status': 'error', 'msg': 'Api-Key not present on Request Headers'}, http.HTTPStatus.BAD_REQUEST
    if request.headers['Api-Key'] != apikey.API_KEY:
        logger.info('Error: ApiKey not valid')
        return {'status': 'error', 'msg': 'ApiKey not valid'}, http.HTTPStatus.UNAUTHORIZED
    return fn(*args, **kwargs)
def close_consumer(consumer: Consumer) -> None:
    """
    This will close the network connections and sockets. It will also trigger a rebalance
    immediately rather than wait for the group coordinator to discover that the consumer
    stopped sending heartbeats and is likely dead, which will take longer and therefore
    result in a longer period of time in which consumers can't consume messages from a
    subset of the partitions.
    """
    consumer.close()
    logger.info("Closed the Kafka consumer")
def run(self):
    try:
        logger.info("start hangzhou vol crawler.")
        prev_url = self.get_month_vol(datetime.date.today() - datetime.timedelta(days=1), None)
        while prev_url:
            prev_url = self.get_month_vol(None, prev_url)
            time.sleep(6)
        logger.info("end hangzhou vol crawler.")
    except:
        logger.error("error")
def add_account(email, username, hostname):
    config_obj = read_config_file()
    config_obj[hostname] = {
        "hostName": hostname,
        "name": username,
        "email": email
    }
    write_config_to_file(config_obj)
    logger.info("Finished adding account.")
def run(self):
    try:
        logger.info("start draw img.")
        self.draw_vol_1days()
        self.draw_vol_7days()
        self.draw_vol_monthly()
        self.draw_price_trends()
        logger.info("end draw img.")
    except:
        logger.error("error.")
def start_service(wait_for_url, wait_for_name):
    global container_process
    global container_out
    global container_err
    cmd = "docker-compose --no-ansi up"
    logger.info(f'Running command:\n{cmd}')
    container_out = open("container.out", "w")
    container_err = open("container.err", "w")
    container_process = subprocess.Popen(cmd, shell=True, stdout=container_out, stderr=container_err)
    wait_for_service(wait_for_url, wait_for_name)
def collect_samples(item_array, sample_size, n_samples):
    samples = []
    for i in range(0, n_samples):
        if i % 1000000 == 0:
            logger.info('Neg sample: {:,}'.format(i))
        sample = get_sample(item_array, n_iter=i, sample_size=sample_size)
        samples.append(sample)
    return samples
def _pull_docker_image(image):
    """Check if the image exists locally; if not, pull it."""
    li = _DOCKER.images.list()
    pulled = False
    for im in li:
        if image in im.tags:
            # id_ = im.id
            pulled = True
    if not pulled:
        logger.info("Pulling %s" % image)
        _DOCKER.images.pull(image)
def insert(self, data):
    logger.info(f'Insert data on {self.config.CALC_QUEUE}')
    conn = self._set_connection()
    channel = conn.channel()
    channel.queue_declare(queue=self.config.CALC_QUEUE)
    channel.basic_publish(exchange='',
                          routing_key=self.config.CALC_QUEUE,
                          body=json.dumps(data.serialize()))
    conn.close()
    logger.info('Data inserted')
def val_epoch(self, epoch, model, dataloader, optimizer=None, lr_scheduler=None, prefix="val"):
    model.eval()

    lossMeter = LossMeter()
    perfMeter = PerfMeter()

    with torch.no_grad():
        for (imgs, labels) in dataloader:
            if self.cfg.HALF:
                imgs = imgs.half()

            if len(self.device) > 1:
                losses, performances = data_parallel(model, (imgs, labels, prefix),
                                                     device_ids=self.device,
                                                     output_device=self.device[-1])
            else:
                imgs = imgs.cuda()
                labels = [label.cuda() for label in labels] if isinstance(labels, list) else labels.cuda()
                losses, performances = model(imgs, labels, prefix)

            lossMeter.__add__(losses)
            perfMeter.__add__(performances)

            del imgs, labels, losses, performances

    avg_losses = lossMeter.average()
    avg_perf = perfMeter.average()

    template = "[epoch {}] Total {} loss : {:.4f} " "\n" "{}"
    logger.info(
        template.format(
            epoch, prefix, avg_losses["all_loss"],
            "\n".join(["{}: {:.4f}".format(n, l) for n, l in avg_losses.items() if n != "all_loss"]),
        )
    )

    if self.cfg.TENSORBOARD:
        # Logging val Loss
        [self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l, epoch) for n, l in avg_losses.items()]
        # Logging val performances
        [self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v, epoch) for k, v in avg_perf.items()]

    perf_log_str = f"\n------------ Performances ({prefix}) ----------\n"
    for k, v in avg_perf.items():
        perf_log_str += "{:}: {:.4f}\n".format(k, v)
    perf_log_str += "------------------------------------\n"
    logger.info(perf_log_str)

    acc = avg_perf['all_perf']

    del avg_losses, avg_perf

    return acc
def run(self):
    try:
        logger.info("start shanghai vol crawler.")
        html = download(self.url)
        soup = BeautifulSoup(html, "html.parser")
        soup.find("div", {})
        match = re.findall("出售各类商品房<b>(\\d+)</b>套", html)
        if match:
            vol = match[0]
            ds = re.findall("今日楼市（(\\d+)-(\\d+)-(\\d+)）", html)
            date = datetime.now().replace(year=int(ds[0][0]), month=int(ds[0][1]), day=int(ds[0][2]))
            info = {"city": "上海", "district": "sh", "total": vol, "zhuzai": 0, "date": date}
            has = self.dao.has_item("sh", date)
            if not has[0]:
                self.dao.insert_item(info)
        logger.info("end shanghai vol crawler.")
    except Exception as e:
        logger.error(e)
def get_month_vol(self, dt, url):
    if not url:
        url = self.url_pattern % dt.strftime('%Y%m%d')
    else:
        match = re.findall("(\\d{8})", url)
        if not match:
            return None
        dt = datetime.datetime(*time.strptime(match[0], '%Y%m%d')[:6])

    logger.info(url)
    html = download(url, charset="utf-8")
    if not html:
        return None

    new_vols = re.findall("ss1\.push\((\\d+)\);", html)
    old_vols = re.findall("ss2\.push\((\\d+)\);", html)
    days = re.findall("tickss\.push\((\\d+)\);", html)
    if len(new_vols) != len(old_vols) or len(days) != len(new_vols):
        logger.info(new_vols)
        logger.info(old_vols)
        logger.info(days)
        return

    for i in range(0, len(days), 1):
        vol_date = dt.replace(dt.year, dt.month, int(days[i]))
        city = "杭州"
        district = "杭州"
        has = self.dao.has_item(district, vol_date)
        if not has[0] and int(new_vols[i]) > 0 and int(old_vols[i]) > 0:
            self.dao.insert_item({"city": city, "district": district, "total": new_vols[i],
                                  "zhuzai": old_vols[i], "date": vol_date})

    soup = BeautifulSoup(html, "html.parser")
    div_date = soup.find("div", {"class": "date"})
    if div_date:
        if div_date.find("a"):
            path = div_date.find("a")["href"]
            if path:
                return self.host + path
    return None
            old = self.dao.get_item(row["city"], row["district"], row["date"])
            if not old:
                self.dao.insert_item(row)
            else:
                self.dao.update_item(city, "月趋势", 0, price, last)
            month -= 1
            if month == 0:
                year -= 1
                month = 12
            last = datetime(year, month, 1)

    def run(self):
        try:
            logger.info("start lianjia crawler")
            self.craw_stat()
            self.craw_open()
            self.crawPriceTrends()
            logger.info("end lianjia crawler")
        except Exception as e:
            logger.error("error: {}".format(e))


if __name__ == "__main__":
    file_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = ''.join([file_dir, "/../../db/house.db"])
    logger.info("start.")
    dao = LianjiaDAO(datetime.today(), db_path)
    tool = Lianjia(dao)
    tool.crawPriceTrends()
    logger.info("end.")