def _export_datasets(dataset, features, classes, origin, sufix):
    from itertools import combinations
    from tasks.linker import params
    from multiprocessing.pool import Pool

    nfolds = 3
    folds = [i for i in range(nfolds)]
    partitions = [list(c) + list(set(folds) - set(c))
                  for c in combinations(folds, 2)]
    datasets = _fold(dataset, nfolds)

    for pt in partitions:
        training = []
        for i in pt[:-1]:
            training.extend(datasets[i])
        test = datasets[pt[-1]]

        name_ = 'all{}{}{}'.format(origin, sufix + '_tr', pt[-1])
        filename = get_path('datasets', '{}.arff'.format(name_))
        classes_ = [next((v['short_name'] for k, v in params.items()
                          if v['metadata_uri'] == c), None) for c in classes]
        dataset_ = ([d, classes_] for d in _chunks(training, os.cpu_count()))
        with Pool(os.cpu_count()) as p:
            sets_ = p.starmap(_expand, dataset_)
        dataset_ = []
        for s in sets_:
            dataset_.extend(s)
        with Pool(os.cpu_count()) as p:
            dataset_ = p.map(_flatten, dataset_)
        _save(dataset_, features, 'class', name_, filename)

        name_ = 'all{}{}{}'.format(origin, sufix + '_tt', pt[-1])
        filename = get_path('datasets', '{}.arff'.format(name_))
        dataset_ = ([l, classes_] for l in test)
        with Pool(os.cpu_count()) as p:
            dataset_ = p.starmap(_concat, dataset_)
        with Pool(os.cpu_count()) as p:
            dataset_ = p.map(_flatten, dataset_)
        _save_test(dataset_, features, 'class', name_, filename)
def predict_target_with_query(sparql, query, source, timeout=TIMEOUT, limit=LIMIT):
    """Predicts target with given query.

    For example for pagerank_bidi:
    SELECT distinct(?target) ?score {
     { dbr:Circle ?p ?target . } UNION { ?target ?q dbr:Circle . }
     ?target dbo:wikiPageRank ?score
    } ORDER BY DESC(?score) LIMIT 100
    """
    q = query % {'source': source.n3()}
    q += '\nLIMIT %d' % limit
    t, q_res = gp_query._query(sparql, timeout, q)
    res_rows_path = ['results', 'bindings']
    bindings = sparql_json_result_bindings_to_rdflib(
        get_path(q_res, res_rows_path, default=[]))
    target_scores = [(get_path(row, [TARGET_VAR]),
                      get_path(row, [Variable('score')]))
                     for row in bindings]
    # print(target_scores)
    return target_scores
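# Hypothetical usage sketch for predict_target_with_query. The endpoint URL,
# query template and source URI below are illustrative assumptions, not taken
# from the source; the template must carry a %(source)s placeholder, which is
# filled with source.n3() by the function above:
#
#     from rdflib import URIRef
#     from SPARQLWrapper import SPARQLWrapper
#
#     sparql = SPARQLWrapper('https://dbpedia.org/sparql')
#     query = (
#         'SELECT distinct(?target) ?score {\n'
#         '  { %(source)s ?p ?target . } UNION { ?target ?q %(source)s . }\n'
#         '  ?target dbo:wikiPageRank ?score\n'
#         '} ORDER BY DESC(?score)'
#     )
#     target_scores = predict_target_with_query(
#         sparql, query, URIRef('http://dbpedia.org/resource/Circle'))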
def _find_conflicting_path(self, conflict):
    """Return the shortest path common to two patches."""
    p1p = get_path(conflict.first_patch)
    p2p = get_path(conflict.second_patch)
    # This returns the shortest path
    return p1p if len(p1p) <= len(p2p) else p2p
def main():
    args = PARSER.parse_args()
    data_path = get_path(args, "record")
    model_save_path = get_path(args, "tf_vae", create=True)

    ensure_validation_split(data_path)
    _n_train, _avg_frames, mean, var = analyse_dataset(data_path)
    if args.normalize_images:
        train_data, val_data = create_tf_dataset(data_path, args.z_size, True, mean, var)
    else:
        train_data, val_data = create_tf_dataset(data_path, args.z_size)

    # Roughly 20 full episodes per shuffle window, more increases RAM usage
    shuffle_size = 5 * 1000
    train_data = (train_data.shuffle(shuffle_size, reshuffle_each_iteration=True)
                  .batch(args.vae_batch_size).prefetch(2))
    val_data = val_data.batch(args.vae_batch_size).prefetch(2)

    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_dir = model_save_path / "tensorboard" / current_time

    vae = CVAE(args=args)
    vae.compile(optimizer=vae.optimizer, loss=vae.get_loss())
    vae.fit(train_data, validation_data=val_data, epochs=args.vae_num_epoch,
            callbacks=[
                tf.keras.callbacks.TensorBoard(log_dir=str(tensorboard_dir),
                                               update_freq=50, histogram_freq=1),
                LogImage(str(tensorboard_dir), val_data),
                tf.keras.callbacks.ModelCheckpoint(
                    str(model_save_path / "ckpt-e{epoch:02d}"), verbose=1),
            ])
    vae.save(str(model_save_path))
def fetch_lightsaber_images(hilt, blade, button, pommel):
    hilt = get_path(hilt, HILT_PATH)
    blade = get_path(blade, BLADE_PATH)
    button = get_path(button, BUTTON_PATH)
    pommel = get_path(pommel, POMMEL_PATH)
    return (blade, hilt, button, pommel)
def load_board_from_image(file_name):
    """Load an image file and return a 2D list."""
    # File does not exist: report the error and exit
    if not os.path.isfile(utils.get_path() + file_name):
        print('file does not exist')
        sys.exit()
    else:
        dead_cells = GAMECONFIG['dead_cells']
        living_cells = GAMECONFIG['living_cells']
        im = Image.open(utils.get_path() + file_name)
        # Pixels are compared against RGB tuples below, so make sure the
        # image is in RGB mode (an 'L' image yields ints, never tuples)
        if im.mode != 'RGB':
            im = im.convert('RGB')
        pixel_values = im.load()
        width, height = im.size
        make_image.save_width_height(width, height)
        data = [[dead_cells for i in range(width)] for j in range(height)]
        for col in range(0, width):
            for row in range(0, height):
                if pixel_values[col, row] == (0, 0, 0):
                    data[row][col] = living_cells
                    STATS['initial_population'] += 1
        im.close()
        return data, height, width
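# Minimal sketch of the pixel-to-cell convention used above. It assumes
# GAMECONFIG marks dead cells as 0 and living cells as 1 (an assumption, not
# confirmed by the source); only a pure black RGB pixel becomes a living cell.
def _demo_pixel_convention():
    from PIL import Image
    im = Image.new('RGB', (2, 1), (255, 255, 255))
    im.putpixel((0, 0), (0, 0, 0))  # left pixel black -> living cell
    pixels = im.load()
    board = [[1 if pixels[col, 0] == (0, 0, 0) else 0 for col in range(2)]]
    assert board == [[1, 0]]
    return board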
def main(pickled_evaluation=False, combo_f_name=None, output_patches=True,
         detector_params=None, original_dataset=True, save_imgs=True,
         data_fold=utils.VALIDATION):
    combo_f_name = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:")
    except getopt.GetoptError:
        print 'Command line failed'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-f':
            combo_f_name = arg
    assert combo_f_name is not None
    detector = viola_detector_helpers.get_detectors(combo_f_name)

    viola = False if data_fold == utils.TRAINING else True
    in_path = utils.get_path(viola=viola, in_or_out=utils.IN, data_fold=data_fold)

    # name the output folder
    folder_name = ['combo' + combo_f_name]
    for k, v in detector_params.iteritems():
        folder_name.append('{0}{1}'.format(k, v))
    folder_name = '_'.join(folder_name)

    out_path = utils.get_path(out_folder_name=folder_name, viola=True,
                              in_or_out=utils.OUT, data_fold=data_fold)
    out_path = 'output_viola_uninhabited/'  # NOTE: overrides the path computed above

    viola = ViolaDetector(pickled_evaluation=pickled_evaluation,
                          output_patches=output_patches,
                          out_path=out_path, in_path=in_path,
                          folder_name=folder_name, save_imgs=save_imgs,
                          detector_names=detector, **detector_params)
    return viola
def proxy(path):
    # ensure authorization header is present
    api_key = authorization_header_exists(request.headers)
    # retrieve usage plan from api key
    usage_plan = get_usage_plan(REDIS_CLIENT, USAGE_PLANS.keys(), api_key)
    # apply usage plan quota and throttling limits
    max_calls, period_in_seconds, throttling_rate = get_usage_plan_info(
        USAGE_PLANS, usage_plan)
    quota_per_seconds(REDIS_CLIENT, api_key, max_calls, period_in_seconds)
    rate_per_second(REDIS_CLIENT, api_key, throttling_rate)

    redirect_url = get_path(ALB, path)
    if request.method == "GET":
        resp = requests.get(redirect_url)
        return get_response(resp)
    elif request.method == "POST":
        resp = requests.post(redirect_url, json=request.get_json())
        return get_response(resp)
    elif request.method == "DELETE":
        # pass the Response object on, as in the other branches
        resp = requests.delete(redirect_url)
        return get_response(resp)
def main(config_file='config/bert_config.json'):
    """Main method for training.

    Args:
        config_file: in config dir
    """
    # 0. Load config and mkdir
    with open(config_file) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    get_path(os.path.join(config.model_path, config.experiment_name))
    get_path(config.log_path)
    # if config.model_type in ['rnn', 'lr', 'cnn']:  # build vocab for rnn
    #     build_vocab(file_in=config.all_train_file_path,
    #                 file_out=os.path.join(config.model_path, 'vocab.txt'))

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type, config=config)
    datasets = data.load_train_and_valid_files(
        train_file=config.train_file_path,
        valid_file=config.valid_file_path)
    train_set, valid_set_train, valid_set_valid, train_labels, valid_labels = datasets
    if torch.cuda.is_available():
        device = torch.device('cuda')
        # torch.distributed.init_process_group(backend="nccl")
        # sampler_train = DistributedSampler(train_set)
        sampler_train = RandomSampler(train_set)
    else:
        device = torch.device('cpu')
        sampler_train = RandomSampler(train_set)
    data_loader = {
        'train': DataLoader(
            train_set, sampler=sampler_train, batch_size=config.batch_size),
        'valid_train': DataLoader(
            valid_set_train, batch_size=config.batch_size, shuffle=False),
        'valid_valid': DataLoader(
            valid_set_valid, batch_size=config.batch_size, shuffle=False),
        'train_label': train_labels,
        'valid_label': valid_labels
    }

    # 2. Build model
    model = MODEL_MAP[config.model_type](config)
    # load model states
    if config.trained_weight:
        model.load_state_dict(torch.load(config.trained_weight))
    model.to(device)
    # model = torch.nn.parallel.DistributedDataParallel(
    #     model, find_unused_parameters=True)

    # 3. Train
    trainer = Trainer(model=model, data_loader=data_loader,
                      device=device, config=config)
    best_model_state_dict = trainer.train()

    # 4. Save model
    torch.save(best_model_state_dict,
               os.path.join(config.model_path, 'model.bin'))
def make_mini(from_fname: str = 'arxiv_data', name: str = 'mini',
              size: int = 100000, data_dir: str = '.data',
              batch_size: int = 10000):
    ''' Make a smaller version of a given dataset, without needing to load
    the larger dataset into memory.

    INPUT
        from_fname: str = 'arxiv_data'
            The large dataset
        name: str = 'mini'
            The name of the smaller dataset, which will be appended to the
            file name of the larger one
        size: int = 100000
            The number of rows in the small dataset
        data_dir: str = '.data'
            The name of the data directory
        batch_size: int = 10000
            How many rows of the large dataset we are processing at a time
    '''
    import pandas as pd
    import numpy as np
    from tqdm.auto import tqdm

    from_path = get_path(data_dir) / f'{from_fname}_pp.tsv'
    to_path = get_path(data_dir) / f'{from_fname}_{name}_pp.tsv'

    df = pd.read_csv(from_path, sep='\t', chunksize=batch_size)
    cats = get_cats(data_dir=data_dir)['id']
    nrows = get_nrows(f'{from_fname}_pp.tsv', data_dir=data_dir)

    # Disk-backed buffers, so the full dataset never has to fit in memory
    text_path = get_path(data_dir) / 'text.tmp'
    labels_path = get_path(data_dir) / 'labels.tmp'
    text = np.memmap(text_path, dtype=object, mode='w+', shape=(nrows, 1))
    labels = np.memmap(labels_path, dtype=int, mode='w+',
                       shape=(nrows, len(cats)))

    with tqdm(total=nrows, desc=f'Loading {from_fname}_pp.tsv') as pbar:
        for idx, chunk in enumerate(df):
            text[idx * batch_size:(idx + 1) * batch_size, 0] = chunk['text']
            labels[idx * batch_size:(idx + 1) * batch_size, :] = chunk[cats]
            pbar.update(len(chunk))

    # Sample `size` random rows and assemble the small dataframe
    rnd_idxs = np.random.choice(nrows, size=size, replace=False)
    text = text[rnd_idxs, 0]
    labels = labels[rnd_idxs, :]
    mini_df = pd.DataFrame(columns=['text'] + cats)
    mini_df['text'] = text
    mini_df[cats] = labels
    mini_df.to_csv(to_path, sep='\t', index=False)

    # Clean up the temporary buffers
    text_path.unlink()
    labels_path.unlink()
def get_credentials():
    store = file.Storage(get_path('docs_token'))
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(get_path('docs_credential'), SCOPES)
        credentials = tools.run_flow(flow, store)
    return credentials
def preprocess_data(tsv_fname: str = 'arxiv_data',
                    txt_fname: str = 'preprocessed_docs.txt',
                    data_dir: str = '.data', batch_size: int = 1000):
    ''' Preprocess text data.

    This merges titles and abstracts and separates tokens by spaces. It
    saves this into a text file and also saves a dataframe with all the
    categories. Note that this function uses a constant amount of memory,
    which is achieved by working in batches and writing directly to the
    disk.

    INPUT
        tsv_fname: str
            The name of the tsv file containing all the categories,
            without file extension
        txt_fname: str
            The name of the txt file containing the preprocessed texts
        data_dir: str = '.data'
            The data directory
        batch_size: int = 1000
            The amount of rows being preprocessed at a time
    '''
    import spacy
    import pandas as pd
    from tqdm.auto import tqdm

    # Specify the input- and output paths
    cats_in = get_path(data_dir) / (tsv_fname + '.tsv')
    cats_out = get_path(data_dir) / (tsv_fname + '_pp.tsv')
    txt_path = get_path(data_dir) / txt_fname

    # Load the English spaCy model used for tokenisation
    nlp = spacy.load('en')
    tokenizer = nlp.Defaults.create_tokenizer(nlp)

    # Load in the dataframe, merge titles and abstracts and batch them
    df = pd.read_csv(cats_in, sep='\t', usecols=['title', 'abstract'])
    df.dropna(inplace=True)
    docs = '-TITLE_START- ' + df['title'] + ' -TITLE_END- '\
           '-ABSTRACT_START- ' + df['abstract'] + ' -ABSTRACT_END-'
    del df

    # Tokenisation loop
    with tqdm(desc='Preprocessing texts', total=len(docs)) as pbar:
        with open(txt_path, 'w') as f:
            for doc in tokenizer.pipe(docs, batch_size=batch_size):
                f.write(' '.join(tok.text for tok in doc) + '\n')
                pbar.update()

    # Add the preprocessed texts to the dataframe as the first column
    # and save to disk
    df = pd.read_csv(cats_in, sep='\t').dropna()
    df.drop(columns=['title', 'abstract'], inplace=True)
    cats = df.columns.tolist()
    with open(txt_path, 'r') as f:
        df['text'] = f.readlines()
    df = df[['text'] + cats]
    df.to_csv(cats_out, sep='\t', index=False)
def n(neighbours=False):
    'shows your neighbours'
    conf = utils.load_conf(CJDROUTE_CONF)
    c = cjdns.connect(password=conf['admin']['password'])

    STAT_FORMAT = '%s %19s v%-2d %9d %9d %12s %d/%d/%d '
    nodestore = list(c.dumpTable())

    connections = {}
    try:
        for peer in os.listdir(YRD_PEERS):
            with open(os.path.join(YRD_PEERS, peer)) as f:
                info = json.load(f)
                try:
                    connections[info['pk']] = str(info['name'])
                except KeyError:
                    pass
    except OSError:
        pass

    for peer in c.peerStats():
        result = c.nodeForAddr(peer.ip)['result']

        route = utils.grep_ns(nodestore, peer.ip)
        path = utils.get_path(route)
        setattr(peer, 'path', path)

        line = STAT_FORMAT % (peer.ip, peer.path, peer.version,
                              peer.bytesIn, peer.bytesOut, peer.state,
                              peer.duplicates, peer.lostPackets,
                              peer.receivedOutOfRange)
        if hasattr(peer, 'user'):
            line += repr(peer.user)
        elif peer.publicKey in connections:
            line += repr(connections[peer.publicKey])
        yield line

        if neighbours:
            for i in range(result['linkCount']):
                link = c.getLink(peer.ip, i)
                if link and 'child' in link['result']:
                    child = link['result']['child']
                    route = utils.grep_ns(nodestore, child)
                    version = utils.get_version(route)
                    path = utils.get_path(route)
                    yield '  %s %s v%s' % (child, path, version)
                else:
                    yield '  -'

    c.disconnect()
def main():
    server_address = ('127.0.0.1', 4443)
    server = BaseHTTPServer.HTTPServer(server_address, HTTPHandler)
    server.socket = ssl.wrap_socket(server.socket,
                                    keyfile=utils.get_path('cert', 'key.pem'),
                                    certfile=utils.get_path('cert', 'cert.pem'),
                                    server_side=True)
    server.serve_forever()
def main(config_file='config/bert_config.json'):
    """Main method for training.

    Args:
        config_file: in config dir
    """
    # 0. Load config and mkdir
    with open(config_file) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    get_path(os.path.join(config.model_path, config.experiment_name))
    get_path(config.log_path)

    # 1. Load data
    data = Data()
    datasets = data.load_train_and_valid_files(
        train_file=config.train_file_path,
        valid_file=config.valid_file_path)
    train_set, valid_set_train = datasets
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    sampler_train = RandomSampler(train_set)
    data_loader = {
        'train': DataLoader(train_set, sampler=sampler_train,
                            batch_size=config.batch_size),
        'valid_train': DataLoader(valid_set_train,
                                  batch_size=config.batch_size, shuffle=False),
        'valid_valid': DataLoader(valid_set_train,
                                  batch_size=config.batch_size, shuffle=False)
    }

    # 2. Build model
    model = MODEL_MAP[config.model_type](config)
    # load model states
    if config.trained_weight:
        model.load_state_dict(torch.load(config.trained_weight))
    model.to(device)
    # model = torch.nn.parallel.DistributedDataParallel(
    #     model, find_unused_parameters=True)

    # 3. Train
    trainer = Trainer(model=model, data_loader=data_loader,
                      device=device, config=config)
    best_model_state_dict = trainer.train()

    # 4. Save model
    torch.save(best_model_state_dict,
               os.path.join(config.model_path, 'model.bin'))
def main():
    script_dirname = os.path.abspath(os.path.dirname(__file__))
    output_patches = False
    fold = utils.TRAINING if output_patches else utils.VALIDATION

    # only use this path to get the names of the files you want to use
    in_path = utils.get_path(in_or_out=utils.IN, data_fold=fold)
    in_path_selective = script_dirname + '/'  # this is where the files actually live
    img_names = [img for img in os.listdir(in_path) if img.endswith('jpg')]
    image_filenames = [in_path_selective + img for img in os.listdir(in_path)
                       if img.endswith('jpg')]

    # get the proposals
    k, scale = get_parameters()
    sim = 'all'
    color = 'hsv'
    cmd = 'selective_search'
    if cmd == 'selective_search':
        folder_name = 'k{}_scale{}_sim{}_color{}_FIXING/'.format(k, scale, sim, color)
    else:
        folder_name = 'selectiveRCNN/'
    print 'Folder name is: {}'.format(folder_name)

    with Timer() as t:
        boxes = get_windows(image_filenames, script_dirname, cmd=cmd, k=k, scale=scale)
    print 'Time to process {}'.format(t.secs)

    detections = Detections()
    detections.total_time = t.secs
    out_path = utils.get_path(selective=True, in_or_out=utils.OUT,
                              data_fold=fold, out_folder_name=folder_name)
    evaluation = Evaluation(  # use_corrected_roofs=True,
                            report_name='report.txt',
                            method='windows',
                            folder_name=folder_name,
                            out_path=out_path,
                            detections=detections,
                            in_path=in_path)

    # score the proposals
    for img, proposals in zip(img_names, boxes):
        print 'Evaluating {}'.format(img)
        print("Found {} windows".format(len(proposals)))
        proposals = selectionboxes2polygons(proposals)
        detections.set_detections(detection_list=proposals, roof_type='metal', img_name=img)
        detections.set_detections(detection_list=proposals, roof_type='thatch', img_name=img)
        print 'Evaluating...'
        evaluation.score_img(img, (1200, 2000))
        evaluation.save_images(img)

    save_training_TP_FP_using_voc(evaluation, img_names, in_path_selective,
                                  out_folder_name=folder_name, neg_thresh=0.3)
    evaluation.print_report()
    with open(out_path + 'evaluation.pickle', 'wb') as f:
        pickle.dump(evaluation, f)
def _ask_chunk_result_extractor(q_res, _vars, _ret_val_mapping):
    chunk_res = {}
    res_rows_path = ['results', 'bindings']
    bindings = sparql_json_result_bindings_to_rdflib(
        get_path(q_res, res_rows_path, default=[])
    )
    for row in bindings:
        row_res = tuple([get_path(row, [v]) for v in _vars])
        stps = _ret_val_mapping[row_res]
        chunk_res.update({stp: True for stp in stps})
    return chunk_res
async def after_ready(self):
    with open(get_path('reactions'), 'r', encoding='utf-8') as f:
        self.reactions = f.read().split('\n')
    with open(get_path('greetings'), 'r', encoding='utf-8') as f:
        self.greetings = f.read().split('\n')
    self.protocol_cog = self.client.get_cog(get_cog('ProtocolCog')['name'])
    for initial in ('ㅁ', 'ㅇ', 'ㄹ', 'ㄴ'):
        for final in ('ㅁ', 'ㅇ', 'ㄴ', None):
            for medial in map(str, CHAR_MEDIALS):
                self.characters.append(
                    join_jamos_char(initial, medial, final))
def end2end(mcat_ratio: float, epochs: int, dim: int, nlayers: int,
            fname: str, gpu: bool, name: str, lr: float, batch_size: int,
            split_ratio: float, vectors: str, data_dir: str,
            pbar_width: int, wandb: bool, boom_dim: int, dropout: float,
            ema: float, overwrite_model: bool) -> str:
    ''' Loads the data, preprocesses it if needed, builds the SHARNN model,
    trains it and evaluates it. '''
    from data import load_data
    from modules import SHARNN

    pp_path = get_path(data_dir) / f'{fname}_pp.tsv'
    if not pp_path.is_file():
        from data import preprocess_data
        raw_path = get_path(data_dir) / f'{fname}.tsv'
        cats_path = get_path(data_dir) / 'cats.json'
        mcat_dict_path = get_path(data_dir) / 'mcat_dict.json'
        if not (raw_path.is_file() and cats_path.is_file() and
                mcat_dict_path.is_file()):
            from db import ArXivDatabase
            db = ArXivDatabase(data_dir=data_dir)
            db.get_mcat_dict()
            db.get_cats()
            if not raw_path.is_file():
                db.get_training_df()
        preprocess_data(data_dir=data_dir)

    train_dl, val_dl, vocab = load_data(tsv_fname=f'{fname}_pp',
                                        batch_size=batch_size,
                                        split_ratio=split_ratio,
                                        vectors=vectors,
                                        data_dir=data_dir)
    model = SHARNN(dim=dim, nlayers=nlayers, data_dir=data_dir,
                   pbar_width=pbar_width, vocab=vocab,
                   boom_dim=boom_dim, dropout=dropout)
    if gpu:
        model.cuda()
    model = model.fit(train_dl, val_dl, epochs=epochs, lr=lr,
                      mcat_ratio=mcat_ratio, name=name, use_wandb=wandb,
                      ema=ema, overwrite_model=overwrite_model)
    return model.evaluate(val_dl)
def gen_stats_vector_from_cat_vector(stats_name, size, kinds):
    """Build statistics vectors for train_test_id.

    :param stats_name: str, name of the statistic
    :param size: str, time granularity
    :param kinds: str, kind of categorical variable
    :return:
    """
    # 0. Load the categorical matrix of train_test_data
    print('gen_stats_vector_from_cat_vector("{}", "{}", "{}")'.format(
        stats_name, size, kinds))
    input_matrix_name = '{}_vector_from_ftr51'.format(kinds)
    input_sparse_matrix = sparse.load_npz(
        get_path() + 'Data/Feature/{}.npz'.format(input_matrix_name)).toarray()
    print('The shape of the matrix is ({}, {})'.format(
        input_sparse_matrix.shape[0], input_sparse_matrix.shape[1]))

    # 1. Load the base data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_test_data = pd.concat([train_data, test_data], axis=0,
                                ignore_index=True)

    # 2. Build a pd.DataFrame so we can compute grouped statistics
    input_sparse_df = pd.DataFrame(data=input_sparse_matrix)
    del input_sparse_matrix
    gc.collect()
    input_sparse_df['PERSONID'] = train_test_data['PERSONID']
    input_sparse_df['CREATETIME'] = train_test_data['CREATETIME']

    # 3. Compute the statistics per person
    output_stats_df = input_sparse_df.groupby('PERSONID').apply(
        lambda df_person: compute_stats_dict_from_cat_matrix(
            df_person, stats_name, size)).to_frame('stats_dict').reset_index()
    train_test_id = train_test_id.merge(output_stats_df, on=['PERSONID'],
                                        how='left')

    # 4. Convert to a sparse matrix and save it
    v = DictVectorizer()  # vectorise the statistics dicts
    stats_sparse_matrix = v.fit_transform(train_test_id['stats_dict'].values)
    stats_matrix_name = '{}_{}_vector_by_{}'.format(stats_name, kinds, size)
    sparse.save_npz(
        get_path() + 'Data/Feature/{}.npz'.format(stats_matrix_name),
        stats_sparse_matrix)
    return stats_matrix_name, 'gen_stats_vector_from_cat_vector("{}", "{}", "{}")'.format(
        stats_name, size, kinds)
def main():
    args = PARSER.parse_args()
    data_path = get_path(args, "record")
    model_save_path = get_path(args, "tf_gqn", create=True)

    ensure_validation_split(data_path)
    train_data = load_from_tfrecord(data_path, args.gqn_context_size,
                                    args.gqn_batch_size, mode='train')
    test_data = load_from_tfrecord(data_path, args.gqn_context_size,
                                   args.gqn_batch_size, mode='test')

    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_dir = model_save_path / "tensorboard" / current_time

    # lr = tf.optimizers.schedules.ExponentialDecay(mu_i, mu_n, mu_f / mu_i, name="lr_schedule")
    lr = tf.optimizers.schedules.PolynomialDecay(mu_i, mu_n, mu_f, name="lr_schedule")
    sigma = tf.optimizers.schedules.PolynomialDecay(sigma_i, sigma_n, sigma_f, name="sigma_schedule")
    optimizer = tf.optimizers.Adam(learning_rate=lr)

    model = GenerativeQueryNetwork(args.gqn_x_dim, args.gqn_r_dim,
                                   args.gqn_h_dim, args.gqn_z_dim,
                                   args.gqn_l, name="gqn")
    model.compile(optimizer, sigma, const_sigma=sigma_f)
    model.fit(train_data, validation_data=test_data, validation_steps=5,
              steps_per_epoch=S_epoch, epochs=num_epochs, callbacks=[
                  tf.keras.callbacks.TensorBoard(log_dir=str(tensorboard_dir),
                                                 update_freq=20, histogram_freq=1),
                  tf.keras.callbacks.ModelCheckpoint(
                      str(model_save_path / "ckpt-e{epoch:02d}"),
                      save_freq=checkpoint_every, verbose=1),
                  LogImages(tensorboard_dir, test_data),
              ])
def _launch_simulation_if_needed(self):
    """If the simulation is not already running, run it with the local
    godot executable.
    """
    if not self.is_godot_launched:
        self.godot_path_str = get_path(self.godot_path_str, add_absolute=True)
        self.env_path_str = get_path(self.env_path_str)
        print(f"environment path: {self.env_path_str}")
        print(f"godot path: {self.godot_path_str}")
        command = "{} --main-pack {}".format(self.godot_path_str, self.env_path_str)
        if not self.is_rendering:
            command = command + " --disable-render-loop --no-window"
        self.godot_process = subprocess.Popen(command, shell=True)
        self.is_godot_launched = True
def _var_subst_chunk_result_ext(q_res, _sel_var_and_vars, _, **kwds):
    var, _vars = _sel_var_and_vars
    chunk_res = Counter()
    res_rows_path = ['results', 'bindings']
    bindings = sparql_json_result_bindings_to_rdflib(
        get_path(q_res, res_rows_path, default=[])
    )
    for row in bindings:
        row_res = get_path(row, [var])
        count_res = int(get_path(row, [COUNT_VAR], '0'))
        chunk_res[row_res] += count_res
    return chunk_res
def _combined_chunk_res(q_res, _vars, _ret_val_mapping):
    chunk_res = {}
    res_rows_path = ['results', 'bindings']
    bindings = sparql_json_result_bindings_to_rdflib(
        get_path(q_res, res_rows_path, default=[])
    )
    for row in bindings:
        row_res = tuple([get_path(row, [v]) for v in _vars])
        stps = _ret_val_mapping[row_res]
        ask_res = int(get_path(row, [ASK_VAR], '0'))
        count_res = int(get_path(row, [COUNT_VAR], '0'))
        chunk_res.update({stp: (ask_res, count_res) for stp in stps})
    return chunk_res
def __init__(self, data_fold=None, full_dataset=False, out_path=None,
             in_path=None, output_patches=False, scale=1.5,
             minSize=(200, 200), windowSize=(40, 40), stepSize=15):
    self.scale = scale
    self.minSize = minSize
    self.windowSize = windowSize
    self.output_patches = output_patches
    self.stepSize = stepSize if not self.output_patches else 30
    self.total_window_num = 0

    if data_fold is None:
        self.data_fold = (utils.TRAINING
                          if self.output_patches or full_dataset
                          else utils.VALIDATION)
    else:
        self.data_fold = data_fold

    self.in_path = in_path if in_path is not None else utils.get_path(
        in_or_out=utils.IN, data_fold=self.data_fold,
        full_dataset=full_dataset)
    self.img_names = [img_name for img_name in os.listdir(self.in_path)
                      if img_name.endswith('.jpg')]
    self.img_names = self.img_names[:20] if DEBUG else self.img_names

    self.detections = Detections()
    folder_name = 'scale{}_minSize{}-{}_windowSize{}-{}_stepSize{}_dividedSizes/'.format(
        self.scale, self.minSize[0], self.minSize[1],
        self.windowSize[0], self.windowSize[1], self.stepSize)
    self.out_path = out_path if out_path is not None else '{}'.format(
        utils.get_path(full_dataset=True, in_or_out=utils.OUT, slide=True,
                       data_fold=self.data_fold, out_folder_name=folder_name))
    self.evaluation = Evaluation(full_dataset=full_dataset, method='slide',
                                 folder_name=folder_name, save_imgs=False,
                                 out_path=self.out_path,
                                 detections=self.detections,
                                 in_path=self.in_path)
def __init__(self, alpha, gamma):
    self.alpha = alpha
    self.gamma = gamma
    self.dow = -1
    self.cur_discrete_time = 0
    self.grid_values = collections.defaultdict(float)
    # tile coding
    self.layer_values = collections.defaultdict(float)
    if sys.platform != 'darwin':
        self.hung = ctypes.cdll.LoadLibrary(get_path(__file__, "hungnp.so"))
    else:
        self.hung = ctypes.cdll.LoadLibrary(get_path(__file__, "hungnpmc.so"))
    self.hung.MaxProfMatching.restype = ctypes.c_double
async def fetch_all():
    """Fetch pdf files."""
    with get_path("answers").open() as answers_file:
        nsolved = sum(1 for line in answers_file)
    cookies = {
        'DYNSRV': 'lin-10-170-0-31',
        'PHPSESSID': 'a4fb01c0de27e200683b4d556461b5aa',
        'keep_alive': '1119831347%23333574%233PpV0T6RtnqnCB6GNF4PvEH1TiEX1nlc',
    }
    headers = {
        'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) '
                       'Gecko/20100101 Firefox/58.0'),
    }
    async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
        coros = [fetch_one(session, num) for num in range(1, nsolved + 1)]
        results = await asyncio.gather(*coros)
    files = 0
    for num in results:
        if num is not None:
            print(f'Saved a file for problem {num}')
            files += 1
    return files
def add(self, cate):
    url = cate['url']
    domain = get_domain(url)
    subdomains = get_subdomains(url)
    paths = get_path(url).split('/')
    query = urlparse.urlparse(url).query

    if domain not in self.root:
        self.root[domain] = {'sub': {}, 'path': {}}
    node = self.root[domain]

    # Descend into subdomain nodes, unless the only subdomain is 'www'
    if len(subdomains) > 1 or (len(subdomains) == 1 and subdomains[0] != 'www'):
        for sub in subdomains:
            if sub not in node['sub']:
                node['sub'][sub] = {'sub': {}, 'path': {}}
            node = node['sub'][sub]

    # Descend into path nodes
    for path in paths:
        if path not in node['path']:
            node['path'][path] = {'path': {}}
        node = node['path'][path]

    # Treat the query string as one extra path segment
    if query:
        node['path']['query___' + query] = {'path': {}}
        node = node['path']['query___' + query]

    node['cate'] = cate
def __init__(self, handle):
    ''' Initialize the toolbars and the work surface '''
    super(BBoardActivity, self).__init__(handle)

    self.datapath = get_path(activity, 'instance')

    self._hw = get_hardware()

    self._playback_buttons = {}
    self._audio_recordings = {}
    self.colors = profile.get_color().to_string().split(',')

    self._setup_toolbars()
    self._setup_canvas()

    self.slides = []
    self._setup_workspace()

    self._buddies = [profile.get_nick_name()]
    self._setup_presence_service()

    self._thumbs = []
    self._thumbnail_mode = False

    self._recording = False
    self._grecord = None
    self._alert = None

    self._dirty = False
def plot_loss():
    path = utils.get_path(neural=True, in_or_out=utils.OUT,
                          data_fold=utils.TRAINING)
    path_slide = path + 'slide/'
    path_viola = path + 'viola/'
    for path in [path_slide, path_viola]:
        for file in os.listdir(path):
            if file.endswith('_history'):
                training_loss = list()
                validation_loss = list()
                with open(path + file, 'rb') as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='\t')
                    for i, row in enumerate(csv_reader):
                        if i == 0:
                            continue
                        training_loss.append(float(row[1]))
                        validation_loss.append(float(row[2]))

                plt.plot(training_loss, linewidth=3, label='train loss')
                plt.plot(validation_loss, linewidth=3, label='valid loss')
                # plt.title('History of {0}'.format(file[:-(len('_history'))]))
                plt.legend(loc='best')
                plt.grid()
                plt.xlabel("epoch")
                plt.ylabel("loss")

                plot_name = path + file + '.jpg'
                plt.savefig(plot_name)
                plt.close()
def __init__(self, config, test=False, target=None):
    self._cwd = os.getcwd()
    self._root = get_path()
    self._config = config
    self._test = test
    self._target = target

    try:
        self._skeleton_path = resource_filename(
            __name__, os.path.join('skeleton', 'default'))
    except NotImplementedError:
        self._skeleton_path = os.path.join(sys.prefix, 'skeleton', 'default')

    try:
        self._assetPath = os.path.join(resource_filename(
            __name__, os.path.join('assets', 'manage.py')))
    except NotImplementedError:
        self._assetPath = os.path.join(sys.prefix, 'assets', 'manage.py')

    self._projectName = self._config['name']

    self._deployment_path = self._config.get('deployment_path', '')
    self._zip_path = self._config.get('zip_path', '')
    self._doc_path = self._config.get('doc_path', '')
def save_training_TP_FP_using_voc(evaluation, img_names, in_path,
                                  out_folder_name=None, neg_thresh=0.3):
    '''Use the VOC scores to decide whether a patch should be saved as a TP
    or an FP, or not at all.
    '''
    assert out_folder_name is not None
    general_path = utils.get_path(neural=True, data_fold=utils.TRAINING,
                                  in_or_out=utils.IN,
                                  out_folder_name=out_folder_name)
    path_true = general_path + 'truepos_from_selective_search/'
    utils.mkdir(path_true)
    path_false = general_path + 'falsepos_from_selective_search/'
    utils.mkdir(path_false)

    for img_name in img_names:
        good_detections = defaultdict(list)
        bad_detections = defaultdict(list)
        img = cv2.imread(in_path + img_name, flags=cv2.IMREAD_COLOR)
        if img is None:  # cv2.imread returns None instead of raising
            print 'Cannot open image'
            sys.exit(-1)

        for roof_type in utils.ROOF_TYPES:
            detection_scores = evaluation.detections.best_score_per_detection[img_name][roof_type]
            for detection, score in detection_scores:
                if score > 0.5:
                    # true positive
                    good_detections[roof_type].append(detection)
                if score < neg_thresh:
                    # false positive
                    bad_detections[roof_type].append(detection)

        for roof_type in utils.ROOF_TYPES:
            extraction_type = 'good'
            save_training_FP_and_TP_helper(img_name, evaluation,
                                           good_detections[roof_type],
                                           path_true, general_path, img,
                                           roof_type, extraction_type,
                                           (0, 255, 0))
            extraction_type = 'background'
            save_training_FP_and_TP_helper(img_name, evaluation,
                                           bad_detections[roof_type],
                                           path_false, general_path, img,
                                           roof_type, extraction_type,
                                           (0, 0, 255))
def test():
    paths = glob(join(c.TEST_DIR, '*.png'))
    imgs = run(paths)
    for i, path in enumerate(paths):
        save_path = utils.get_path(join(c.SAVE_DIR, 'test/' + basename(path)))
        utils.save_output([imgs[i]], save_path)
def save_results(self, results):
    with open(
            utils.get_path(
                "data",
                self.player1.__class__.__name__ + "LosingResults.pkl"),
            "wb") as f:
        pickle.dump(results, f)
def scrap_a_day_as_corpus(self):
    urls = self._get_urls_from_breaking_news()
    n_successes = 0
    docs = []
    indexs = []
    oid_aids = []

    for i, url in enumerate(urls):
        try:
            json_dict = scrap(url)
            content = json_dict.get('content', '')
            if not content:
                continue
            index = '{}\t{}\t{}\t{}'.format(
                get_path(json_dict['oid'], self.year, self.month,
                         self.date, json_dict['aid']),
                json_dict.get('sid1', ''),
                json_dict.get('writtenTime', ''),
                json_dict.get('title', '')
            )
            docs.append(content.replace('\n', ' ').replace('\r\n', ' ').strip())
            indexs.append(index)
            oid_aids.append((json_dict['oid'], json_dict['aid']))
            n_successes += 1
        except Exception as e:
            print('Exception: {}\n{}'.format(url, str(e)))
            continue
        finally:
            if i % 1000 == 999:
                print('\r - {}scraping {} in {} ({} success) ...'.format(
                    self._name + (': ' if self._name else ''),
                    i + 1, len(urls), n_successes), flush=True, end='')

    print('\rScrapped news')
    return docs, indexs, oid_aids
def __init__(self, parameters=None, models_path=None, model_path=None):
    """
    Initialize the model. We either provide the parameters and a path where
    we store the models, or the location of a trained model.
    """
    if model_path is None:
        assert parameters and models_path
        # Create a name based on the parameters
        self.parameters = parameters
        self.name = get_name(parameters)
        # Model location
        model_path = os.path.join(models_path, get_path(parameters))
        self.model_path = model_path
        self.parameters_path = os.path.join(model_path, 'parameters.pkl')
        self.mappings_path = os.path.join(model_path, 'mappings.pkl')
        # Create directory for the model if it does not exist
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        # Save the parameters to disk (cPickle.dump returns None, so do not
        # assign its result to self.parameters)
        with open(self.parameters_path, 'wb') as f:
            cPickle.dump(parameters, f)
    else:
        assert parameters is None and models_path is None
        # Model location
        self.model_path = model_path
        self.parameters_path = os.path.join(model_path, 'parameters.pkl')
        self.mappings_path = os.path.join(model_path, 'mappings.pkl')
        # Load the parameters and the mappings from disk
        with open(self.parameters_path, 'rb') as f:
            self.parameters = cPickle.load(f)
        self.reload_mappings()
    self.components = {}
def main():
    size = 40
    total = spanning = 0
    graph = Graph(size)
    with get_path("data", "network.txt").open() as data_file:
        for u, line in enumerate(data_file):
            for v, w in enumerate(line.rstrip().split(",")):
                if v > u and w != "-":
                    w = int(w)
                    total += w
                    graph.insert_edge(u, v, w)

    # Implement the Prim–Jarník algorithm
    D = [float("inf")] * size
    root = 0  # can be any vertex of the graph
    heap = [(0, root)]
    seen = [False] * size
    while heap:
        w, u = heappop(heap)
        if not seen[u]:
            spanning += w
            seen[u] = True
            for e in graph.incident_edges(u):
                v = e.opposite(u)
                if e.weight < D[v]:
                    D[v] = e.weight
                    heappush(heap, (e.weight, v))
    return total - spanning
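# Note on the Prim–Jarník loop above: heapq has no decrease-key operation,
# so a vertex may be pushed once per improved edge weight; the `seen` check
# makes every pop after the first (cheapest) one a no-op, and D[] tracks the
# best known edge weight per vertex so strictly worse edges are never pushed.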
def __init__(self, config):
    self._cwd = os.getcwd()
    self._root = get_path()
    self._config = config
    self._verify_ssl = True

    if 'urls' not in self._config:
        raise MissingKeyError('Could not find url settings in either global or local configuration file.')
    if 'upload' not in self._config['urls']:
        raise MissingKeyError('Could not find an upload url in either the global or local configuration file.')
    else:
        self._upload_url = self._config['urls']['upload']
    if 'login_url' not in self._config['urls']:
        self._login_url = self._upload_url
    else:
        self._login_url = self._config['urls']['login_url']
    if 'username' not in self._config['urls']:
        self._username = raw_input('Please provide the username for your upload server (or leave blank if none is required): ')
    else:
        self._username = self._config['urls']['username'].encode()
    if 'password' not in self._config['urls']:
        self._password = getpass.getpass('Please provide the password for your upload server (or leave blank if none is required): ')
    else:
        self._password = self._config['urls']['password'].encode()

    self._zip_name = self._config['name'] + '_v' + self._config['version'] + '.zip'
    self._zip_path = os.path.join(self._cwd, 'build', self._zip_name)
def main():
    chars_saved = 0
    # From the problem definition:
    # You can assume that all the Roman numerals in the file contain no more
    # than four consecutive identical units.
    with get_path("data", "roman.txt").open() as data_file:
        for line in data_file:
            if "VIIII" in line:
                chars_saved += 3  # VIIII => IX
            elif "IIII" in line:
                chars_saved += 2  # IIII => IV
            if "LXXXX" in line:
                chars_saved += 3  # LXXXX => XC
            elif "XXXX" in line:
                chars_saved += 2  # XXXX => XL
            if "DCCCC" in line:
                chars_saved += 3  # DCCCC => CM
            elif "CCCC" in line:
                chars_saved += 2  # CCCC => CD
    return chars_saved
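# Worked example of the counting above: a line reading "XXXXVIIII" (49 written
# in 9 characters) minimises to "XLIX" (4 characters): 2 saved for XXXX => XL
# plus 3 saved for VIIII => IX, so chars_saved grows by 5 for that line.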
def _load_statistics():
    global _statistics
    filename = get_path('datasets', 'statistics.json')
    _statistics = load_data(filename, verbose=False)
    if not _statistics or not isinstance(_statistics, dict):
        _statistics = {}
        print('')
def main():
    args = parse_args()
    if args.path:
        paths = []
        for path in args.path:
            path_obj = get_path("python", path)
            if not path_obj.exists():
                print(f"File not found: {path}")
                return 1
            paths.append(path_obj)
    else:
        paths = sorted(get_path("python").glob("problem???.py"))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(add_docstrings(paths))
    return 0
def __init__(self):
    import os
    from utils import get_path

    self.categories_by_id = None
    self.categories_by_name = None
    self._filename = get_path('fs-categories.json')
    self._api_venues = Foursquare(client_id=_fsc.CLIENT_ID,
                                  client_secret=_fsc.CLIENT_SECRET)
    self.load_categories()
def main():
    paths = defaultdict(list)
    # for fold in [utils.TRAINING, utils.TESTING, utils.VALIDATION]:
    # for full_dataset in [True, False]:
    for fold in [utils.VALIDATION]:
        viola_path = utils.get_path(in_or_out=utils.OUT, viola=True, data_fold=fold)
        pipe_path = utils.get_path(in_or_out=utils.OUT, pipe=True, data_fold=fold)
        # for path in [viola_path, pipe_path]:
        for path in [viola_path]:
            for folder in os.listdir(path):
                if os.path.isfile(path + folder + '/report.txt'):
                    paths[fold].append((path, folder))
        process_viola_reports(paths[fold], fold=fold)
def predict_query(sparql, timeout, graph_pattern, source,
                  limit=config.PREDICTION_RESULT_LIMIT):
    """Performs a single query starting at ?SOURCE returning all ?TARGETs."""
    assert isinstance(graph_pattern, GraphPattern)
    assert isinstance(source, Identifier)

    vars_in_graph = graph_pattern.vars_in_graph
    if TARGET_VAR not in vars_in_graph:
        logger.warning(
            'graph pattern without %s used for prediction:\n%r',
            TARGET_VAR.n3(), graph_pattern
        )
        return timeout, []

    q = graph_pattern.to_sparql_select_query(
        projection=[TARGET_VAR],
        distinct=True,
        bind={SOURCE_VAR: source},
        limit=limit,
    )
    try:
        t, q_res = _query(sparql, timeout, q)
    except (SPARQLWrapperException, SAXParseException, URLError):
        logger.warning(
            'Exception occurred during prediction, assuming empty result...\n'
            'Query:\n%s\nException:', q,
            exc_info=1,  # appends exception to message
        )
        t, q_res = timeout, {}
    else:
        if query_time_soft_exceeded(t, timeout):
            kind = 'hard' if query_time_hard_exceeded(t, timeout) else 'soft'
            logger.info('prediction query exceeded %s timeout %s:\n%s',
                        kind, t, q)

    res = []
    res_rows_path = ['results', 'bindings']
    bindings = sparql_json_result_bindings_to_rdflib(
        get_path(q_res, res_rows_path, default=[])
    )
    for row in bindings:
        res.append(get_path(row, [TARGET_VAR]))
    return timeout, set(res)
def save_loss(self):
    '''Save the plot of the training and validation loss'''
    train_loss = [row['train_loss'] for row in self.train_history_]
    valid_loss = [row['valid_loss'] for row in self.train_history_]
    plt.plot(train_loss, label='train loss')
    plt.plot(valid_loss, label='valid loss')
    plt.legend(loc='best')
    path = utils.get_path(neural=True, in_or_out=utils.OUT,
                          data_fold=utils.TRAINING)
    plt.savefig(path + self.net_name + '_loss.png')
def setup_augmented_patches():
    '''
    No division between different roof sizes: if a roof has a size that is
    off, we resize it.
    Make them lie down, save patches to folder.
    Augment patches, save them to augmented folder.
    '''
    in_path = utils.get_path(in_or_out=utils.IN, data_fold=utils.TRAINING)
    out_path = utils.get_path(viola=True, in_or_out=utils.IN,
                              data_fold=utils.TRAINING)
    img_names_list = [img_name for img_name in os.listdir(in_path)
                      if img_name.endswith('.jpg')]

    for roof_type in ['metal', 'thatch']:
        for img_id, img_name in enumerate(img_names_list):
            print 'Processing image: {0}'.format(img_name)
            img_path = in_path + img_name

            polygon_list = DataLoader.get_polygons(
                roof_type=roof_type,
                xml_name=img_name[:-3] + 'xml',
                xml_path=in_path,
                padding=0)  # NOTE: the padding value was missing in the original; 0 assumed
            roof_patches = DataLoader.extract_patches(polygon_list,
                                                      img_path=img_path,
                                                      grayscale=True)

            for roof_id, roof_img in enumerate(roof_patches):
                print 'Processing image {0}: roof {1}'.format(img_id, roof_id)

                # if it's vertical, make it lie down
                if roof_img.shape[0] > roof_img.shape[1]:
                    roof_img = DataAugmentation.rotateImage(roof_img, clockwise=True)

                # write basic positive example to the right folder
                general_path = '{0}{1}_{2}_{3}'.format(out_path, roof_type,
                                                       img_name[:-4], roof_id)

                # calculate and write the augmented images
                for i in range(4):
                    roof_img_cp = np.copy(roof_img)
                    if i == 1:
                        roof_img_cp = cv2.flip(roof_img_cp, flipCode=0)
                    elif i == 2:
                        roof_img_cp = cv2.flip(roof_img_cp, flipCode=1)
                    elif i == 3:
                        roof_img_cp = cv2.flip(roof_img_cp, flipCode=-1)
                    write_to_path = '{0}_flip{1}.jpg'.format(general_path, i)
                    cv2.imwrite(write_to_path, roof_img_cp)
def main():
    names = get_path('data', 'names.txt').read_text().split('","')
    names[0] = names[0][1:]     # strip the leading quote
    names[-1] = names[-1][:-1]  # strip the trailing quote
    names.sort()
    values = {c: i for i, c in enumerate(string.ascii_uppercase, 1)}
    return sum(sum(values[c] for c in name) * pos
               for pos, name in enumerate(names, 1))
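# Worked example of the scoring above: "COLIN" is worth
# 3 + 15 + 12 + 9 + 14 = 53 and is the 938th name once sorted, so it
# contributes 938 * 53 = 49714 to the total.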
def get_detectors(combo_f):
    if combo_f.startswith('combo'):
        detector_file = utils.get_path(viola=True, params=True) + str(combo_f) + '.csv'
    else:
        detector_file = utils.get_path(viola=True, params=True) + 'combo' + str(combo_f) + '.csv'

    detectors = defaultdict(list)
    with open(detector_file, 'r') as csvfile:
        r = csv.reader(csvfile, delimiter=',')
        for line in r:
            if len(line) < 2:
                continue
            if line[0] == 'metal':
                detectors['metal'].append(line[1].strip())
            elif line[0] == 'thatch':
                detectors['thatch'].append(line[1].strip())
            else:
                raise ValueError("Unknown detector type {0}".format(line[0]))
    return detectors
def main():
    words = get_path('data', 'words.txt').read_text().split('","')
    words[0] = words[0][1:]     # strip the leading quote
    words[-1] = words[-1][:-1]  # strip the trailing quote
    char_map = {c: i for i, c in enumerate(string.ascii_uppercase, 1)}
    values = [sum(char_map[char] for char in word) for word in words]
    # Triangle numbers t_n = n(n+1)/2 up to max(values); solving
    # n(n+1)/2 <= m for n gives n <= (-1 + sqrt(1 + 8m)) / 2.
    max_n = (-1 + int(sqrt(1 + 8 * max(values)))) // 2
    triangle_numbers = {n * (n + 1) // 2 for n in range(1, max_n + 1)}
    return len([v for v in values if v in triangle_numbers])
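# Worked example: "SKY" has word value 19 + 11 + 25 = 55 = t10 (the 10th
# triangle number), so it counts as a triangle word.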
def vote():
    if request.method == "POST":
        vote = request.form["vote"]
        url = ("http://gdata.youtube.com/feeds/api/videos"
               "?q=%s&max-results=1&v=2&alt=jsonc" % urllib.quote_plus(vote))
        result = simplejson.load(urllib.urlopen(url))
        video_id = result['data']['items'][0]['id']
        utils.append(utils.get_path(RADIO_ROOT, 'to_process_votes'), video_id)
    return render_template("vote.html")
def get_all_books(self, source="ZhangBook", limit=1, **kwargs):
    spider = Cluster.get_spider()
    params = {
        "host": spider['host'],
        "port": spider['port'],
        "source": source,
        "path": "get_all_books"
    }
    path = get_path(**params)
    data = kwargs
    data["limit"] = limit
    return requests(path, data=json.dumps(data))
def copy_images(data_fold):
    if data_fold == utils.TRAINING:
        prefix = 'training_'
    elif data_fold == utils.VALIDATION:
        prefix = 'validation_'
    in_path = utils.get_path(in_or_out=utils.IN, data_fold=data_fold)
    for img_name in os.listdir(in_path):
        if img_name.endswith('jpg') or img_name.endswith('xml'):
            # copy the image over and save it with a prefix
            subprocess.check_call('cp {} {}'.format(in_path + img_name,
                                                    prefix + img_name),
                                  shell=True)