def test_zip():
    """Test contrib.tzip"""
    with closing(StringIO()) as our_file:
        a = range(9)
        b = [i + 1 for i in a]
        if sys.version_info[:1] < (3,):
            assert tzip(a, b, file=our_file) == zip(a, b)
        else:
            gen = tzip(a, b, file=our_file)
            assert gen != list(zip(a, b))
            assert list(gen) == list(zip(a, b))
def create_mosaic(image_list, flows):
    assert len(image_list) > 0, "List is empty."
    _, hm_h, hm_w = heatmap_for_image(image_list[0]).shape
    img_w, img_h = Image.open(image_list[0]).size

    flows = flows * (hm_w / img_w, hm_h / img_h)
    acc_flows = np.cumsum(flows, axis=0)
    acc_flows = acc_flows.round().astype(int)

    min_x, min_y = acc_flows.min(axis=0)
    max_x, max_y = acc_flows.max(axis=0) + (hm_w, hm_h) - 1
    acc_flows -= (min_x, min_y)

    mos_h, mos_w = (max_y - min_y + 1, max_x - min_x + 1)
    hms_mosaic = np.zeros((2, mos_h, mos_w), dtype=float)
    pms_mosaic = np.zeros((1, mos_h, mos_w), dtype=float)
    emb_mosaic = np.zeros((2, mos_h, mos_w), dtype=float)
    off_mosaic = np.zeros((2, mos_h, mos_w), dtype=float)
    mask = np.full((mos_h, mos_w), np.finfo(float).tiny)

    annotations = []
    for image, (tx, ty) in tzip(image_list, acc_flows):
        heatmaps, part_heatmaps, offsets, embeddings = mat_data_for_image(image)
        hms_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += heatmaps
        pms_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += part_heatmaps
        emb_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += embeddings
        off_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += offsets
        mask[ty:ty + hm_h, tx:tx + hm_w] += 1.0
        if (locations := annotation_for_image(image)) is not None:
            locations = locations * (hm_w / img_w, hm_h / img_h) + (tx, ty)
            annotations.extend(locations)
def ParseFiles(group, files, invalid):
    Cols = [
        '系所代碼', '准考證號碼', '姓名', '學校', '一上班排百分', '一下班排百分', '二上班排百分',
        '二下班排百分', '三上班排百分', '一上組排百分', '一下組排百分', '二上組排百分', '二下組排百分',
        '三上組排百分', '一上校排百分', '一下校排百分', '二上校排百分', '二下校排百分', '三上校排百分',
        'ParserInfo', 'FilePath'
    ]
    L = []
    for g, f in tzip(group, files):
        with open(f, 'rb') as ff:
            pdf = pdftotext.PDF(ff)
            try:
                L.append(eval(f'SSP.Parser_{g}.parse_info(pdf, f)') + [f])
            except Exception as e:
                print(f, g, e)
                L.append(
                    list(SSP.ScoreSheetParser.parse_generic_info(pdf, f)) +
                    [np.nan] * 16 +
                    [f"Parser {g} failed! Or it's a new pattern!? ", f])
    for f in invalid:
        t = f.split('/')
        L.append([t[1], t[2]] + [np.nan] * 17 + ['Raw File Not Exist!!', f])
    data = pd.DataFrame(L)
    data.columns = Cols
    data.to_csv('Out.csv', index=False)
def print_out_bleu_and_meteor_score(predicted_path, expected_path):
    scores = [('BLEU SCORE-1: ', []), ('BLEU SCORE-2: ', []),
              ('BLEU SCORE-3: ', []), ('BLEU SCORE-4: ', []),
              ('METEOR SCORE: ', [])]
    with open(predicted_path, 'r') as fp_pred, open(expected_path, 'r') as fp_exp:
        for prediction, expected in tzip(fp_pred, fp_exp):
            prediction = prediction.split(' ')
            expected_list = expected.split(' ')
            # nltk expects the reference(s) first and the hypothesis second;
            # references are passed as a list of token lists.
            scores[0][1].append(
                sentence_bleu([expected_list], prediction, weights=(1, 0, 0, 0)))
            scores[1][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 1, 0, 0)))
            scores[2][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 0, 1, 0)))
            scores[3][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 0, 0, 1)))
            # meteor_score also takes references first; plain strings are used
            # here (pre-3.6 nltk API).
            scores[4][1].append(meteor_score([expected], ' '.join(prediction)))
    for score in scores:
        print(score[0] + str(sum(score[1]) / len(score[1])))
    return 0
def sphere_log(data, scales=range(5, 9, 1), anisotropy_factor=5.0):
    data = asarray(data)
    scales = asarray(scales)
    log = empty((len(scales), ) + data.shape, dtype=data.dtype)
    for slog, scale in tzip(log, scales):
        slog[...] = scale**2 * gaussian_laplace(
            data, asarray([scale / anisotropy_factor, scale, scale]))
    peaks = local_minima(log)  # SZYX
    peaks_subset, peaks_list, threshold = get_peaks_subset(log, peaks, scales)
    return peaks_subset, peaks_list, log, peaks, threshold
def main(args):
    output_path = set_output_path(args.output_folder)
    detector = set_detector(args.detector)
    epoch_limits = get_epoch_limits(detector)

    #### Load counts and ancillary data from the catalogue
    (
        detids,
        obsids,
        obs_dates,
        det_obs_modes,
        det_rates,
        det_rates_err,
        det_filters,
    ) = load_data(args.sources_table, detector)

    #### Create pseudospectra
    default_rmf_file = set_rmf_file(
        obs_dates[0], det_obs_modes[0], detector, epoch_limits
    )
    spec_channels, spec_energies, spec_quality, spec_grouping = set_default_spectrum(
        default_rmf_file
    )

    for (
        detid,
        obsid,
        obs_date,
        det_obs_mode,
        det_rate,
        det_rate_err,
        det_filter,
    ) in tzip(
        detids, obsids, obs_dates, det_obs_modes, det_rates, det_rates_err, det_filters
    ):
        if not_detected(det_rate):
            continue

        rsp_file = set_rmf_file(obs_date, det_obs_mode, detector, epoch_limits)
        arf_file = set_arf_file(detector, det_filter)
        spec_rate, spec_rate_err = set_spec_count_rates(
            spec_energies, det_rate, det_rate_err
        )
        spec = set_spec_fits(
            spec_channels, spec_rate, spec_rate_err, spec_quality, spec_grouping
        )
        spec = update_spec_fits_header(
            spec, detector, det_filter, rsp_file, arf_file, len(spec_channels)
        )
        save_spec(spec, obsid, detid, detector, output_path)
def process(self):
    data = pickle.load(open(os.path.join(self.raw_dir, self.name), 'rb'))
    self.graphs = []
    self.labels = []
    self.all_seqs = []
    self.max_seq_length = 0
    self.max_node_id = 0
    self.max_num_unique_node = 0
    for sequences, y in tzip(data[0], data[1]):
        i = 0
        nodes = {}  # dict{15: 0, 16: 1, 18: 2, ...}
        senders = []
        unique_nodes = []
        for node in sequences:
            if node not in nodes:
                nodes[node] = i
                unique_nodes.append([node])
                i += 1
            senders.append(nodes[node])
        receivers = senders[:]
        del senders[-1]  # the last item is a receiver
        del receivers[0]  # the first item is a sender
        g = dgl.graph((senders, receivers), num_nodes=len(unique_nodes))
        g.ndata['x'] = torch.tensor(unique_nodes, dtype=torch.long)
        g.edata['w'] = torch.ones(g.num_edges(), dtype=torch.float)
        # print(f"\n{g.nodes()}, {g.edges()}, {g.ndata['x'].squeeze()}")
        self.graphs.append(g)
        self.all_seqs.append(sequences)
        self.labels.append(y)
        if max(sequences) > self.max_node_id:
            self.max_node_id = max(sequences)
        if len(unique_nodes) > self.max_num_unique_node:
            self.max_num_unique_node = len(unique_nodes)
        if len(sequences) > self.max_seq_length:
            self.max_seq_length = len(sequences)
    # Convert the label list to tensor for saving.
    self.num_graphs = len(self.graphs)
    self.num_labels = len(self.labels)
    self.max_labels = max(self.labels)
    self.labels = torch.LongTensor(self.labels)
def run_vis_completion():
    def concat_tile(im_list_2d):
        return cv2.vconcat(
            [cv2.hconcat(im_list_h) for im_list_h in im_list_2d])

    def scale_to_height(img, height):
        """Resize the image to the given height, keeping its aspect ratio."""
        h, w = img.shape[:2]
        width = round(w * (height / h))
        dst = cv2.resize(img, dsize=(width, height))
        return dst

    dir_path = pathlib.Path('./sample_code/sample_images/input')
    dpaths = sorted(list(dir_path.glob('./*.png')))
    cpaths = sorted(list(dir_path.glob('./*.jpg')))
    print(dpaths)
    vis_list = list()
    for dpath, cpath in tzip(dpaths, cpaths):
        color = cv2.imread(str(cpath), cv2.IMREAD_COLOR)
        depth = cv2.imread(str(dpath), cv2.IMREAD_ANYDEPTH)
        depth_c = one_image_completion(str(dpath))
        depth_vis = cv2.convertScaleAbs(np.repeat(depth[:, :, None], 3, axis=2),
                                        alpha=(255.0 / 65535.0))
        depth_c_vis = cv2.convertScaleAbs(np.repeat(depth_c[:, :, None], 3, axis=2),
                                          alpha=(255.0 / 65535.0))
        vis = cv2.vconcat([color, depth_vis, depth_c_vis])
        vis = scale_to_height(vis, 800)
        vis_list.append(vis)
        cv2.imshow('out', vis)
        cv2.waitKey(100)
    dt_now = datetime.datetime.now()
    dt_str = dt_now.strftime('%Y-%m-%d-%H')
    vis_all = cv2.hconcat(vis_list)
    cv2.imwrite('./sample_code/sample_images/output/' + dt_str + '.png', vis_all)
    cv2.imshow('vis all', vis_all)
    cv2.waitKey(0)
def recommend(self, userlist_path, out_path):
    model = self._get_model()
    user_list = np.load(
        open(conf.root + 'res/user_list_' + str(self.max_len) + '.npy', 'rb'))
    with open(out_path, 'w') as fout:
        users = [u.strip() for u in open(userlist_path)]
        seens = self._get_seens(users)  # positive list
        test_X = self._get_articles(users)
        mp_list = self.mp.get_recommend_list(
            seens, topn=self.topn
        )  # user cold start: how to handle it? => for now, take from most popular
        for user, articles in tzip(users, test_X):
            recommend = []
            left = []
            if len(articles) == 0:  # the user has no reading history (MP)
                recommend = mp_list
            else:  # the user has a reading history (KNN)
                pred = model.kneighbors([articles])
                sim_users = pred[1][0]
                dist = pred[0][0]
                for i, u in enumerate(sim_users):
                    if dist[i] != 0:
                        recommend += self.dictionary[user_list[u]]
                if user in self.dictionary.keys():
                    recommend = list(
                        set(recommend) - set(self.dictionary[user]))
                else:
                    recommend = list(set(recommend))
            # np.random.shuffle(recommend)
            if len(recommend) < 100:
                recommend += mp_list
                recommend = list(set(recommend))
            fout.write('%s %s\n' % (user, ' '.join(recommend[:100])))
def main(args):
    spec_folder = Path(args.spec_folder)
    results_folder = Path(args.results_folder)
    lastsource_file = Path(args.file_lastsource)

    first_source = get_last_source_fit(lastsource_file)
    if first_source == 0:
        check_results_folder(results_folder)

    obsids, detids, redshifts, nhgals = get_sources_data(
        args.sources_table, args.racol, args.deccol, args.zcol, args.nhcol,
        first_source)

    for obsid, detid, z, nh, current_source in tzip(obsids, detids, redshifts,
                                                    nhgals, count(first_source)):
        try:
            fit_detection(z, nh, obsid, detid, results_folder, spec_folder,
                          args.fixgamma)
            update_last_source_fit(current_source + 1, lastsource_file)
        except Exception as e:
            logging.error(e)
            logging.error(f"Something went wrong fitting detection {detid}")
def create_panorama(image_list, flows, image_size=(1024, 1024)):
    assert len(image_list) > 0, "List is empty."
    img_w, img_h = Image.open(image_list[0]).size
    hm_w, hm_h = image_size

    flows = flows * (hm_w / img_w, hm_h / img_h)
    flows = flows.round().astype(int)
    acc_flows = np.cumsum(flows, axis=0)
    acc_flows = acc_flows.astype(int)

    min_x, min_y = acc_flows.min(axis=0)
    max_x, max_y = acc_flows.max(axis=0) + (hm_w, hm_h) - 1
    acc_flows -= (min_x, min_y)

    shape = (max_y - min_y + 1, max_x - min_x + 1, 3)
    panorama = np.zeros(shape, dtype=np.uint8)

    first_img = read_image(image_list[0],
                           reshape_size=(hm_w, hm_h))[:, :int(hm_h / 2)]
    tx, ty = acc_flows[0]
    panorama[ty:ty + hm_h, tx:tx + int(hm_w / 2)] = first_img

    last_img = read_image(image_list[-1],
                          reshape_size=(hm_w, hm_h))[:, int(hm_h / 2):]
    tx, ty = acc_flows[-1]
    panorama[ty:ty + hm_h, tx + int(hm_w / 2):tx + hm_w] = last_img

    for image, (tx, ty), (dx, _) in tzip(image_list, acc_flows, flows):
        img = read_image(
            image, reshape_size=image_size)[:, hm_w - dx - int(hm_w / 2):
                                            hm_w - int(hm_w / 2)]
        panorama[ty:ty + hm_h,
                 tx + hm_w - dx - int(hm_w / 2):tx + hm_w - int(hm_w / 2)] = img
    return panorama
def zipper(iterable1: Iterable, iterable2, verbose: bool, **kwargs):
    if not verbose:
        return zip(iterable1, iterable2)
    return tzip(iterable1, iterable2, **kwargs)
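# A minimal usage sketch for the zipper helper above (not part of the original
# snippet): xs/ys are hypothetical sample data, and any extra keyword arguments
# are simply forwarded to tqdm (e.g. desc, total) when verbose=True.
xs = [1, 2, 3]
ys = ["a", "b", "c"]
for x, y in zipper(xs, ys, verbose=True, desc="pairing"):
    print(x, y)  # shows a progress bar; with verbose=False it is a plain zip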
def create_csv(
    orig_tsv_file,
    csv_file,
    data_folder,
    accented_letters=False,
    language="en",
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    csv_file : str
        Path of the output csv file.
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.

    Returns
    -------
    None
    """
    # Check if the given files exist
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header
    loaded_csv = open(orig_tsv_file, "r").readlines()[1:]
    nb_samples = str(len(loaded_csv))

    msg = "Preparing CSV files for %s samples ..." % (str(nb_samples))
    logger.info(msg)

    # Adding some prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [[
        "ID", "duration", "wav", "wav_format", "wav_opts", "spk_id",
        "spk_id_format", "spk_id_opts", "wrd", "wrd_format", "wrd_opts",
        "char", "char_format", "char_opts",
    ]]

    # Start processing lines
    total_duration = 0.0

    for line in tzip(loaded_csv):
        line = line[0]

        # Path is at index 1 in Common Voice tsv files, and .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + line.split("\t")[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = line.split("\t")[0]
        snt_id = file_name

        # Reading the signal (to retrieve duration in seconds)
        if os.path.isfile(mp3_path):
            info = torchaudio.info(mp3_path)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = line.split("\t")[2]

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.
        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ]+", " ", words).upper()
        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()

        # Remove accents if specified
        if not accented_letters:
            nfkd_form = unicodedata.normalize("NFKD", words)
            words = "".join(
                [c for c in nfkd_form if not unicodedata.combining(c)])
            words = words.replace("'", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.lstrip().rstrip()

        # Getting chars
        chars = words.replace(" ", "_")
        chars = " ".join([char for char in chars][:])

        # Remove too short sentences (or empty):
        if len(words) < 3:
            continue

        # Composition of the csv_line
        csv_line = [
            snt_id, str(duration), mp3_path, "wav", "", spk_id, "string", "",
            str(words), "string", "", str(chars), "string", "",
        ]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)
train_hist['total_time'] = []

print('training start!')
start_time = time.time()
real = torch.ones(args.batch_size, 1, args.input_size // 4,
                  args.input_size // 4).to(device)
fake = torch.zeros(args.batch_size, 1, args.input_size // 4,
                   args.input_size // 4).to(device)
for epoch in range(args.train_epoch):
    epoch_start_time = time.time()
    G_decoder.train()
    G_scheduler.step()
    D_scheduler.step()
    Disc_losses = []
    Gen_losses = []
    Con_losses = []
    for (x, _), (y, _) in tzip(train_loader_src, train_loader_tgt):
        e = y[:, :, :, args.input_size:]
        y = y[:, :, :, :args.input_size]
        x, y, e = x.to(device), y.to(device), e.to(device)

        # train D
        D_optimizer.zero_grad()

        D_real = D(y)
        D_real_loss = BCE_loss(D_real, real)

        G_ = G_decoder(x)[0]
        D_fake = D(G_)
        D_fake_loss = BCE_loss(D_fake, fake)

        D_edge = D(e)
def run(
    dir_MRI="data/ALFA_PET",
    dir_PET="data/ALFA_PET",
    dir_RR="data/Atlas/CL_2mm",
    outfile="data/ALFA_PET/Quant_realigned.csv",
    glob_PET="*_PET.nii.gz",
    glob_MRI="*_MRI.nii.gz",
):
    """
    Args:
        dir_MRI (str or Path): MRI directory
        dir_PET (str or Path): PET directory
        dir_RR (str or Path): Reference regions ROIs directory
            (standard Centiloid RR from the GAAIN Centiloid website: 2mm, NIfTI)
        outfile (str or Path): Output quantification file
    Returns:
        fname (list[str])
        greyCerebellum (list[float])
        wholeCerebellum (list[float])
        wholeCerebellumBrainStem (list[float])
        pons (list[float])
    """
    # PET & MR image lists
    s_PET_dir = list(tmap(gunzip, Path(dir_PET).glob(glob_PET), leave=False))
    s_MRI_dir = list(tmap(gunzip, Path(dir_MRI).glob(glob_MRI), leave=False))

    if len(s_PET_dir) != len(s_MRI_dir):
        raise IndexError("Different number of PET and MR images")

    eng = get_matlab()
    dir_spm = fspath(Path(eng.which("spm")).parent)

    for d_PET, d_MRI in tzip(s_PET_dir, s_MRI_dir):
        with tic("Step 0: Reorient PET subject"):
            eng.f_acpcReorientation(d_PET, nargout=0)

        with tic("Step 0: Reorient MRI subject"):
            eng.f_acpcReorientation(d_MRI, nargout=0)

        with tic("Step 1: CorregisterEstimate"):
            eng.f_1CorregisterEstimate(d_MRI, dir_spm, nargout=0)
        # Check Reg

        with tic("Step 2: CorregisterEstimate"):
            eng.f_2CorregisterEstimate(d_MRI, d_PET, nargout=0)
        # Check Reg

        with tic("Step 3: Segment"):
            eng.f_3Segment(d_MRI, dir_spm, nargout=0)

        with tic("Step 4: Normalise"):
            d_file_norm = fspath(Path(d_MRI).parent / ("y_" + Path(d_MRI).name))
            eng.f_4Normalise(d_file_norm, d_MRI, d_PET, nargout=0)

    s_PET = list(
        map(
            fspath,
            Path(dir_PET).glob("w" + (glob_PET[:-3] if glob_PET.lower().
                                      endswith(".gz") else glob_PET))))
    res = eng.f_Quant_centiloid(s_PET, fspath(dir_RR), nargout=5)
    if outfile:
        with open(outfile, "w") as fd:
            f = csv_writer(fd)
            f.writerow(("Fname", "GreyCerebellum", "WholeCerebellum",
                        "WholeCerebellumBrainStem", "Pons"))
            f.writerows(zip(*res))
    return res
def applySmartBatching(self, data, mask, target=None, index=None,
                       text="Iteration:"):
    data = np.stack(data)
    mask = np.stack(mask)
    if target is not None and index is None:
        pass  # target provided
    elif index is not None and target is None:
        pass  # index provided
    else:
        logging.warning("Provide exactly one of target or index.")

    def getArrayLength(x):
        return sum(x != 0)

    length_array = np.apply_along_axis(getArrayLength,
                                       np.stack(data).ndim - 1, np.stack(data))
    while length_array.ndim > 1:
        length_array = np.max(length_array, axis=1)

    sort_idx = length_array.argsort()
    length_array = length_array[sort_idx]
    data = data[sort_idx]
    mask = mask[sort_idx]
    if target is not None and index is None:
        target = target[sort_idx]
    elif index is not None and target is None:
        index = index[sort_idx]
    else:
        logging.warning("Provide exactly one of target or index.")

    data_batch = list()
    mask_batch = list()
    if target is not None and index is None:
        target_batch = list()
    elif index is not None and target is None:
        index_batch = list()
    else:
        logging.warning("Provide exactly one of target or index.")

    pbar = tqdm(total=len(data), desc="Apply dynamic batching")
    while len(data) > 0:
        to_take = min(self.train_batchSize, len(data))
        select = random.randint(0, len(data) - to_take)
        max_batch_len = max(length_array[select:select + to_take])

        data_batch += [
            torch.tensor(data[select:select + to_take][..., :max_batch_len],
                         dtype=torch.long)
        ]
        mask_batch += [
            torch.tensor(mask[select:select + to_take][..., :max_batch_len],
                         dtype=torch.long)
        ]
        if target is not None and index is None:
            target_batch += [
                torch.tensor(target[select:select + to_take], dtype=torch.long)
            ]
        elif index is not None and target is None:
            index_batch += [
                torch.tensor(index[select:select + to_take], dtype=torch.long)
            ]
        else:
            logging.error("Provide exactly one of target or index.")

        length_array = np.delete(length_array, np.s_[select:select + to_take], 0)
        data = np.delete(data, np.s_[select:select + to_take], 0)
        mask = np.delete(mask, np.s_[select:select + to_take], 0)
        if target is not None and index is None:
            target = np.delete(target, np.s_[select:select + to_take], 0)
        elif index is not None and target is None:
            index = np.delete(index, np.s_[select:select + to_take], 0)
        else:
            logging.warning("Provide exactly one of target or index.")
        pbar.update(to_take)
    pbar.close()

    if target is not None and index is None:
        return tzip(data_batch, mask_batch, target_batch, desc=text)
    elif index is not None and target is None:
        return tzip(data_batch, mask_batch, index_batch, desc=text)
    else:
        return tzip(data_batch, mask_batch, desc=text)
def preprocess_data_test(
        ticker,
        column_scaler,
        n_steps=50,
        lookup_step=1,
        feature_columns=['low', 'high', 'open', 'volume', 'hour', 'minute']):
    total_df, dfs = load_data(train=False)
    # df = dfs[0]

    # this will contain all the elements we want to return from this function
    result = dict()
    result["column_scaler"] = column_scaler
    # we will also return the original dataframe itself
    result['total_df'] = total_df.copy()
    # result['dfs'] = [df.copy() for df in dfs]

    # # # make sure that the passed feature_columns exist in the dataframe
    # # for col in feature_columns:
    # #     assert col in df.columns, f"'{col}' does not exist in the dataframe."
    # #
    # # # add date as a column
    # # if "date" not in df.columns:
    # #     df["date"] = df.index

    # if scale:
    #     total_df['low'] = np.log(total_df['low'])
    #     total_df['high'] = np.log(total_df['high'])
    #     total_df['open'] = np.log(total_df['open'])
    #     total_df['close'] = np.log(total_df['close'])
    #     total_df['volume'] = np.log(total_df['volume'])
    #
    #     column_scaler = {}
    #     # scale the data (prices) from 0 to 1
    #     for column in feature_columns + ['close']:
    #         scaler = column_scaler[column]
    #         total_df[column] = scaler.fit_transform(
    #             np.expand_dims(total_df[column].values, axis=1))
    #
    #     # add the MinMaxScaler instances to the result returned
    #     result["column_scaler"] = column_scaler

    # all_sequence_data = list()
    print('Creating sequences...')
    # with tqdm(total=len(total_df)) as t:
    # for df in dfs:
    total_df['low'] = np.log(total_df['low'])
    total_df['high'] = np.log(total_df['high'])
    total_df['open'] = np.log(total_df['open'])
    total_df['close'] = np.log(total_df['close'])
    total_df['volume'] = np.log(total_df['volume'])

    for column in feature_columns + ['close']:
        scaler = column_scaler[column]
        total_df[column] = scaler.transform(
            np.expand_dims(total_df[column].values, axis=1))

    # add the target column (label) by shifting by `lookup_step`
    total_df['future'] = total_df['close'].shift(-lookup_step)
    del total_df['close']

    # # last `lookup_step` columns contains NaN in future column
    # # get them before dropping NaNs
    # last_sequence = np.array(total_df[feature_columns].tail(lookup_step))

    # drop NaNs
    total_df.dropna(inplace=True)

    sequence_data = list()
    sequences = deque(maxlen=n_steps)
    for entry, target in tzip(total_df[feature_columns + ['time']].values,
                              total_df['future'].values):
        sequences.append(entry)
        if len(sequences) == n_steps:
            sequence_data.append([np.array(sequences), target])
    # all_sequence_data.extend(sequence_data)

    # # get the last sequence by appending the last `n_step` sequence with `lookup_step` sequence
    # # for instance, if n_steps=50 and lookup_step=10, last_sequence should be of 60 (that is 50+10) length
    # # this last_sequence will be used to predict future stock prices that are not available in the dataset
    # last_sequence = list([s[:len(feature_columns)] for s in sequences]) + list(last_sequence)
    # last_sequence = np.array(last_sequence).astype(np.float32)
    # # add to result
    # result['last_sequence'] = last_sequence

    # construct the X's and y's
    X, y = list(), list()
    print('Appending...')
    for seq, target in tqdm(sequence_data):
        X.append(seq)
        y.append(target)

    # convert to numpy arrays
    X = np.array(X)
    y = np.array(y)
    result['X'] = X
    result['y'] = y

    # if split_by_date:
    #     # split the dataset into training & testing sets by date (not randomly splitting)
    #     train_samples = int((1 - test_size) * len(X))
    #     result["X_train"] = X[:train_samples]
    #     result["y_train"] = y[:train_samples]
    #     result["X_test"] = X[train_samples:]
    #     result["y_test"] = y[train_samples:]
    #     if shuffle:
    #         # shuffle the datasets for training (if shuffle parameter is set)
    #         shuffle_in_unison(result["X_train"], result["y_train"])
    #         shuffle_in_unison(result["X_test"], result["y_test"])
    # else:
    #     # split the dataset randomly
    #     result["X_train"], result["X_test"], result["y_train"], result["y_test"] = \
    #         train_test_split(X, y, test_size=test_size, shuffle=shuffle)

    # # get the list of test set dates
    # dates = result["X_test"][:, -1, -1]
    # result['dates'] = result['X_test'][:, :, -1]
    # # [result['total_df'][result['total_df']['time'].isin(dates)] for dates in result['X_test'][:, :, -1]]
    # # # retrieve test features from the original dataframe
    # # result["test_df"] = result["df"].loc[dates]

    print('Creating test df...')
    dates = result["X"][:, -1, -1]
    result["test_df"] = result["total_df"][result["total_df"]['time'].isin(dates)]
    # result["test_df"] = result["total_df"][result["total_df"]['time'].isin(dates)]

    # # # remove duplicated dates in the testing dataframe
    # # result["test_df"] = result["test_df"][~result["test_df"].index.duplicated(keep='first')]

    # # remove dates from the training/testing sets & convert to float32
    result["X"] = result["X"][:, :, :len(feature_columns)].astype(np.float32)

    return result
def train(args, global_model, raw_data_train, raw_data_test):
    start_time = time.time()

    user_list = list(raw_data_train[2].keys())[:100]
    nusers = len(user_list)

    cluster_models = [copy.deepcopy(global_model)]
    del global_model
    cluster_models[0].to(device)
    # all users assigned to a single cluster_model in the beginning
    cluster_assignments = [user_list.copy()]

    if args.cfl_wsharing:
        shaccumulator = Accumulator()

    if args.frac == -1:
        m = args.cpr
        if m > nusers:
            raise ValueError(
                f"Clients Per Round: {args.cpr} is greater than number of users: {nusers}"
            )
    else:
        m = max(int(args.frac * nusers), 1)
    print(f"Training {m} users each round")
    print(f"Trying to split after every {args.cfl_split_every} rounds")

    train_loss, train_accuracy = [], []

    for epoch in range(args.epochs):
        # CFL
        if (epoch + 1) % args.cfl_split_every == 0:
            all_losses = []
            new_cluster_models, new_cluster_assignments = [], []
            for cidx, (cluster_model, assignments) in enumerate(
                    tzip(cluster_models, cluster_assignments,
                         desc="Try to split each cluster")):
                # First, train all models in cluster
                local_weights = []
                for user in tqdm(assignments,
                                 desc="Train ALL users in the cluster",
                                 leave=False):
                    local_model = LocalUpdate(args=args,
                                              raw_data=raw_data_train,
                                              user=user)
                    w, loss = local_model.update_weights(
                        copy.deepcopy(cluster_model),
                        local_ep_override=args.cfl_local_epochs)
                    local_weights.append(copy.deepcopy(w))
                    all_losses.append(loss)

                # record shared weights so far
                if args.cfl_wsharing:
                    shaccumulator.add(local_weights)

                weight_updates = subtract_weights(local_weights,
                                                  cluster_model.state_dict(),
                                                  args)
                similarities = pairwise_cossim(weight_updates)

                max_norm = compute_max_update_norm(weight_updates)
                mean_norm = compute_mean_update_norm(weight_updates)

                # wandb.log({"mean_norm / eps1": mean_norm, "max_norm / eps2": max_norm}, commit=False)
                split = (mean_norm < args.cfl_e1 and max_norm > args.cfl_e2
                         and len(assignments) > args.cfl_min_size)
                print(f"CIDX: {cidx}[{len(assignments)}] elem")
                print(f"mean_norm: {(mean_norm):.4f}; max_norm: {(max_norm):.4f}")
                print(f"split? {split}")

                if split:
                    c1, c2 = cluster_clients(similarities)
                    assignments1 = [assignments[i] for i in c1]
                    assignments2 = [assignments[i] for i in c2]
                    new_cluster_assignments += [assignments1, assignments2]
                    print(
                        f"Cluster[{cidx}][{len(assignments)}] -> ({len(assignments1)}, {len(assignments2)})"
                    )

                    local_weights1 = [local_weights[i] for i in c1]
                    local_weights2 = [local_weights[i] for i in c2]

                    cluster_model.load_state_dict(average_weights(local_weights1))
                    new_cluster_models.append(cluster_model)

                    cluster_model2 = copy.deepcopy(cluster_model)
                    cluster_model2.load_state_dict(average_weights(local_weights2))
                    new_cluster_models.append(cluster_model2)
                else:
                    cluster_model.load_state_dict(average_weights(local_weights))
                    new_cluster_models.append(cluster_model)
                    new_cluster_assignments.append(assignments)

            # Write everything
            cluster_models = new_cluster_models
            if args.cfl_wsharing:
                shaccumulator.write(cluster_models)
                shaccumulator.flush()
            cluster_assignments = new_cluster_assignments
            train_loss.append(sum(all_losses) / len(all_losses))

        # Regular FedAvg
        else:
            all_losses = []

            # Do FedAvg for each cluster
            for cluster_model, assignments in tzip(
                    cluster_models, cluster_assignments,
                    desc="Train each cluster through FedAvg"):
                if args.sample_dist == "uniform":
                    sampled_users = random.sample(assignments, m)
                else:
                    xs = np.linspace(-args.sigm_domain, args.sigm_domain,
                                     len(assignments))
                    sigmdist = 1 / (1 + np.exp(-xs))
                    sampled_users = np.random.choice(assignments, m,
                                                     p=sigmdist / sigmdist.sum())

                local_weights = []
                for user in tqdm(sampled_users,
                                 desc="Training Selected Users",
                                 leave=False):
                    local_model = LocalUpdate(args=args,
                                              raw_data=raw_data_train,
                                              user=user)
                    w, loss = local_model.update_weights(
                        copy.deepcopy(cluster_model))
                    local_weights.append(copy.deepcopy(w))
                    all_losses.append(loss)

                # update global and shared weights
                if args.cfl_wsharing:
                    shaccumulator.add(local_weights)
                new_cluster_weights = average_weights(local_weights)
                cluster_model.load_state_dict(new_cluster_weights)

            if args.cfl_wsharing:
                shaccumulator.write(cluster_models)
                shaccumulator.flush()
            train_loss.append(sum(all_losses) / len(all_losses))

        # Calculate avg training accuracy over all users at every epoch,
        # regardless of whether it was a CFL step or not
        test_acc, test_loss = [], []
        for cluster_model, assignments in zip(cluster_models,
                                              cluster_assignments):
            for user in assignments:
                local_model = LocalUpdate(args=args,
                                          raw_data=raw_data_test,
                                          user=user)
                acc, loss = local_model.inference(model=cluster_model)
                test_acc.append(acc)
                test_loss.append(loss)
        train_accuracy.append(sum(test_acc) / len(test_acc))

        wandb.log({
            "Train Loss": train_loss[-1],
            "Test Accuracy": (100 * train_accuracy[-1]),
            "Clusters": len(cluster_models)
        })
        print(
            f"Train Loss: {train_loss[-1]:.4f}\t Test Accuracy: {(100 * train_accuracy[-1]):.2f}%"
        )

    print(f"Results after {args.epochs} global rounds of training:")
    print("Avg Train Accuracy: {:.2f}%".format(100 * train_accuracy[-1]))
    print(f"Total Run Time: {(time.time() - start_time):0.4f}")
fake3 = torch.zeros(args.batch_size, 1, args.input_size // 4,
                    args.input_size // 4).to(device)
for epoch in range(args.train_epoch):
    epoch_start_time = time.time()
    G_e.eval()
    D_scheduler.step()
    D1_scheduler.step()
    D2_scheduler.step()
    D3_scheduler.step()
    Disc_losses = []
    Gen_losses = []
    Con_losses = []
    for (x, _), (y, _), (y1, _), (y2, _), (y3, _) in tzip(
            train_loader_src, train_loader_tgt, train_loader_tgt1,
            train_loader_tgt2, train_loader_tgt3):
        e = y[:, :, :, args.input_size:]
        y = y[:, :, :, :args.input_size]
        x, y, e, y1, y2, y3 = x.to(device), y.to(device), e.to(device), y1.to(
            device), y2.to(device), y3.to(device)

        # train D
        D_optimizer.zero_grad()
        D1_optimizer.zero_grad()
        D2_optimizer.zero_grad()
        D3_optimizer.zero_grad()

        D_real = D(y)
        D_real_loss = BCE_loss(D_real, real)
    np.array([id_tr[i] for i in l_out_label]).astype(int),
)

if args.spk_utt_all_combinations:
    all_combination = list(itertools.product(u_out, l_out))
    all_combination_label = list(itertools.product(u_out_label, l_out_label))
    u_out = []
    l_out = []
    u_out_label = np.array([])
    l_out_label = np.array([])
    for (u, l), (u_label, l_label) in tzip(all_combination,
                                           all_combination_label):
        if u_label != l_label:
            continue
        if len(u_out) == 0:
            u_out = np.array([u])
            l_out = np.array([l])
        else:
            u_out = np.append(u_out, [u], axis=0)
            l_out = np.append(l_out, [l], axis=0)
        u_out_label = np.append(u_out_label, u_label)
        l_out_label = np.append(l_out_label, l_label)

    print("x_vector_l samples after all_combination:", len(l_out))
    print("x_vector_u samples after all_combination:", len(u_out))
for cam in CAM_SETS:
    CAM_PATH = os.path.join(DATASET_PATH, cam)
    IMG_PATH = os.path.join(CAM_PATH, 'image_2')
    ANN_PATH = os.path.join(CAM_PATH, 'label_2')
    # CALIB_PATH = os.path.join(CAM_PATH, 'calib')
    # CALIB_FILE = os.path.join(CALIB_PATH, '000000.txt')
    # shutil.move(CALIB_FILE, OUT_CALIB_PATH)
    img_list = os.listdir(IMG_PATH)
    # Sorting is needed here because listdir returns entries in arbitrary order.
    img_list.sort(key=lambda x: int(x[:-4]))
    ann_list = os.listdir(ANN_PATH)
    ann_list.sort(key=lambda x: int(x[:-4]))
    print('moving {} imgs/anns from {} to {}'.format(len(img_list), IMG_PATH,
                                                     OUT_PATH))
    print('with index start with *{}*'.format(count + 1))
    for img, ann in tzip(img_list, ann_list):
        count += 1
        ann_ori_path = os.path.join(ANN_PATH, ann)
        ann_dst_path = os.path.join(OUT_ANN_PATH, '{:06d}.txt'.format(count))
        img_ori_path = os.path.join(IMG_PATH, img)
        img_dst_path = os.path.join(OUT_IMG_PATH, '{:06d}.png'.format(count))
        shutil.move(ann_ori_path, ann_dst_path)
        shutil.move(img_ori_path, img_dst_path)
def create_csv(wav_list, csv_file):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_list : list of str
        The list of wav files.
    csv_file : str
        The path of the output csv file.
    """
    # Adding some prints
    msg = f"Creating csv lists in {csv_file} ..."
    logger.info(msg)

    csv_lines = []

    # Start processing lines
    total_duration = 0.0

    # Starting index
    idx = 0

    for wav_file in tzip(wav_list):
        wav_file = wav_file[0]

        path_parts = wav_file.split(os.path.sep)
        file_name, wav_format = os.path.splitext(path_parts[-1])

        # Peeking at the signal (to retrieve duration in seconds)
        if os.path.isfile(wav_file):
            info = torchaudio.info(wav_file)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        audio_duration = info.num_frames / info.sample_rate
        total_duration += audio_duration

        # Actual name of the language
        language = path_parts[-4]

        # Create a row with the whole utterances
        csv_line = [
            idx,  # ID
            wav_file,  # File name
            wav_format,  # File format
            str(info.num_frames / info.sample_rate),  # Duration (sec)
            language,  # Language
        ]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

        # Increment index
        idx += 1

    # CSV column titles
    csv_header = ["ID", "wav", "wav_format", "duration", "language"]

    # Add titles to the list at index 0
    csv_lines.insert(0, csv_header)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = f"{csv_file} successfully created!"
    logger.info(msg)
    msg = f"Number of samples: {len(wav_list)}."
    logger.info(msg)
    msg = f"Total duration: {round(total_duration / 3600, 2)} hours."
    logger.info(msg)
def create_csv(orig_tsv_file,
               csv_file,
               data_folder,
               accented_letters=False,
               language="en"):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    csv_file : str
        Path of the output csv file.
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.

    Returns
    -------
    None
    """
    # Check if the given files exist
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header
    loaded_csv = open(orig_tsv_file, "r").readlines()[1:]
    nb_samples = str(len(loaded_csv))

    msg = "Preparing CSV files for %s samples ..." % (str(nb_samples))
    logger.info(msg)

    # Adding some prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [["ID", "duration", "wav", "spk_id", "wrd"]]

    # Start processing lines
    total_duration = 0.0

    for line in tzip(loaded_csv):
        line = line[0]

        # Path is at index 1 in Common Voice tsv files, and .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + line.split("\t")[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = line.split("\t")[0]
        snt_id = file_name

        # Setting torchaudio backend to sox-io (needed to read mp3 files)
        if torchaudio.get_audio_backend() != "sox_io":
            logger.warning("This recipe needs the sox-io backend of torchaudio")
            logger.warning("The torchaudio backend is changed to sox_io")
            torchaudio.set_audio_backend("sox_io")

        # Reading the signal (to retrieve duration in seconds)
        if os.path.isfile(mp3_path):
            info = torchaudio.info(mp3_path)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = line.split("\t")[2]

        # Unicode normalization
        words = unicode_normalisation(words)

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.
        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^’'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿéæœâçèàûî]+", " ",
                           words).upper()
            if language == "fr":
                # Replace J'y D'hui etc by J_ D_hui
                words = words.replace("'", " ")
                words = words.replace("’", " ")
        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()
        elif language == "ga-IE":
            # Irish lower() is complicated, but upper() is nondeterministic,
            # so use lowercase
            def pfxuc(a):
                return len(a) >= 2 and a[0] in "tn" and a[1] in "AEIOUÁÉÍÓÚ"

            def galc(w):
                return w.lower() if not pfxuc(w) else w[0] + "-" + w[1:].lower()

            words = re.sub("[^-A-Za-z'ÁÉÍÓÚáéíóú]+", " ", words)
            words = " ".join(map(galc, words.split(" ")))

        # Remove accents if specified
        if not accented_letters:
            words = strip_accents(words)
            words = words.replace("'", " ")
            words = words.replace("’", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.lstrip().rstrip()

        # Getting chars
        chars = words.replace(" ", "_")
        chars = " ".join([char for char in chars][:])

        # Remove too short sentences (or empty):
        if len(words.split(" ")) < 3:
            continue

        # Composition of the csv_line
        csv_line = [snt_id, str(duration), mp3_path, spk_id, str(words)]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)
from tqdm.contrib import tenumerate, tzip, tmap
import numpy as np

for _ in tenumerate(range(int(1e6)), desc="builtin enumerate"):
    pass

for _ in tenumerate(np.random.random((999, 999)), desc="numpy.ndenumerate"):
    pass

for _ in tzip(np.arange(1e6), np.arange(1e6) + 1, desc="builtin zip"):
    pass

mapped = tmap(lambda x: x + 1, np.arange(1e6), desc="builtin map")
assert (np.arange(1e6) + 1 == list(mapped)).all()
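# A small supplementary sketch (not from the original snippets): tzip forwards
# its keyword arguments to tqdm, so bar options such as total or unit can be
# given explicitly when the zipped iterables do not expose a length
# (e.g. generators). The generators below are hypothetical sample data.
from tqdm.contrib import tzip

gen_a = (i for i in range(1000))
gen_b = (i * i for i in range(1000))
for a, b in tzip(gen_a, gen_b, total=1000, unit="pair", desc="zipped generators"):
    pass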