def unique_configurations(self, site_distribution, verbose=False, show_progress=False):
    """
    Find the symmetry inequivalent configurations for a given population of objects.

    Args:
        site_distribution (dict): A dictionary that defines the number of each object
            to be arranged in this system. e.g. for a system with four sites, with two
            occupied (denoted `1`) and two unoccupied (denoted `0`)::

                { 1: 2, 0: 2 }

        verbose (opt:default=False): Print verbose output.
        show_progress (opt:default=False): Show a progress bar.
            Setting to `True` gives a simple progress bar.
            Setting to `"notebook"` gives a Jupyter notebook compatible progress bar.

    Returns:
        unique_configurations (list): A list of :any:`Configuration` objects, for each
            symmetry inequivalent configuration.
    """
    s = flatten_list([[key] * site_distribution[key] for key in site_distribution])
    total_permutations = number_of_unique_permutations(s)
    if verbose:
        print('total number of sites: ' + str(sum(site_distribution.values())))
        print('using {:d} symmetry operations.'.format(len(self.symmetry_group.symmetry_operations)))
        print('evaluating {:d} unique permutations.'.format(total_permutations))
    generator = unique_permutations(s)
    if show_progress:
        if show_progress == 'notebook':
            generator = tqdm_notebook(generator, total=total_permutations, unit=' permutations')
        else:
            generator = tqdm(generator, total=total_permutations, unit=' permutations')
    return self.enumerate_configurations(generator, verbose=verbose)
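
# A minimal, self-contained sketch of the `show_progress` dispatch pattern used above:
# wrap any iterable in a console tqdm bar or a notebook bar depending on a flag.
# `wrap_with_progress` is a hypothetical helper name, not part of the original code.
from tqdm import tqdm, tqdm_notebook

def wrap_with_progress(iterable, total=None, show_progress=False):
    if show_progress == 'notebook':
        return tqdm_notebook(iterable, total=total, unit=' permutations')
    elif show_progress:
        return tqdm(iterable, total=total, unit=' permutations')
    return iterable

# Example usage: iterate a generator with a console progress bar.
for _ in wrap_with_progress(range(100), total=100, show_progress=True):
    pass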
def __new__(cls, iterable=None, desc=None, total=None, leave=True,
            backend=None, verbose=True):
    if backend is None:
        backend = Progressbar.backend
    if not verbose:
        backend = "hide"
    if backend == "tqdm":
        from tqdm import tqdm
        return tqdm(iterable=iterable, desc=desc, total=total, leave=leave,
                    ascii=True, ncols=80, file=sys.stdout,
                    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} "
                               "[{elapsed}<{remaining}{postfix}]")  # rate_fmt removed
    elif backend == "tqdm_notebook":
        from tqdm import tqdm_notebook
        return tqdm_notebook(iterable=iterable, desc=desc, total=total, leave=leave)
    elif backend == "pyprind":
        from pyprind import ProgBar, prog_bar
        ProgBar._adjust_width = lambda self: None  # keep constant width
        if iterable is None:
            return ProgBar(total, title=desc, stream=1)
        else:
            return prog_bar(iterable, title=desc, stream=1, iterations=total)
    elif backend == "hide":
        return NoProgressbar(iterable=iterable)
    else:
        raise NotImplementedError("unknown backend")
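
# The "hide" branch above returns a NoProgressbar object that is not shown here. A
# plausible minimal stand-in (an assumption, not the original class): it only needs to
# be iterable and to accept the update()/close() calls a real progress bar would get.
class NoProgressbar:
    """No-op progress bar used when output is hidden."""
    def __init__(self, iterable=None):
        self.iterable = iterable
    def __iter__(self):
        return iter(self.iterable if self.iterable is not None else [])
    def update(self, n=1):
        pass
    def close(self):
        pass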
def progbar(it=None, nb=False, **tqdm_settings):
    """Turn any iterable into a progress bar, with notebook version."""
    defaults = {'ascii': True, 'smoothing': 0}
    # Override defaults with custom tqdm_settings
    settings = {**defaults, **tqdm_settings}
    if nb:  # pragma: no cover
        return tqdm.tqdm_notebook(it, **settings)
    return tqdm.tqdm(it, **settings)
def load_data(last_n_days=30):
    def load_gz(file):
        try:
            if file.split('.')[-1] == 'gz':
                with gzip.open(file) as f:
                    data = ujson.loads(f.read().decode('utf-8'))
            else:
                with open(file, encoding='utf-8') as f:
                    data = ujson.load(f)
        except Exception:
            print(f'Error loading file: {file}')
            return []
        return [Window(*v) for v in data]

    files = {file: os.path.getctime(os.path.join(LOGS, file))
             for file in os.listdir(LOGS)}
    split_date = (dt.fromtimestamp(files[sorted(files.keys())[-1]])
                  - pd.Timedelta(str(last_n_days) + 'days')).date()

    days = []
    for file in tqdm_notebook(files):
        if dt.fromtimestamp(files[file]).date() > split_date:
            day = load_gz(os.path.join(LOGS, file))
            day = pd.DataFrame.from_records(day, columns=Window._fields)
            day['boot'] = pd.Timestamp(day['start_time'].min())
            days.append(day)
    if not days:
        return None
    data = pd.concat(days)
    data['start_time'] = data['start_time'].apply(lambda x: pd.Timestamp(x))
    data['last_update'] = data['last_update'].apply(lambda x: pd.Timestamp(x))
    data['focus_time'] = data['focus_time'].apply(lambda x: pd.Timedelta(x))
    data['start_time'] = data['last_update'] - data['focus_time']

    def categorize(x, dictionary):
        for k, v in dictionary.items():
            if k.lower() in x.lower():
                return v

    def merge(*lists):
        ret = lists[0]
        for l in lists[:-1]:
            assert len(l) == len(lists[-1])
        for i in range(len(lists[0])):
            for l in lists:
                if l[i]:
                    ret[i] = l[i]
                    break
        return ret

    data['category'] = merge(
        data['name'].apply(lambda x: categorize(x, categories_name)).values,
        data['exe'].apply(lambda x: categorize(x, categories_exe)).values,
        data['exe'].str.split('\\').apply(lambda x: x[-1]).values)
    # Delete unused columns
    del data['pid']
    del data['cmd']
    return data
def on_epoch_begin(self, epoch, logs=None):
    print('Epoch %d/%d' % (epoch + 1, self.epochs))
    if "steps" in self.params:
        self.use_steps = True
        self.target = self.params['steps']
    else:
        self.use_steps = False
        self.target = self.params['samples']
    self.prog_bar = tqdm.tqdm_notebook(total=self.target)
    self.log_values_by_metric = defaultdict(list)
def progressbar(*args, **kwargs):
    """Uses tqdm progressbar. This function exists for wrapping purposes only.
    Original docstring follows:
    ----------------------------------------
    %s
    %s
    """
    try:
        return tqdm_notebook(*args, **kwargs)
    except:
        return tqdm(*args, **kwargs)
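
# The %s placeholders in the docstring above are presumably filled in elsewhere so that
# help(progressbar) shows the wrapped functions' documentation. A hedged sketch of how
# that substitution could be done (an assumption, not necessarily the original module's
# code; it requires `from tqdm import tqdm, tqdm_notebook` at module level):
if progressbar.__doc__ is not None:
    progressbar.__doc__ = progressbar.__doc__ % (tqdm_notebook.__doc__, tqdm.__doc__)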
def test_validation_loss(decoder, s, generate_batch, val_img_embeds, val_captions_indexed):
    np.random.seed(300)
    random.seed(300)
    val_loss = 0
    for _ in tqdm.tqdm_notebook(range(1000)):
        val_loss += s.run(decoder.loss,
                          generate_batch(val_img_embeds, val_captions_indexed, 32, 20))
    val_loss /= 1000.
    return val_loss
def on_epoch_begin(self, net, X=None, X_valid=None, **kwargs):
    # Assume it is a number until proven otherwise.
    batches_per_epoch = self.batches_per_epoch
    if self.batches_per_epoch == 'auto':
        batches_per_epoch = self._get_batches_per_epoch(net, X, X_valid)
    elif self.batches_per_epoch == 'count':
        # No limit is known until the end of the first epoch.
        batches_per_epoch = None
    if self._use_notebook():
        self.pbar = tqdm.tqdm_notebook(total=batches_per_epoch)
    else:
        self.pbar = tqdm.tqdm(total=batches_per_epoch)
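
# _use_notebook() is not shown above. A common detection idiom (an assumption about how
# such a helper might work, not the original implementation) checks the IPython shell
# class name, which is 'ZMQInteractiveShell' inside Jupyter:
def _in_notebook():
    try:
        from IPython import get_ipython
    except ImportError:
        return False
    shell = get_ipython()
    return shell is not None and type(shell).__name__ == 'ZMQInteractiveShell'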
def __iter__(self):
    state = self.prepareState(self._endpoint, self._filters, **self._prepareStateParams)
    entries = self._endpoint(sort=self._sort, n=self._n, **self._filters)

    if self._progbar:
        try:
            get_ipython
            inNotebook = True
        except NameError:
            inNotebook = False
        if not inNotebook:
            sys.stderr.write("Locating data...")
            entries = list(entries)
    if self._progbar and not inNotebook:
        sys.stderr.write("\r")

    if self._progbar:
        try:
            get_ipython  # will fail faster and more reliably than tqdm_notebook
            entriesIterable = tqdm_notebook(entries, unit="entries")
        except (NameError, AttributeError, TypeError):
            entriesIterable = tqdm(entries, unit="entries")
    else:
        entriesIterable = entries

    def iterate():
        for entry in entriesIterable:
            try:
                data = self.parse(entry, state=state) if state is not None else self.parse(entry)
                yield entry, data
            except KeyboardInterrupt:
                self._write('Interrupted while parsing "{}"'.format(entry.path))
                break
            except GeneratorExit:
                raise GeneratorExit
            except:
                self._write('Error while parsing "{}":'.format(entry.path))
                self._write(traceback.format_exc())

    # chain the operations together
    # each function in self._chain is a generator which takes an iterator
    # (remember that you call a generator to "activate" it: calling a generator returns an iterator)
    # so end condition for the loop is that `iterate` refers to an iterator
    iterate = iterate()
    for do in self._chain:
        iterate = do(iterate)
    return iterate
def simulate_notebook(params):
    num_steps = params['num_steps']
    num_dim = params['num_dim']
    num_particles = params['num_particles']
    A = params['Ddt'] / params['KBT']
    B = np.sqrt(2 * params['Ddt'])
    U = params['potential']
    Xs = np.zeros(shape=(num_steps, num_dim, num_particles))
    Xs[0, :, :] = params['x0']
    for t in tqdm_notebook(range(1, num_steps)):
        drift = np.zeros(shape=(num_dim, num_particles))
        for i in range(num_particles):
            drift[:, i] = A * U.get_force(Xs[t - 1, :, i])
        noise = B * np.random.normal(size=(num_dim, num_particles))
        Xs[t, :, :] = Xs[t - 1, :, :] + drift + noise
    return Xs
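
# simulate_notebook() expects params['potential'] to expose get_force(x). A minimal
# harmonic potential and parameter dictionary for illustration (both hypothetical,
# not part of the original code):
import numpy as np

class HarmonicPotential:
    def __init__(self, k=1.0):
        self.k = k
    def get_force(self, x):
        return -self.k * x  # F = -k x

params = {
    'num_steps': 1000,
    'num_dim': 2,
    'num_particles': 10,
    'Ddt': 0.01,
    'KBT': 1.0,
    'potential': HarmonicPotential(),
    'x0': np.zeros((2, 10)),
}
# Xs = simulate_notebook(params)  # shape: (num_steps, num_dim, num_particles)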
def challenge_evaluate_performance(fn):
    score = 0
    for i in tnrange(8, desc="Total"):
        wave = load_wave("data/secret_tests/challenge_valid_%d" % i)
        labels = true_labels[i]
        pred_labels = fn(wave)
        for j in range(3):  # best of 3!
            score += test_classification_score(wave, labels, pred_labels)
        for j in tqdm_notebook(range(40), desc='Test case %d' % i):
            sleep(0.1)
    print("*** Total score: %.2f ***" % score)
    return score
def download_file(url, file_path):
    r = requests.get(url, stream=True)
    total_size = int(r.headers.get('content-length'))
    try:
        with open(file_path, 'wb', buffering=16 * 1024 * 1024) as f:
            bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True)
            bar.set_description(os.path.split(file_path)[-1])
            for chunk in r.iter_content(32 * 1024):
                f.write(chunk)
                bar.update(len(chunk))
            bar.close()
    except Exception:
        print("Download failed")
    finally:
        if os.path.exists(file_path) and os.path.getsize(file_path) != total_size:
            os.remove(file_path)
            print("Removed incomplete download")
def progbar(it=None, nb=False, **kwargs):
    """Turn any iterable into a progress bar, with notebook option.

    Parameters
    ----------
    it: iterable
        Iterable to wrap with progress bar
    nb: bool
        Whether to display the notebook progress bar
    **kwargs: dict-like
        additional options to send to tqdm
    """
    defaults = {'ascii': True, 'smoothing': 0.0}
    # Override defaults with custom kwargs
    settings = {**defaults, **kwargs}
    if nb:  # pragma: no cover
        return tqdm.tqdm_notebook(it, **settings)
    return tqdm.tqdm(it, **settings)
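
# Example usage of progbar above: the same call site works in a plain script or in a
# notebook, switching bar flavours with the `nb` flag (assumes `import tqdm` at module
# level, as the function body implies).
for _ in progbar(range(1000), nb=False, desc='processing'):
    pass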
def download_file(url, file_path):
    r = requests.get(url, stream=True)
    total_size = int(r.headers.get('content-length'))
    bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True)
    bar.set_description(os.path.split(file_path)[-1])
    incomplete_download = False
    try:
        with open(file_path, 'wb', buffering=16 * 1024 * 1024) as f:
            for chunk in r.iter_content(1 * 1024 * 1024):
                f.write(chunk)
                bar.update(len(chunk))
    except Exception as e:
        raise e
    finally:
        bar.close()
        if os.path.exists(file_path) and os.path.getsize(file_path) != total_size:
            incomplete_download = True
            os.remove(file_path)
    if incomplete_download:
        raise Exception("Incomplete download")
def calculate(self, raw, window=1., step=0.25, minmax=False, variance=False,
              reduction='mean', progressbar=True):
    if not minmax and not variance:
        # Nothing requested: warn and return early (requires `import warnings`).
        warnings.warn('Nothing computed. To compute variance you need to'
                      ' pass `variance=True`, to compute range you need to'
                      ' pass `minmax=True`.')
        return self

    data = raw._data
    window = int(round(window * raw.info['sfreq']))
    step = int(round(step * raw.info['sfreq']))
    self.window = window
    self.step = step
    self.sfreq = raw.info['sfreq']

    n_samples = data.shape[1]
    n_windows = int(np.floor((n_samples - window) / step))
    self.ranges = np.zeros(n_windows) if minmax else None
    self.variances = np.zeros(n_windows) if variance else None
    reduction = dict(mean=np.mean, max=np.max)[reduction]

    if progressbar:
        from tqdm import tqdm_notebook
        pbar = tqdm_notebook(total=n_windows)

    # step through data
    for window_idx in range(n_windows):
        first = window_idx * step
        last = first + window
        data_buffer = data[:, first:last]
        if minmax:
            self.ranges[window_idx] = reduction(
                data_buffer.max(axis=1) - data_buffer.min(axis=1))
        if variance:
            self.variances[window_idx] = reduction(data_buffer.var(axis=1))
        if progressbar:
            pbar.update(1)
    return self
def _pbar(iterable, desc, leave=True, position=None, verbose='progressbar'):
    if verbose is not False and verbose not in ['progressbar', 'tqdm', 'tqdm_notebook']:
        raise ValueError('verbose must be one of {progressbar, '
                         'tqdm, tqdm_notebook, False}. Got %s' % verbose)
    if verbose == 'progressbar':
        from mne.utils import ProgressBar
        pbar = ProgressBar(iterable, mesg=desc, spinner=True)
        print('')
    elif verbose == 'tqdm':
        from tqdm import tqdm
        pbar = tqdm(iterable, desc=desc, leave=leave, position=position,
                    dynamic_ncols=True)
    elif verbose == 'tqdm_notebook':
        from tqdm import tqdm_notebook
        pbar = tqdm_notebook(iterable, desc=desc, leave=leave, position=position,
                             dynamic_ncols=True)
    elif verbose is False:
        pbar = iterable
    return pbar
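
# Example usage of _pbar above: wrap an iterable with the backend selected by `verbose`;
# passing verbose=False returns the bare iterable unchanged.
for _ in _pbar(range(10), desc='epochs', verbose='tqdm'):
    pass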
def train(self, n_epochs, K = 5): [obs_ph, act_ph, new_obs_ph, rew_ph, terminal_ph, policy_network, old_policy_network, actions, train_policy, train_state_value] = self._graph data_collector = A2CDataCollector(self._sess, actions, obs_ph, 20, 20) for i in tqdm_notebook(range(n_epochs)): self._update_old_network() obs, acts, new_obs, rews, terminal = data_collector.collect_data() for j in range(K): self._sess.run([train_policy],feed_dict={ obs_ph: np.array(obs).reshape(-1, self._obs_dim), act_ph: np.array(acts).reshape(-1), new_obs_ph: np.array(new_obs).reshape(-1, self._obs_dim), rew_ph: np.array(rews).reshape(-1, 1), terminal_ph: np.array(terminal).reshape(-1, 1) }) for j in range(30): self._sess.run([train_state_value],feed_dict={ obs_ph: np.array(obs).reshape(-1, self._obs_dim), act_ph: np.array(acts).reshape(-1), new_obs_ph: np.array(new_obs).reshape(-1, self._obs_dim), rew_ph: np.array(rews).reshape(-1, 1), terminal_ph: np.array(terminal).reshape(-1, 1) }) return data_collector.get_episode_statistics()
def load_lfw_dataset( use_raw=False, dx=80, dy=80, dimx=45, dimy=45): # read attrs df_attrs = pd.read_csv(ATTRS_NAME, sep='\t', skiprows=1) df_attrs = pd.DataFrame(df_attrs.iloc[:, :-1].values, columns=df_attrs.columns[1:]) imgs_with_attrs = set(map(tuple, df_attrs[["person", "imagenum"]].values)) # read photos all_photos = [] photo_ids = [] with tarfile.open(RAW_IMAGES_NAME if use_raw else IMAGES_NAME) as f: for m in tqdm.tqdm_notebook(f.getmembers()): if m.isfile() and m.name.endswith(".jpg"): # prepare image img = decode_image_from_raw_bytes(f.extractfile(m).read()) img = img[dy:-dy, dx:-dx] img = cv2.resize(img, (dimx, dimy)) # parse person fname = os.path.split(m.name)[-1] fname_splitted = fname[:-4].replace('_', ' ').split() person_id = ' '.join(fname_splitted[:-1]) photo_number = int(fname_splitted[-1]) if (person_id, photo_number) in imgs_with_attrs: all_photos.append(img) photo_ids.append({'person': person_id, 'imagenum': photo_number}) photo_ids = pd.DataFrame(photo_ids) all_photos = np.stack(all_photos).astype('uint8') # preserve photo_ids order! all_attrs = photo_ids.merge(df_attrs, on=('person', 'imagenum')).drop(["person", "imagenum"], axis=1) return all_photos, all_attrs
def eval(self,**kwargs): """ evaluate the link Parameters ---------- applywav :boolean Apply waveform to H force : list Force the computation (['sig','ray','Ct','H']) AND save (replace previous computations) alg : 1|'old'|'exp'|'exp2' version of run for signature si_progress: bollean ( False) display progression bar for signatures diffraction : boolean (False) takes into consideration diffraction points ra_number_mirror_cf : int rays.to3D number of ceil/floor reflexions ra_ceil_H: float, (default []) ceil height . If [] : Layout max ceil height If 0 : only floor reflection (outdoor case) If -1 : neither ceil nor floor reflection (2D case) ra_vectorized: boolean (True) if True used the (2015 new) vectorized approach to determine 2drays progressbar: str None: no progress bar python : progress bar in ipython Returns ------- ak : ndarray alpha_k tk : ndarray tau_k Notes ----- update self.ak and self.tk self.ak : ndarray alpha_k self.tk : ndarray tau_k Examples -------- .. plot:: :include-source: >>> from pylayers.simul.link import * >>> L=DLink(verbose=False) >>> aktk = L.eval() See Also -------- pylayers.antprop.signature pylayers.antprop.rays Experimental ------------ alg = 2015 | 20152 (best) vectorized signature research si_reverb : number of reverb in source/target cycle if alg=2015 """ defaults={ 'applywav':True, 'si_progress':False, 'diffraction':True, 'ra_vectorized':True, 'ra_ceil_H':[], 'ra_number_mirror_cf':1, 'force':[], 'alg':1, 'si_reverb':4, 'threshold':0.1, 'verbose':[], 'progressbar':None, } for key, value in defaults.items(): if key not in kwargs: kwargs[key]=value if 'cutoff' not in kwargs: kwargs['cutoff']=self.cutoff else: self.cutoff=kwargs['cutoff'] if 'force' in kwargs: if not isinstance(kwargs['force'],list): if kwargs['force'] == True : kwargs['force'] = ['sig','ray','Ct','H'] else : kwargs['force'] = [] if kwargs['verbose'] != []: self.verbose=kwargs['verbose'] #pdb.set_trace() # must be placed after all the init !!!! 
if self.verbose : print "checkh5" self.checkh5() if isinstance(kwargs['progressbar'],str): if kwargs['progressbar'] =='notebook': pbar = tqdm.tqdm_notebook(total=100) elif kwargs['progressbar']=='python': pbar = tqdm.tqdm(total=100) elif isinstance(kwargs['progressbar'],tqdm.tqdm): pbar = kwargs['progressbar'] ############ # Signatures ############ if self.verbose : print "Start Signatures" tic = time.time() Si = Signatures(self.L,self.ca,self.cb,cutoff=kwargs['cutoff']) if (self.dexist['sig']['exist'] and not ('sig' in kwargs['force'])): self.load(Si,self.dexist['sig']['grpname']) if self.verbose : print "load signature" else : if kwargs['alg']==1: Si.run(cutoff=kwargs['cutoff'], diffraction=kwargs['diffraction'], threshold=kwargs['threshold'], progress=kwargs['si_progress']) if self.verbose : print "default algorithm" if kwargs['alg']=='exp': TMP=Si.run_exp(cutoff=kwargs['cutoff'], cutoffbound=kwargs['si_reverb']) if self.verbose : print "experimental (ex 2015)" if kwargs['alg']=='exp2': TMP=Si.run_exp2(cutoff=kwargs['cutoff'], cutoffbound=kwargs['si_reverb']) if self.verbose : print "algo exp2 ( ex 20152)" #Si.run6(diffraction=kwargs['diffraction']) # save sig self.save(Si,'sig',self.dexist['sig']['grpname'],force = kwargs['force']) self.Si = Si toc = time.time() if self.verbose : print "Stop signature",toc-tic try: pbar.update(20) except: pass ############ # Rays ############ if self.verbose : print "Start Rays" tic = time.time() R = Rays(self.a,self.b) if self.dexist['ray']['exist'] and not ('ray' in kwargs['force']): self.load(R,self.dexist['ray']['grpname']) else : # perform computation ... # ... with vetorized ray evaluation approach if kwargs['ra_vectorized']: r2d = Si.raysv(self.a,self.b) # ... or with original and slow approach ( to be removed in a near future) else : r2d = Si.rays(self.a,self.b) if kwargs['ra_ceil_H'] == []: ceilheight = self.L.maxheight else: ceilheight = kwargs['ra_ceil_H'] R = r2d.to3D(self.L,H=ceilheight, N=kwargs['ra_number_mirror_cf']) R.locbas(self.L) # ...and save R.fillinter(self.L) C = Ctilde() C = R.eval(self.fGHz) self.save(R,'ray',self.dexist['ray']['grpname'],force = kwargs['force']) self.R = R toc = time.time() if self.verbose : print "Stop rays",toc-tic if self.R.nray == 0: raise NameError('No rays have been found. Try to re-run the simulation with a higher S.cutoff ') try: pbar.update(20) except: pass ############ # Ctilde ############ if self.dexist['Ct']['exist'] and not ('Ct' in kwargs['force']): C=Ctilde() self.load(C,self.dexist['Ct']['grpname']) else : #if not hasattr(R,'I'): # Ctilde... # Find an other criteria in order to decide whether the R has # already been evaluated #pdb.set_trace() C = R.eval(self.fGHz) # ...save Ct self.save(C,'Ct',self.dexist['Ct']['grpname'],force = kwargs['force']) self.C = C try: pbar.update(20) except: pass ############ # H ############ H = Tchannel() if self.dexist['H']['exist'] and not ('H' in kwargs['force']): self.load(H,self.dexist['H']['grpname']) else : # Ctilde antenna Cl=C.locbas(Tt=self.Ta, Tr=self.Tb) #T channel H = C.prop2tran(a=self.Aa,b=self.Ab,Friis=True,debug=True) self.save(H,'H',self.dexist['H']['grpname'],force = kwargs['force']) self.H = H try: pbar.update(20) except: pass if kwargs['applywav']: if self.H.isFriis: self.ir = self.H.get_cir(self.wav.sf) else: self.ir = self.H.get_cir(self.wav.sfg) try: pbar.update(20) except: pass return self.H.ak, self.H.tk
# In[ ]: print('Setting up our DataLoader for training..') train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE) print('Model.train!!!!') model.train() for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm_notebook(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask, labels=None) if OUTPUT_MODE == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif OUTPUT_MODE == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if GRADIENT_ACCUMULATION_STEPS > 1: loss = loss / GRADIENT_ACCUMULATION_STEPS loss.backward() print("\r%f" % loss, end='') tr_loss += loss.item() nb_tr_examples += input_ids.size(0)
clf6 = make_pipeline(
    SMOTE(random_state=0),
    KerasClassifier(build_fn=dnn_models1, epochs=25, batch_size=1000, verbose=1))

total_scores_1 = []
total_scores_2 = []
total_scores_3 = []
total_scores_4 = []
total_scores_5 = []
total_scores_6 = []

#hitseq_num = 1
for hitseq_num in tqdm_notebook(range(1, 11)):
    온라인_x1 = []
    for idx_index, idx_value in enumerate(idx):
        # Keep only sessions that have at least hitseq_num click logs
        if idx_value >= hitseq_num:
            # Take one unique user id / session id pair from the purchase-flag table (구매여부)
            구매여부_idx = 구매여부.iloc[idx_index, :-1]
            구매여부_idx = str(구매여부_idx[0]) + '_' + str(구매여부_idx[1])
            # Extract only the rows whose user id / session id match the pair obtained above
            온라인_x_partial = 온라인[온라인['unique_id'] == 구매여부_idx].iloc[:, 3:-1]
            # From those sessions, keep only the first hitseq_num click logs;
            # click logs beyond hitseq_num are discarded
            온라인_x_partial = np.array(
                온라인_x_partial[온라인_x_partial['hit_seq'] <= hitseq_num])
def get_feature_matrix(sales, test, items, list_lags, date_block_threshold): """ This function create the model tablon""" # Create "grid" with columns index_cols = ['shop_id', 'item_id', 'date_block_num'] # For every month we create a grid from all shops/items combinations from that month grid = [] new_items = pd.DataFrame() cur_items_aux=np.array([]) for block_num in sales['date_block_num'].unique(): cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique() cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].append(pd.Series(cur_items_aux)).unique() cur_items_aux = cur_items[pd.Series(cur_items).isin(test.item_id)] grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32')) # Turn the grid into a dataframe grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32) # Add submission shop_id-item_id in order to test predictions test['date_block_num'] = 34 grid = grid.append(test[['shop_id', 'item_id', 'date_block_num']]) # Groupby data to get shop-item-month aggregates gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}}) # Fix column names gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] # Join it to the grid all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0) # Same as above but with shop-month aggregates gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}}) gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0) # Same as above but with item-month aggregates gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}}) gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values] all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0) # Downcast dtypes from 64 to 32 bit to save memory all_data = downcast_dtypes(all_data) del grid, gb gc.collect() # List of columns that we will use to create lags cols_to_rename = list(all_data.columns.difference(index_cols)) shift_range = list_lags for month_shift in tqdm_notebook(shift_range): train_shift = all_data[index_cols + cols_to_rename].copy() train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x train_shift = train_shift.rename(columns=foo) all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0) del train_shift # Don't use old data from year 2013 all_data = all_data[all_data['date_block_num'] >= date_block_threshold] # List of all lagged features fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] # We will drop these at fitting stage to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] # Category for each item item_category_mapping = items[['item_id','item_category_id']].drop_duplicates() all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id') all_data = downcast_dtypes(all_data) gc.collect(); return [all_data, to_drop_cols]
ax1.set_title('Image') ax2.imshow(first_seg[0, :, :, 0], vmin = 0, vmax = 1) ax2.set_title('Prediction') fig.savefig('test_predictions.png') from keras import models, layers fullres_model = models.load_model('fullres_model.h5', compile=False) seg_in_shape = fullres_model.get_input_shape_at(0)[1:3] seg_out_shape = fullres_model.get_output_shape_at(0)[1:3] print(seg_in_shape, '->', seg_out_shape) from tqdm import tqdm_notebook from skimage.morphology import binary_opening, disk out_pred_rows = [] for c_img_name in tqdm_notebook(test_paths): c_path = os.path.join(test_image_dir, c_img_name) c_img = imread(c_path) c_img = np.expand_dims(c_img, 0)/255.0 cur_seg = fullres_model.predict(c_img)[0] cur_seg = binary_opening(cur_seg>0.5, np.expand_dims(disk(2), -1)) cur_rles = multi_rle_encode(cur_seg) if len(cur_rles)>0: for c_rle in cur_rles: out_pred_rows += [{'ImageId': c_img_name, 'EncodedPixels': c_rle}] else: out_pred_rows += [{'ImageId': c_img_name, 'EncodedPixels': None}] gc.collect() c_path = os.path.join(test_image_dir, '000155de5.jpg') c_img = imread(c_path) # 768,768,3
discriminator_loss, discriminator.trainable_variables) generator_optimizer.apply_gradients( zip(generator_gradient, generator.trainable_variables)) discriminator_optimizer.apply_gradients( zip(discriminator_gradient, discriminator.trainable_variables)) def play(sample_noise): result = generator(sample_noise) result = tf.reshape(result, (8, 28, 28)) c = itertools.count(1) plt.figure(figsize=(18, 8)) for image in result: plt.subplot(2, 4, next(c)) plt.imshow(image) plt.show() play(tf.random.normal((8, NOISE_DIM))) sample_noise = tf.random.normal((8, NOISE_DIM)) play(sample_noise) for _ in range(50): for images in tqdm.tqdm_notebook(dataset, total=len(list(dataset))): train_step(images) play(sample_noise) # generator.save('generator.h5') # discriminator.save('discriminator.h5')
def predict(self, BATCH_SIZE=2, CONF_THRESH=0.005, NMS_THRESH=0.45): # CONF_THRESH=0.25,NMS_THRESH=0.45, IOU_THRESH = 0.5 # Step1 - Get Model if (1): if self.MODEL == '' or self.MODEL == None: print(' - 1. Loading model : ', self.MODEL_WEIGHTFILE) self.MODEL = getYOLOv2(self.MODEL_CFGFILE, self.MODEL_WEIGHTFILE) self.MODEL.eval() # Step2 - Get Dataset if (1): with open(self.EVAL_IMAGELIST) as fp: tmp_files = fp.readlines() valid_files = [item.rstrip() for item in tmp_files] eval_dataset = VOCDatasetv2(self.EVAL_IMAGELIST, shape=(self.MODEL.width, self.MODEL.height), shuffle=False, transform=transforms.Compose([ transforms.ToTensor(), ])) kwargs = {'num_workers': 1, 'pin_memory': True} eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, **kwargs) # Step3 - Create File pointers for prediction storage (after removing the older files) if (1): fps = [0] * self.MODEL.num_classes if not os.path.exists(self.EVAL_OUTPUTDIR): os.mkdir(self.EVAL_OUTPUTDIR) else: for i in range(self.MODEL.num_classes): buf = '%s/%s%s.txt' % (self.EVAL_OUTPUTDIR, self.EVAL_PREFIX, self.VOC_CLASSES[i]) if os.path.exists(buf): os.remove(buf) # Should I delete folder and remake?? for i in range(self.MODEL.num_classes): buf = '%s/%s%s.txt' % (self.EVAL_OUTPUTDIR, self.EVAL_PREFIX, self.VOC_CLASSES[i]) fps[i] = open(buf, 'w') lineId = -1 verbose = 0 with torch.no_grad(): val_loss_total = 0.0 with tqdm.tqdm_notebook(total=len(eval_loader) * BATCH_SIZE) as pbar: for batch_idx, (data, target) in enumerate(eval_loader): pbar.update(BATCH_SIZE) t1 = time.time() if self.USE_GPU: data = data.cuda() # target = target.cuda() data, target = Variable(data), Variable(target) output = self.MODEL(data).data t2 = time.time() if self.LOGGER != '': if self.MODEL_LOSS != None: # print (' - [DEBUG] target[target != 0.0]) : ', target[target != 0.0], ' || ', target.dtype) if (len(target[target != 0.0])): try: # print (' - [DEBUG] region_loss : ', self.MODEL_LOSS) val_loss = self.MODEL_LOSS(output, target) val_loss_total += val_loss.data if self.verbose: print(' - loss : ', val_loss) except: traceback.print_exc() pdb.set_trace() else: print(' - No annotations : ', valid_files[lineId]) batch_boxes = get_region_boxes(output, CONF_THRESH, self.MODEL.num_classes, self.MODEL.anchors, self.MODEL.num_anchors, 0, 1) t3 = time.time() for i in range( output.size(0)): # output.size(0) = batch_size t31 = time.time() lineId = lineId + 1 fileId = os.path.basename( valid_files[lineId]).split('.')[0] width, height = get_image_size(valid_files[lineId]) t32 = time.time() # print(valid_files[lineId]) boxes = batch_boxes[i] boxes = nms(boxes, NMS_THRESH) for box in boxes: # box = [x,y,w,h, box_conf, class_conf, cls_id] # Top-Left Corner (xmin, xmax) x1 = (box[0] - box[2] / 2.0) * width # x - w/2 (x = centre of BBox) y1 = (box[1] - box[3] / 2.0) * height # y - h/2 # Top-Right Corner (ymin, ymax) x2 = (box[0] + box[2] / 2.0) * width # x + h/2 y2 = (box[1] + box[3] / 2.0) * height # y + h/2 box_conf = box[4] for j in range(int((len(box) - 5) / 2)): cls_conf = box[5 + 2 * j] cls_id = box[6 + 2 * j] prob = box_conf * cls_conf fps[cls_id].write( '%s %f %f %f %f %f\n' % (fileId, prob, x1, y1, x2, y2) ) # for each class_id, write down [prob, x1,y1,x2,y2] t33 = time.time() if (verbose): print(' -- Time : imread : ', round(t32 - t31, 4), ' || boxes loop : ', round(t33 - t32, 4)) t4 = time.time() # pdb.set_trace() if (0): print(' -- [DEBUG][PASCALVOCEval] Total time : ', round(t4 - t1, 2)) print(' -- [DEBUG][PASCALVOCEval] output time 
: ', round(t2 - t1, 2)) print(' -- [DEBUG][PASCALVOCEval] boxes time : ', round(t3 - t2, 2)) print(' -- [DEBUG][PASCALVOCEval] file write : ', round(t4 - t3, 2)) if self.LOGGER != '': if self.MODEL_LOSS != None: self.LOGGER.save_value('Total Loss', 'Val Loss', self.LOGGER_EPOCH + 1, val_loss_total / len(eval_loader)) for i in range(self.MODEL.num_classes): fps[i].close() self._do_python_eval()
def GMM_prediction(train, test, target_magic=None, seed=42, trained_parameter_file=None): if target_magic is not None: train = train[train[magic] == target_magic] test = test[test[magic] == target_magic] train.reset_index(drop=True,inplace=True) test.reset_index(drop=True,inplace=True) if trained_parameter_file is not None: trained_parameter = dict(np.load(trained_parameter_file)) # trained_parameter = np.load(trained_parameter_file) else: trained_parameter = {} def get_mean_cov(x,y): max_label = y.astype(int).max() ps = [] ms = [] for i in range(max_label + 1): model = GraphicalLasso() label_i = (y==i).astype(bool) x2 = x[label_i] model.fit(x2) ps.append(model.precision_) ms.append(model.location_) ms = np.stack(ms) ps = np.stack(ps) return ms,ps # INITIALIZE VARIABLES cols = [c for c in train.columns if c not in ['id', 'target']] cols.remove('wheezy-copper-turtle-magic') # BUILD 512 SEPARATE MODELS random_seed_num = 8 GMM_array = [] for r in range(random_seed_num): GMM_array.append([np.zeros(len(train)), np.zeros(len(test))]) for i in tqdm_notebook(range(512) if target_magic is None else [target_magic]): # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I train2 = train[train['wheezy-copper-turtle-magic']==i] test2 = test[test['wheezy-copper-turtle-magic']==i] idx1 = train2.index; idx2 = test2.index train2.reset_index(drop=True,inplace=True) # FEATURE SELECTION sel = VarianceThreshold(threshold=1.5).fit(train2[cols]) train3 = sel.transform(train2[cols]) test3 = sel.transform(test2[cols]) k = 3 # cluster_per_class for r in range(random_seed_num): # Initialize # STRATIFIED K-FOLD skf = StratifiedKFold(n_splits=11, random_state=seed+r, shuffle=True) for j, (train_index, test_index) in enumerate(skf.split(train3, train2['target'])): ms_key = "ms_{}_{}_{}".format(i, r, j) ps_key = "ps_{}_{}_{}".format(i, r, j) if ms_key in trained_parameter and ps_key in trained_parameter: ms = trained_parameter[ms_key] ps = trained_parameter[ps_key] else: # MODEL AND PREDICT WITH GMM new_label = np.zeros(len(train_index)) try_cnt = 0 while True: gm = GaussianMixture(random_state=seed+try_cnt+r, n_components=k).fit(train3[train_index,:][train2.loc[train_index]['target'] == 0]) new_label[train2.loc[train_index]['target'] == 0] = gm.predict(train3[train_index,:][train2.loc[train_index]['target'] == 0, :]) gm = GaussianMixture(random_state=seed+try_cnt+r, n_components=k).fit(train3[train_index,:][train2.loc[train_index]['target'] == 1]) new_label[train2.loc[train_index]['target'] == 1] = k + gm.predict(train3[train_index,:][train2.loc[train_index]['target'] == 1, :]) try: ms, ps = get_mean_cov(train3[train_index,:], new_label) except (FloatingPointError,ValueError) as e: try_cnt += 1 continue else: break gm = GaussianMixture(random_state=seed, n_components=2*k, init_params='random', covariance_type='full', tol=0.001,reg_covar=0.001, max_iter=100, n_init=1,means_init=ms, precisions_init=ps) gm.fit(np.concatenate([train3[train_index,:], test3, train3[test_index, :]],axis = 0)) # GMM_array[r][0]: oof # GMM_array[r][1]: preds GMM_array[r][0][idx1[test_index]] += np.sum(gm.predict_proba(train3[test_index,:])[:,k:], axis=1) GMM_array[r][1][idx2] += np.sum(gm.predict_proba(test3)[:,k:], axis=1) / skf.n_splits # oof[idx1[test_index]] += np.sum(gm.predict_proba(train3[test_index,:])[:,k:], axis=1) #/ random_seed_num # preds[idx2] += np.sum(gm.predict_proba(test3)[:,k:], axis=1) / skf.n_splits #/ random_seed_num # GMM_array.append([oof, preds]) # Print cv GMM averaging_oof = np.zeros(len(train)) for array in GMM_array: 
averaging_oof += (array[0] / random_seed_num) auc = roc_auc_score(train['target'],averaging_oof) print('GMM_random_seed_averaging CV =',round(auc,5)) return GMM_array
def _epoch(self, loader, criterion, optimizer=None, train=False): if train and not optimizer: raise AttributeError("Optimizer should be given for training") if train: self.base_model.train() mode = 'Train' else: self.base_model.eval() mode = 'Eval' losses = AverageMeter() labels = [] outputs = [] for bi, batch in enumerate( tqdm_notebook(loader, desc="{} batches".format(mode), leave=False)): inputs, targets = batch lengths = torch.randint(low=4, high=inputs.shape[2], size=(len(inputs), )) lengths, _ = torch.sort(lengths, descending=True) lengths[0] = inputs.shape[-1] inputs = inputs.permute(0, 2, 1) # Shape: (batch, length, features) if self.data == 'mimic_int': #this is multilabel with labels over time targets = targets[torch.range(0, len(inputs) - 1).long(), :, lengths - 1] targets = torch.argmax(targets, dim=1) elif self.data == 'simulation' or self.data == 'simulation_spike' or self.data == 'simulation_l2x': targets = targets[torch.range(0, len(inputs) - 1).long(), lengths - 1] elif self.data == 'mimic': #does not have labels over time targets = targets[torch.range(0, len(inputs) - 1).long()] input_var = torch.autograd.Variable(inputs) target_var = torch.autograd.Variable(targets) input_var = input_var.to(self.device) target_var = target_var.to(self.device) output, alpha, beta = self.base_model(input_var, lengths) loss = criterion(output, target_var.long()) labels.append(targets) # since the outputs are logit, not probabilities outputs.append(torch.nn.functional.softmax(output).data) # record loss losses.update(loss.item(), inputs.size(0)) # compute gradient and do update step if train: optimizer.zero_grad() loss.backward() optimizer.step() return torch.cat(labels, 0), torch.cat(outputs, 0), losses.avg
unbinned_A_mle = [[] for i in range(len(sig_params))]
binned_A_mle = [[] for i in range(len(sig_params))]
binned_A_hybrid_mle = [[] for i in range(len(sig_params))]
binned_A_50_mle = [[] for i in range(len(sig_params))]
binned_A_100_mle = [[] for i in range(len(sig_params))]
binned_A_200_mle = [[] for i in range(len(sig_params))]
binned_A_400_mle = [[] for i in range(len(sig_params))]
binned_A_1000_mle = [[] for i in range(len(sig_params))]
binned_A_2000_mle = [[] for i in range(len(sig_params))]
cnc_A_mle = [[] for i in range(len(sig_params))]

sig_pdf_ROOT = functools.partial(sig_pdf, doROOT=True)
tf1_sig_pdf = TF1("tf1_sig_pdf", sig_pdf_ROOT, 2800, 13000, 2)

for i, sig_p in enumerate(tqdm_notebook(sig_params, desc='Signal Model')):
    n_sig = n_bg
    tf1_sig_pdf.SetParameters(*sig_p)
    mc_sig = [tf1_sig_pdf.GetRandom() for ns in range(n_sig)]
    be_sig = bayesian_blocks(mc_sig, p0=0.02)

    true_sig_bc_bb = get_true_bin_content(be_bg, sig_pdf, sig_p)
    true_sig_bc_50GeV = get_true_bin_content(be_50GeV, sig_pdf, sig_p)
    true_sig_bc_100GeV = get_true_bin_content(be_100GeV, sig_pdf, sig_p)
    true_sig_bc_200GeV = get_true_bin_content(be_200GeV, sig_pdf, sig_p)
    true_sig_bc_400GeV = get_true_bin_content(be_400GeV, sig_pdf, sig_p)
    true_sig_bc_1000GeV = get_true_bin_content(be_1000GeV, sig_pdf, sig_p)
    true_sig_bc_2000GeV = get_true_bin_content(be_2000GeV, sig_pdf, sig_p)

    be_hybrid = np.sort(np.unique(np.concatenate([be_bg, be_sig])))
def QDA_prediction(train, test, seed=42): cols = [c for c in train.columns if c not in ['id', 'target']] cols.remove('wheezy-copper-turtle-magic') oof = np.zeros(len(train)) preds = np.zeros(len(test)) for i in tqdm_notebook(range(512)): train2 = train[train['wheezy-copper-turtle-magic']==i] test2 = test[test['wheezy-copper-turtle-magic']==i] idx1 = train2.index; idx2 = test2.index train2.reset_index(drop=True,inplace=True) data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])]) pipe = Pipeline([('vt', VarianceThreshold(threshold=2)), ('scaler', StandardScaler())]) data2 = pipe.fit_transform(data[cols]) train3 = data2[:train2.shape[0]]; test3 = data2[train2.shape[0]:] for r in range(30): skf = StratifiedKFold(n_splits=10, random_state=42+r, shuffle=True) for train_index, test_index in skf.split(train2, train2['target']): clf = QuadraticDiscriminantAnalysis(0.5) clf.fit(train3[train_index,:],train2.loc[train_index]['target']) oof[idx1[test_index]] += clf.predict_proba(train3[test_index,:])[:,1] / 30.0 preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits / 30.0 auc = roc_auc_score(train['target'], oof) print(f'AUC: {auc:.5}') result_array = [] for itr in range(4): test['target'] = preds test.loc[test['target'] > 0.955, 'target'] = 1 test.loc[test['target'] < 0.045, 'target'] = 0 usefull_test = test[(test['target'] == 1) | (test['target'] == 0)] new_train = pd.concat([train, usefull_test]).reset_index(drop=True) print(usefull_test.shape[0], "Test Records added for iteration : ", itr) new_train.loc[oof > 0.995, 'target'] = 1 new_train.loc[oof < 0.005, 'target'] = 0 oof2 = np.zeros(len(train)) preds = np.zeros(len(test)) for i in tqdm_notebook(range(512)): train2 = new_train[new_train['wheezy-copper-turtle-magic']==i] test2 = test[test['wheezy-copper-turtle-magic']==i] idx1 = train[train['wheezy-copper-turtle-magic']==i].index idx2 = test2.index train2.reset_index(drop=True,inplace=True) data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])]) pipe = Pipeline([('vt', VarianceThreshold(threshold=2)), ('scaler', StandardScaler())]) data2 = pipe.fit_transform(data[cols]) train3 = data2[:train2.shape[0]] test3 = data2[train2.shape[0]:] random_seed_num = 30 for r in range(random_seed_num): skf = StratifiedKFold(n_splits=10, random_state=seed+r, shuffle=True) for train_index, test_index in skf.split(train2, train2['target']): oof_test_index = [t for t in test_index if t < len(idx1)] clf = QuadraticDiscriminantAnalysis(0.5) clf.fit(train3[train_index,:],train2.loc[train_index]['target']) if len(oof_test_index) > 0: oof2[idx1[oof_test_index]] += clf.predict_proba(train3[oof_test_index,:])[:,1] / random_seed_num preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits / random_seed_num result_array.append([oof2, preds]) auc = roc_auc_score(train['target'], oof2) print(f'AUC: {auc:.5}') return result_array
def todays_scrape(thedir, item, city, cityname, now): zips = pd.read_csv(thedir + 'zipcodes.csv', index_col=0) thezip = zips.loc[zips['City'] == cityname, 'Zipcode'].iloc[0] if cityname == 'Baltimore': thezip = 21211 hdrs = { 'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 ' + '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive' } proxies = setup_proxy_rotation() proxy_index = random_proxy(proxies) proxy = proxies[proxy_index] first_url = ('https://' + cityname.lower().replace(' ', '') + '.craigslist.org/search/lac/fuo?postal=' + str(thezip) + '&query=' + item + '&s=' + '0' + '&search_distance=30') # create a new Firefox session #driver = webdriver.Chrome() #driver.implicitly_wait(30) #driver.get(first_url) page = requests.get(first_url, headers=hdrs, proxies=proxy) soup = BeautifulSoup(page.content, 'html.parser') #soup = BeautifulSoup(driver.page_source,'html.parser') # Get total number of couches totalcount = int( str(soup.find('span', class_='totalcount')).split(">")[1].split("<")[0]) badid = [] theid = [] theurl = [] theprice = [] theimgurl = [] time_since_posting = [] # This cycles through the Craigslist search result pages for ipage in tqdm_notebook(range(0, math.floor(totalcount / 120))): #ipage=1 #if 1: next_url = ('https://' + cityname.lower().replace(' ', '') + '.craigslist.org/search/lac/fuo?postal=' + str(thezip) + '&query=' + item + '&s=' + str(120 * ipage) + '&search_distance=30') proxies = setup_proxy_rotation() proxy_index = random_proxy(proxies) proxy = proxies[proxy_index] page = requests.get(next_url, headers=hdrs, proxies=proxy) soup = BeautifulSoup(page.content, 'html.parser') for i in soup.find_all('a', class_='result-image gallery empty'): badid.append(int(str(i).split('/')[-2].split('.')[0])) badcounter = 0 for i in range(len(soup.find_all('a', class_="result-title"))): #i=116 tit = str(soup.find_all('a', class_="result-title")[i]) theid.append(int(tit.split(' ')[3].replace('data-id="', '')[0:-2])) theurl.append(tit.split(' ')[4].split('"')[1]) trow = str(soup.find_all('li', class_='result-row')[i]) theprice.append( int( trow.split('result-meta')[1].split(">")[2].split("<") [0].replace('$', ''))) if ('result-image gallery empty' in str(soup.find_all('li', class_='result-row')[i])): theimgurl.append('bad') badcounter += -1 else: imgid = str( soup.find_all('a', class_='result-image gallery')[ i + badcounter]).split('"')[3].split(',')[0][2:] tturl = (theurl[i].replace(theurl[i].split('/')[-2], imgid + '_300x300')) theimgurl.append('https://images.craigslist.org/' + tturl.split('/')[-2] + '.jpg') # Save image to disk outfile = thedir + city + '/' + item + '_images/' + str( theid[i]) + '.jpg' if not os.path.exists(outfile): urllib.request.urlretrieve(theimgurl[i], outfile) timepost = str(soup.find_all( 'time', class_='result-date')[i]).split('"')[3] mydelta = (now - datetime.strptime(timepost, '%Y-%m-%d %H:%M')) time_since_posting.append(mydelta.days + mydelta.seconds / 60 / 60 / 24) # Get rid of shitty posts boolcompare = [True] * len(theid) for i in range(len(boolcompare)): if theid[i] in badid: boolcompare[i] = False theid = list(np.array(theid)[boolcompare]) theprice = list(np.array(theprice)[boolcompare]) theurl = list(np.array(theurl)[boolcompare]) time_since_posting = list(np.array(time_since_posting)[boolcompare]) 
theimgurl = list(np.array(theimgurl)[boolcompare]) todays_scrape_df = pd.DataFrame( list(zip(theprice, time_since_posting, theimgurl, theurl)), columns=['price', 'time_since_posting', 'imgurl', 'url'], index=theid) return todays_scrape_df
def first_scrape(thedir, item, city, modify_id=False, modify_url=False, modify_price=False): if not modify_id: (theid, theurl, theprice) = gather_ids(thedir, item, city, cityname) else: theid = modify_id theurl = modify_url theprice = modify_price badid = [] imgurl = [''] * len(np.array(theid)) postdate = [''] * len(np.array(theid)) time_since_posting = [0] * len(np.array(theid)) proxies = setup_proxy_rotation() for i in tqdm_notebook(range(len(theurl))): #if not os.path.exists(thedir+city+'/'+item+'_images/'+str(theid[i])+'.jpg'): #headers = requests.utils.default_headers() #headers['User-Agent'] = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'+ # ' (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36') hdrs = { 'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 ' + '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive' } if not (i % 10): # Renew proxies every 50, otherwise this is pretty slow proxies = setup_proxy_rotation() proxy_index = random_proxy(proxies) proxy = proxies[proxy_index] page = requests.get(theurl[i], proxies=proxy, headers=hdrs) singlesoup = BeautifulSoup(page.content, 'html.parser') if len(singlesoup.find_all('meta', property="og:image")) == 0: print('bad ID') badid.append(theid[i]) else: tmp_image_url = str( singlesoup.find_all('meta', property="og:image")[0]).split('"')[1] #if not (i % 50): # print('changing proxy') # proxy_index = random_proxy(proxies) # proxy = proxies[proxy_index] # request = requests.get(tmp_image_url, proxies=proxy, headers={'Connection':'close'}) #else: check_if_exists = None while check_if_exists == None: try: check_if_exists = requests.get(tmp_image_url, proxies=proxy, headers=hdrs) except: print("%% Taking a nap, page check didn't like me") time.sleep(5) if check_if_exists.status_code == 200: # Save the image URL path imgurl[i] = tmp_image_url # Save the post image outfile = thedir + city + '/' + item + '_images/' + str( theid[i]) + '.jpg' if not os.path.exists(outfile): urllib.request.urlretrieve(tmp_image_url, outfile) # Save the post date information adate = str(singlesoup.find('time')).split('"')[3] adate = adate.replace('T', ' ') adate = adate.replace('-', ' ') adate = adate[0:-5] tpostdate = datetime.strptime(adate, '%Y %m %d %H:%M:%S') postdate[i] = (tpostdate.strftime("%d-%m-%Y")) # And time since posting datetime_object = datetime.strptime(adate, '%Y %m %d %H:%M:%S') time_since_posting[i] = ((now - datetime_object).days) else: badid.append(theid[i]) # Get rid of shitty posts boolcompare = [True] * len(theid) for i in range(len(boolcompare)): if theid[i] in badid: boolcompare[i] = False theid = list(np.array(theid)[boolcompare]) theprice = list(np.array(theprice)[boolcompare]) theurl = list(np.array(theurl)[boolcompare]) todays_scrape_df = pd.DataFrame( list(zip(theprice, time_since_posting, imgurl, theurl)), columns=['price', 'time_since_posting', 'imgurl', 'url'], index=theid) return todays_scrape_df
#=============================JUPYTER NOTEBOOK==================================
## Progress bar
from tqdm import tqdm_notebook
from time import sleep
for i in tqdm_notebook(range(100)):
    sleep(0.01)
#-------------------------------------------------------------------------------
## matplotlib inline
%matplotlib inline
#-------------------------------------------------------------------------------
## Change from scientific notation to decimal point in pandas
pd.set_option('display.float_format', lambda x: '%.0f' % x)
#-------------------------------------------------------------------------------
## Limit floats output to 3 decimal points
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
#-------------------------------------------------------------------------------
## Ignore warnings
import warnings; warnings.filterwarnings('ignore')
#-------------------------------------------------------------------------------
## Run SQL queries in Pandas
from pysqldf import SQLDF; sqldf = SQLDF(globals()); q = getattr(sqldf, 'execute')
import warnings; warnings.filterwarnings('ignore')
#-------------------------------------------------------------------------------
## Styling tables in Notebook
from IPython.display import HTML
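#-------------------------------------------------------------------------------
## Progress bar via tqdm.auto (suggested addition, not in the original list):
## tqdm.auto picks the notebook widget bar under Jupyter and falls back to the
## plain console bar elsewhere, so the same import works in both contexts.
from tqdm.auto import tqdm
from time import sleep
for i in tqdm(range(100)):
    sleep(0.01)
#-------------------------------------------------------------------------------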
def train(self): for epoch in tqdm_notebook(range(self.epochs), desc='epochs'): # for each epochs, we shuffle the list of all the datasets c = list(zip(self.sample_train, self.sample_target)) shuffle(c) self.sample_train, self.sample_target = zip(*c) loss_total = 0 steps = 0 steps_nnet = 0 print(self.sample_train[0]) # Iterate all songs by the length of sample input (total_songs) and batches (batch_song) for i in tqdm_notebook(range(0, self.total_songs, self.batch_song), desc='MUSIC'): # EXAMPLE: [0,5,10,15,20] FOR TOTAL_SONGS = 20 AND BATCH_SONG = 5 steps += 1 #inputs_nnet_large, outputs_nnet_large = generate_batch_song( # self.sample_input, self.batch_song, start_index=i, fs=self.frame_per_second, # seq_len=seq_len, use_tqdm=False) # We use the function that have been defined here #inputs_nnet_large = np.array(self.note_tokenizer.transform(inputs_nnet_large), dtype=np.int32) #outputs_nnet_large = np.array(self.note_tokenizer.transform(outputs_nnet_large), dtype=np.int32) # EXAMPLE LARGE INPUTS = ARRAY([1,2,3,4],[2,3,4,5],[2,3,4,5],[2,3,4,5],[1,2,3,4]) input_batch = [ y for x in self.sample_train[i:i + self.batch_song] for y in x ] print(input_batch) break output_batch = [ y for x in self.sample_target[i:i + self.batch_song] for y in x ] c = list(zip(input_batch, output_batch)) print(c) sample_in = sample(c, 10000) input_batch, output_batch = zip(*sample_in) print(len(input_batch)) inputs_nnet_large = np.array(input_batch) outputs_nnet_large = np.array(output_batch) # Get an index of all windows in a song index_shuffled = np.arange(start=0, stop=len(inputs_nnet_large)) np.shuffle(index_shuffled) for nnet_steps in tqdm_notebook( range(0, len(index_shuffled), self.batch_nnet_size)): steps_nnet += 1 current_index = index_shuffled[nnet_steps:nnet_steps + self.batch_nnet_size] inputs_nnet, outputs_nnet = inputs_nnet_large[ current_index], outputs_nnet_large[current_index] # To make sure no exception thrown by tensorflow on autograph if len(inputs_nnet) // self.batch_nnet_size != 1: break loss = self.train_step(inputs_nnet, outputs_nnet) loss_total += tf.math.reduce_sum(loss) if steps_nnet % 20 == 0: print("epochs {} | Steps {} | total loss : {}".format( epoch + 1, steps_nnet, loss_total)) checkpoint.save(file_prefix=self.checkpoint_prefix)
#%%
bpr_related_subreddits('dogs')

#%%
users = data['user'].cat.categories.array.to_numpy()

#%%
write_bpr_recommendations = False

#%%
user_comments = comments.T.tocsr()

if write_bpr_recommendations:
    # generate recommendations for each user and write out to a file
    with tqdm.tqdm_notebook(total=len(users)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for userid, username in enumerate(users):
                for subredditid, score in model.recommend(userid, user_comments):
                    o.write("%s\t%s\t%s\n" % (username, subreddits[subredditid], score))
                progress.update(1)

#%% [markdown]
# ### Sample user recommendations
#
# We went through the list of subreddits in which the user 'xkcd_transciber' commented.
# Looking at the kinds of subreddits the user follows, the predictions appear good. This
# is just one sample; we are saving the recommendations for all users in a file and will
# also write the AUC score function to get exact scores for the generated recommendations.

#%%
import subprocess
import numpy as np
import mdtraj as md
from tqdm import tqdm_notebook
import time

print("DSSP (Define Secondary Structure of Proteins) Hydrogen Bonding Algorithm for G protein\n")
print("'H' : Helix            'G' : 3-helix (3/10 helix)\n"
      "'E' : Beta ladder      'B' : Beta-bridge\n"
      "'C' : Random Coil      'T' : Hydrogen bonded turn\n")

gro_reference = '/Users/tue53144/Gprotein/gro/p8600/protein_only.gro'
traj_references = '/Volumes/Vav6/PROJ8600/RUN%d/CLONE%d/frame%d.xtc'
nruns = 1
nclones = 40
trajs = []
dssp = []
for run in tqdm_notebook(range(nruns), desc='Loading Trajs.'):
    time.sleep(0.01)
    for clone in range(nclones):
        nframes = int(subprocess.check_output(
            'ls /Volumes/Vav6/PROJ8600/RUN%d/CLONE%d/frame*.xtc | wc -l' % (run, clone),
            shell=True))
        frame = nframes - 1
        if frame >= 0:
            # Doing all frames would take ~7 hours...
            for frm in range(frame):
                loadtrajs = md.load(traj_references % (run, clone, frm), top=gro_reference)
                if loadtrajs.time[-1] >= 0:
                    trajs.append(loadtrajs)
def interpolate_catalog_sb(cat, bandname='r', radtype='eff', sbname='sbeff_r', radname='rad_sb', loopfunc=lambda x:x): """ Takes a DECaLS tractor catalog and adds r-band half-light surface brightness to it. ``radtype`` can be "eff" for model-determined reff, or a angle-unit quantity for a fixed aperture SB For details/tests that this function works, see the "DECALS low-SB_completeness figures" notebook. """ bandidx = decam_band_name_to_idx[bandname] if 'decam_apflux' in cat.colnames: r_ap_fluxes = cat['decam_apflux'][:, bandidx, :] elif 'apflux_' + bandname in cat.colnames: r_ap_fluxes = cat['apflux_' + bandname] else: raise ValueError('found no valid {}-band apflux column!'.format(bandname)) assert r_ap_fluxes.shape[-1] == 8, 'Column does not have 8 apertures' expflux_r = np.empty_like(r_ap_fluxes[:, 0]) rad = np.empty(len(r_ap_fluxes[:, 0])) ap_sizesv = DECALS_AP_SIZES.to(u.arcsec).value intr = interpolate.BarycentricInterpolator(ap_sizesv, [0]*len(ap_sizesv)) if loopfunc == 'ProgressBar': from astropy.utils.console import ProgressBar loopfunc = lambda x: ProgressBar(x) elif loopfunc == 'NBProgressBar': from astropy.utils.console import ProgressBar loopfunc = lambda x: ProgressBar(x, ipython_widget=True) elif loopfunc == 'tqdm': import tqdm loopfunc = lambda x: tqdm.tqdm(x) elif loopfunc == 'tqdm_notebook': import tqdm loopfunc = lambda x: tqdm.tqdm_notebook(x) for i in loopfunc(range(len(r_ap_fluxes))): f = r_ap_fluxes[i] if radtype != 'eff': r = radtype elif cat['type'][i] == 'PSF ': if 'decam_psfsize' in cat.colnames: r = cat['decam_psfsize'][i, bandidx] else: r = cat['psfsize_' + bandname][i] elif cat['type'][i] == 'DEV ': if 'shapeDev_r' in cat.colnames: r = cat['shapeDev_r' ][i] else: # DR4 changed to all lower-case... WWHHHHYYY!!?!?!??!?!?!?!? r = cat['shapedev_r'][i] else: if 'shapeExp_r' in cat.colnames: r = cat['shapeExp_r'][i] else: # DR4 changed to all lower-case... WWHHHHYYY!!?!?!??!?!?!?!? r = cat['shapeexp_r'][i] intr.set_yi(f) expflux_r[i] = intr(r) rad[i] = r cat[sbname] = compute_sb(rad*u.arcsec, np.array(expflux_r)) cat[radname] = rad*u.arcsec
def get_term_statistics(corpus_one, corpus_two, freq_num, psudeocount=1, disable_progressbar=False): """ This function is designed to perform the folllowing calculations: - log likelihood of contingency table - log odds ratio keywords: corpus_one - a dataframe object with terms and counts corpus_two - a datafram object with terms and counts freq_num - number of most common words to use from both corpora psudeocount - the psudocount to avoid divide by zero disable_progressbar - show the progress bar? """ spacy_nlp = spacy.load("en_core_web_sm") stop_word_list = list(spacy_nlp.Defaults.stop_words) # Remove special characters here when calculating odds ratio term_list = set( corpus_one.query("lemma.str.len() > 1").query( "lemma.str.contains(r'[a-z]')"). query(f"lemma not in {stop_word_list}").sort_values( "count", ascending=False).head(freq_num).lemma.values) | set( corpus_two.query("lemma.str.len() > 1"). query("lemma.str.contains(r'[a-z]')").query( f"lemma not in {stop_word_list}").sort_values( "count", ascending=False).head(freq_num).lemma.values) corpus_one_total = corpus_one["count"].sum() corpus_two_total = corpus_two["count"].sum() term_data = [] for term in tqdm_notebook(term_list, disable=disable_progressbar): corpus_one_term_count = ( corpus_one.query(f"lemma=={repr(term)}")["count"].values[0] if term in corpus_one.lemma.tolist() else 0) corpus_two_term_count = ( corpus_two.query(f"lemma=={repr(term)}")["count"].values[0] if term in corpus_two.lemma.tolist() else 0) observed_contingency_table = np.array([ [corpus_one_term_count, corpus_two_term_count], [corpus_one_total, corpus_two_total], ]) # Log Likelihood ## add psudeocount to prevent log(0) observed_contingency_table += psudeocount a, b, c, d = ( observed_contingency_table[0][0], observed_contingency_table[0][1], observed_contingency_table[1][0], observed_contingency_table[1][1], ) # Obtained from (Kilgarriff, 2001) - Comparing Corpora def LL(a, b, c, d): return 2 * (a * np.log(a) + b * np.log(b) + c * np.log(c) + d * np.log(d) - (a + b) * np.log(a + b) - (a + c) * np.log(a + c) - (b + d) * np.log(b + d) - (c + d) * np.log(c + d) + (a + b + c + d) * np.log(a + b + c + d)) log_likelihood = LL(a, b, c, d) # Log Odds log_ratio = float((a * d) / (b * c)) term_data.append({ "lemma": term, "corpus_one_a": a, "corpus_two_b": b, "corpus_one_c": c, "corpus_two_d": d, "log_likelihood": log_likelihood, "odds_ratio": log_ratio, }) return pd.DataFrame.from_records(term_data)
def collect_comment(self, keyword):
    process = tqdm_notebook(self.href)
    for news in process:
        process.set_description("Collecting comments...")
        self.__driver.implicitly_wait(3)
        self.__driver.get(news)
        try:
            # Click the initial "view comments" button
            self.__driver.find_element_by_css_selector(
                ".u_cbox_btn_view_comment").click()
            self.__driver.implicitly_wait(3)
        # Raised when the button uses a different markup
        except Exception as e:
            try:
                self.__driver.find_element_by_css_selector(
                    ".simplecmt_link_text").click()
                self.__driver.implicitly_wait(3)
            except:
                continue
                # pass
        # Keep clicking the "more" button below.
        # Get the news article and the publisher name.
        company = self.get_company_name()
        collect_text = ""
        company = 'C:/Users/khk37/뉴스기사/' + keyword + company.strip()
        try:
            if not os.path.exists(company.strip()):
                os.mkdir(company)
        except Exception as e:
            print("os.mkdir error", e)
        try:
            collect_text = self.get_news_title(company, '.end_tit')
        except:
            try:
                collect_text = self.get_news_title(company, '.tts_head')
            except:
                collect_text = self.get_news_title(company, '#articleTitle')
        try:
            while True:
                self.__driver.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight);")
                self.__driver.find_element_by_css_selector(
                    ".u_cbox_btn_more").click()
                self.__driver.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight);")
        except exceptions.ElementNotVisibleException as e:
            # No more comment pages
            pass
        except Exception as e:
            # Check any other exception that occurs
            self.page += 1
            print("Error: ", e)
        # document.body.scrollHeight
        # Scroll back to the top.
        # self.__driver.execute_script("window.scrollTo(0, 0);")
        soup = self.parsing_html(self.__driver.page_source)
        comment_list = soup.find_all("span", {"class": "u_cbox_contents"})
        # last_height = self.__driver.execute_script("return document.body.scrollHeight")
        # elem = self.__driver.find_element_by_tag_name("body")
        down = 0
        number = 1
        for comment in comment_list:
            try:
                collect_text._write_text(
                    self.model.predict_pos_neg(comment.text))
            except:
                continue
        self.page += 1
    process.set_description("Comment collection complete.")
    return self.model.bad_or_good()
index=dataset.index) dataset_scaled['return'] = dataset['return'] dataset_scaled.describe() # In[7]: import tqdm n = 3 X = [] y = [] indexes = [] dataset_scaled_x = dataset_scaled[feature_names] for i in tqdm.tqdm_notebook(range(0, len(dataset_scaled) - n)): X.append(dataset_scaled_x.iloc[i:i + n].values) y.append(dataset_scaled['return'].iloc[i + n - 1]) indexes.append(dataset_scaled.index[i + n - 1]) #dataset_scaled.head() # In[8]: import numpy as np X = np.array(X) y = np.array(y) # In[9]: indexes = np.array(indexes)
plt.hist(sampledat, 200, normed=True); plt.yscale('log'); # In[5]: np.random.randint(0, len(sampledat), 10) # In[39]: # generate some data bins = np.linspace(-4,4,100) hists = {} stats = {} for npts in tqdm.tqdm_notebook(range(1,102,40)): d1 = sampledat[np.random.randint(0, len(sampledat), npts)] with pm.Model() as model: alpha = pm.Uniform('loc', -10, 10) # beta = pm.Uniform('dist', 1, 1) x = pm.Cauchy(name='x', alpha=alpha, beta=1, observed=d1) trace = pm.sample(10000) hists[npts] = np.histogram(trace['loc'], bins) stats[npts] = np.percentile(trace['loc'], (1, 5, 25, 50, 75, 95, 99)) # In[40]: keys = sorted(list(hists.keys())) for k in keys: p = plt.plot(tb.bin_edges_to_center(bins), hists[k][0]/np.max(hists[k][0]),
def train(self, target,source,gen_optimizer,disc_optimizer,num_epochs=10, disc_steps=1, gen_lr_schedule=None,disc_lr_schedule=None, model_dir=os.getcwd(), save_interval=100,notebook_mode=False,batch_log=True,save_logs=None,display_metrics=True,save_metrics=True): assert(len(target.dataset) == len(source.dataset)) assert(disc_steps < len(target.dataset)) if not os.path.exists(model_dir): os.mkdir(model_dir) self.model_dir = model_dir models_gen = os.path.join(model_dir, "gen_models") models_disc = os.path.join(model_dir, "disc_models") if not os.path.exists(models_gen): os.mkdir(models_gen) if not os.path.exists(models_disc): os.mkdir(models_disc) iterations = 0 from tqdm import tqdm_notebook from tqdm import tqdm train_start_time = time() for e in tqdm(range(num_epochs)): self.gen_model.train() self.disc_model.train() self.on_epoch_start(e) running_gen_loss = torch.Tensor([0.0]) running_disc_loss = torch.Tensor([0.0]) gen_loss = 0.0 disc_loss = 0.0 gen_data_len = 0 disc_data_len = 0 if notebook_mode and batch_log: progress_ = tqdm_notebook(enumerate(zip(target,source))) elif batch_log: progress_ = tqdm(enumerate(zip(target,source))) else: progress_ = enumerate(zip(target,source)) init_time = time() for i,(t,s) in progress_: if isinstance(t, list) or isinstance(t, tuple): inputs = t[0] else: inputs = t batch_size = inputs.size(0) disc_data_len += batch_size if len(self.__input_hooks) > 0: for hook in self.__input_hooks: inputs = hook(inputs) if isinstance(t, list): t[0] = inputs elif isinstance(t, tuple): t = (inputs,t[1]) else: t = inputs self.__disc_train_func__(t, s, disc_optimizer, running_disc_loss, e, i) disc_loss = running_disc_loss.data[0] / disc_data_len if (i+1) % disc_steps == 0: self.__gen_train_func__(t, s, gen_optimizer, running_gen_loss, e, i) gen_data_len += batch_size gen_loss = running_gen_loss.data[0] / gen_data_len if batch_log: progress_dict = {"Gen Loss": gen_loss,"Disc Loss":disc_loss} progress_.set_postfix(progress_dict) iterations += 1 if iterations % save_interval == 0: self.save(s,iterations) self.show(s,iterations) self.on_batch_end(e, i, gen_loss, disc_loss) if self.cuda: cuda.synchronize() duration = time() - init_time self.disc_loss_history.append(disc_loss) self.gen_loss_history.append(gen_loss) if gen_lr_schedule is not None: lr = gen_lr_schedule(e) adjust_learning_rate(lr,gen_optimizer) if disc_lr_schedule is not None: lr = disc_lr_schedule(e) adjust_learning_rate(lr, disc_optimizer) model_file = os.path.join(models_gen, "gen_model_{}.pth".format(e)) self.save_generator(model_file) model_file = os.path.join(models_disc, "disc_model_{}.pth".format(e)) self.save_discriminator(model_file) print("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(e, duration, gen_loss,disc_loss)) if save_logs is not None: logfile = open(save_logs, "a") logfile.write("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(e, duration, gen_loss,disc_loss)) logfile.close() epoch_arr = [x for x in range(e + 1)] if display_metrics or save_metrics: save_path = None if save_metrics: save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e)) visualize(epoch_arr, [PlotInput(value=self.gen_loss_history, name="Generator Loss", color="red"), PlotInput(value=self.disc_loss_history, name="Discriminator Loss", color="red")],display=display_metrics, save_path=save_path) self.on_epoch_end(e,gen_loss, disc_loss, duration) train_end_time = time() - train_start_time self.on_training_completed(train_end_time)
def train(self, train_loader, loss_fn, optimizer, train_metrics, test_loader=None, test_metrics=None,
          num_epochs=10, lr_schedule=None, save_models="all", model_dir=os.getcwd(), notebook_mode=False,
          batch_log=True, save_logs=None, display_metrics=True, save_metrics=True):

    if save_models not in ["all", "best"]:
        raise ValueError("save_models must be 'all' or 'best', {} is invalid".format(save_models))
    if save_models == "best" and test_loader is None:
        raise ValueError("save_models can only be 'best' when a test_loader is provided")

    if test_loader is not None:
        if test_metrics is None:
            raise ValueError("You must provide a metric for your test data")
        elif len(test_metrics) == 0:
            raise ValueError("test metrics cannot be an empty list")

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    models_all = os.path.join(model_dir, "all_models")
    models_best = os.path.join(model_dir, "best_models")

    if not os.path.exists(models_all):
        os.mkdir(models_all)

    if not os.path.exists(models_best) and test_loader is not None:
        os.mkdir(models_best)

    from tqdm import tqdm_notebook
    from tqdm import tqdm

    best_metric = 0.0
    train_start_time = time()
    for e in tqdm(range(num_epochs)):
        print("Epoch {} of {}".format(e, num_epochs))

        for metric in train_metrics:
            metric.reset()

        self.model.train()
        self.on_epoch_start(e)

        running_loss = torch.Tensor([0.0])
        train_loss = 0.0
        data_len = 0

        if notebook_mode and batch_log:
            progress_ = tqdm_notebook(enumerate(train_loader))
        elif batch_log:
            progress_ = tqdm(enumerate(train_loader))
        else:
            progress_ = enumerate(train_loader)

        main_batch_size = 0

        init_time = time()

        for i, data in progress_:
            self.on_batch_start(e, i)

            if isinstance(data, list) or isinstance(data, tuple):
                inputs = data[0]
            else:
                inputs = data
            batch_size = inputs.size(0)

            if main_batch_size < batch_size:
                main_batch_size = batch_size

            if len(self.__input_hooks) > 0:
                for hook in self.__input_hooks:
                    inputs = hook(inputs)

            if isinstance(data, list):
                data[0] = inputs
            elif isinstance(data, tuple):
                data = (inputs, data[1])
            else:
                data = inputs

            self.__train_func__(data, optimizer, loss_fn, train_metrics, running_loss, e, i)

            data_len += batch_size
            train_loss = running_loss.item() / data_len

            if batch_log:
                progress_message = ""
                for metric in train_metrics:
                    progress_message += "Train {} : {}".format(metric.name, metric.getValue())
                progress_.set_description("{}/{} batches ".format(
                    int(ceil(data_len / main_batch_size)),
                    int(ceil(len(train_loader.dataset) / main_batch_size))))
                progress_dict = {"Train Loss": train_loss}
                for metric in train_metrics:
                    progress_dict["Train " + metric.name] = metric.getValue()
                progress_.set_postfix(progress_dict)

            self.on_batch_end(e, i, train_metrics, train_loss)

        if self.cuda:
            cuda.synchronize()

        self.loss_history.append(train_loss)

        duration = time() - init_time

        if lr_schedule is not None:
            lr = lr_schedule(e)
            adjust_learning_rate(lr, optimizer)

        model_file = os.path.join(models_all, "model_{}.pth".format(e))
        self.save_model(model_file)

        logfile = None
        if save_logs is not None:
            logfile = open(save_logs, "a")

        print(os.linesep + "Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))
        if logfile is not None:
            logfile.write(os.linesep + "Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))

        if test_loader is not None:
            message = "Accuracy did not improve"
            current_best = best_metric
            self.evaluate(test_loader, test_metrics)
            result = test_metrics[0].getValue()
            if result > current_best:
                best_metric = result
                message = "{} improved from {} to {}".format(test_metrics[0].name, current_best, result)
                model_file = os.path.join(models_best, "model_{}.pth".format(e))
                self.save_model(model_file)
                print(os.linesep + "{} New Best Model saved in {}".format(message, model_file))
                if logfile is not None:
                    logfile.write(os.linesep + "{} New Best Model saved in {}".format(message, model_file))
            else:
                print(os.linesep + message)
                if logfile is not None:
                    logfile.write(os.linesep + message)

            for metric in test_metrics:
                print("Test {} : {}".format(metric.name, metric.getValue()))
                if logfile is not None:
                    logfile.write(os.linesep + "Test {} : {}".format(metric.name, metric.getValue()))

        for metric in train_metrics:
            print("Train {} : {}".format(metric.name, metric.getValue()))
            if logfile is not None:
                logfile.write(os.linesep + "Train {} : {}".format(metric.name, metric.getValue()))

        if logfile is not None:
            logfile.close()

        for metric in train_metrics:
            metric.add_history()

        epoch_arr = [x for x in range(e + 1)]

        if display_metrics or save_metrics:
            save_path = None
            if save_metrics:
                save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))
            visualize(epoch_arr, [PlotInput(value=self.loss_history, name="Train Loss", color="red")],
                      display=display_metrics, save_path=save_path)

        if test_loader is not None and (display_metrics or save_metrics):
            for metric in test_metrics:
                save_path = None
                if save_metrics:
                    save_path = os.path.join(model_dir, "test_{}_epoch_{}.png".format(metric.name, e))
                visualize(epoch_arr, [PlotInput(value=metric.history, name="Test " + metric.name, color="blue")],
                          display=display_metrics, save_path=save_path)

        if display_metrics or save_metrics:
            for metric in train_metrics:
                save_path = None
                if save_metrics:
                    save_path = os.path.join(model_dir, "train_{}_epoch_{}.png".format(metric.name, e))
                visualize(epoch_arr, [PlotInput(value=metric.history, name="Train " + metric.name, color="blue")],
                          display=display_metrics, save_path=save_path)

        self.on_epoch_end(e, train_metrics, test_metrics, train_loss, duration)

    train_end_time = time() - train_start_time
    self.on_training_completed(train_metrics, test_metrics, train_end_time)
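# Example (not in the original source): a standalone sketch of the
# "save only when the test metric improves" checkpointing pattern used in the
# train() method above, with a hypothetical per-epoch metric sequence instead
# of a real evaluation loop.
import os
import torch
import torch.nn as nn

model = nn.Linear(4, 2)                         # stand-in for self.model
model_dir = "./demo_models"
os.makedirs(model_dir, exist_ok=True)

best_metric = 0.0
dummy_test_accuracy = [0.52, 0.61, 0.58, 0.67]  # hypothetical accuracy per epoch
for epoch, result in enumerate(dummy_test_accuracy):
    if result > best_metric:
        best_metric = result
        model_file = os.path.join(model_dir, "model_{}.pth".format(epoch))
        torch.save(model.state_dict(), model_file)  # only the improving epochs are kept
        print("accuracy improved to {} -> saved {}".format(result, model_file))
    else:
        print("accuracy did not improve in epoch {}".format(epoch))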
X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile( x_roll_mean, 0.95) X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile( x_roll_mean, 0.99) X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean( np.diff(x_roll_mean)) X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = np.mean( np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0]) X.loc[seg_id, 'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max() # In[ ]: # iterate over all segments for seg_id in tqdm_notebook(range(segments)): seg = train_df.iloc[seg_id * rows:seg_id * rows + rows] create_features(seg_id, seg, train_X) train_y.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1] # Let's check the result. We plot the shape and the head of train_X. # In[ ]: train_X.shape # In[ ]: train_X.head(10) # We scale the data.
pd.DataFrame(y_test).to_csv('./predictions/y_true.csv',
                            index=False,
                            encoding='utf-8')


def get_coefs(word, *arr):
    try:
        # print("word:",word)
        # print("arr:",arr)
        return word, np.asarray(arr, dtype='float32')
    except:
        return None, None


embeddings_index = dict(
    get_coefs(*o.strip().split()) for o in tqdm_notebook(
        open('./embeddings/glove.twitter.27B.50d.txt', encoding="utf8")))
#print(embeddings_index)

embed_size = 50
# drop entries whose vectors do not have the expected dimensionality
i = 0  # counts how many malformed entries are removed
for k in tqdm_notebook(list(embeddings_index.keys())):
    v = embeddings_index[k]
    try:
        if v.shape != (embed_size, ):
            embeddings_index.pop(k)
            i = i + 1
    except:
        pass
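# Example (not in the original source): the same get_coefs parsing applied to a
# couple of in-memory GloVe-style lines, to show the expected
# "word dim1 dim2 ..." format without needing the full embedding file.
import numpy as np

sample_lines = [
    "hello 0.12 -0.43 0.88",
    "world 0.05 0.31 -0.27",
]
toy_index = dict(get_coefs(*line.strip().split()) for line in sample_lines)
print(toy_index["hello"].shape)   # (3,) for this 3-dimensional toy example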
def iterate_file(self,
                 fname=DS_FILE_NAME,
                 top_n_train=100000,
                 total=125000,
                 learning_rate=0.1,
                 tolerance=1e-16,
                 lmbda=0.01):
    self._loss = []
    n = 0
    accurate_sample = []
    # open the file
    with open(fname, 'r') as f:
        # walk over the lines of the file
        for line in tqdm_notebook(f, total=total, mininterval=1):
            desired_tags = []
            pair = line.strip().split('\t')
            if len(pair) != 2:
                continue
            sentence, tags = pair
            # the words of the question are the features x
            sentence = sentence.split(' ')
            # the tags of the question are the target y
            tags = set(tags.split(' '))

            # loss value for the current example
            sample_loss = 0

            # push gradients through for every tag
            for tag in self._tags:
                # the target is 1 if the current example carries the current tag
                y = int(tag in tags)
                # compute the linear combination of the weights and the features
                # initialize z
                z = self._b[tag]
                for word in sentence:
                    # in the test phase, ignore words that are not in the vocabulary
                    if n >= top_n_train and word not in self._vocab:
                        continue
                    if word not in self._vocab:
                        self._vocab[word] = len(self._vocab)
                    z += self._w[tag][self._vocab[word]]

                # compute the probability that the tag is present
                # (numerically stable sigmoid)
                if z >= 0:
                    sigma = 1 / (1 + np.exp(-z))
                else:
                    sigma = 1 - 1 / (1 + np.exp(z))

                # update the loss value for the current example
                if y == 1:
                    sample_loss += -y * np.log(np.max([tolerance, sigma]))
                else:
                    sample_loss += -(1 - y) * np.log(1 - np.min([1 - tolerance, sigma]))

                # if we are still in the training part, update the parameters
                if n < top_n_train:
                    # derivative of the log-likelihood with respect to the weight
                    dLdw = y - sigma

                    # gradient step:
                    # we minimize the negative log-likelihood (second minus sign),
                    # so we move against the gradient (first minus sign)
                    for word in sentence:
                        self._w[tag][self._vocab[word]] -= -learning_rate * dLdw
                    self._b[tag] -= -learning_rate * dLdw

                if sigma > 0.9:
                    desired_tags.append(tag)

            if (n > top_n_train):
                accurate_sample.append(
                    len(tags.intersection(desired_tags)) / len(tags.union(desired_tags)))

            n += 1
            self._loss.append(sample_loss)

    return (np.mean(accurate_sample))
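# Example (not in the original source): the numerically stable sigmoid and the
# per-tag gradient step used above, isolated into a tiny standalone snippet so
# the update rule can be checked on a single weight.
import numpy as np

def stable_sigmoid(z):
    # evaluate 1 / (1 + exp(-z)) without overflowing for large |z|
    if z >= 0:
        return 1.0 / (1.0 + np.exp(-z))
    return 1.0 - 1.0 / (1.0 + np.exp(z))

learning_rate = 0.1
w, b = 0.0, 0.0           # one weight and the bias for a single tag/word pair
x, y = 1.0, 1             # the word is present (binary feature), the tag applies

z = b + w * x
sigma = stable_sigmoid(z)
dLdw = y - sigma          # gradient of the log-likelihood
w += learning_rate * dLdw * x
b += learning_rate * dLdw
print(w, b)               # both move toward predicting the positive tag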
g_optimizer = torch.optim.Adam(model.G.parameters(), lr=lr, betas=(0.5, 0.999)) z_sample = Variable(torch.randn(64, z_dim)) z_sample = z_sample.to(device) D_loss = [] G_loss = [] GP = [] images = [] lam = 10. try: for epoch in range(args.STARTING_EPOCH, args.STARTING_EPOCH + epochs): for i, (imgs, _) in enumerate(tqdm_notebook(trainloader)): step = epoch * len(trainloader) + i + 1 # set train model.G.train() # leafs imgs = Variable(imgs) bs = imgs.size(0) z = Variable(torch.randn(bs, z_dim)) imgs, z = imgs.to(device), z.to(device) f_imgs = model.G(z) r_logit = model.D(imgs) f_logit = model.D(f_imgs.detach())
cur_seg = final_model.predict(c_img)[0] cur_seg = binary_opening(cur_seg>0.5, np.expand_dims(disk(2), -1)) return cur_seg, c_img def pred_encode(img): cur_seg, _ = predict(img) cur_rles = rle_encode(cur_seg) return [img, cur_rles if len(cur_rles) > 0 else None] from tqdm import tqdm_notebook test_paths = np.array(os.listdir(test_image_dir)) #test_paths = test_paths[0:10] out_pred_rows = [] for c_img_name, index in zip(tqdm_notebook(test_paths), range(len(test_paths))): out_pred_rows += [pred_encode(c_img_name)] if index % 200 == 0: print('Processed {} test images'.format(index)) sub = pd.DataFrame(out_pred_rows) sub.columns = ['ImageId', 'EncodedPixels'] sub.to_csv('submission_003.csv', index=False) test_masks = pd.read_csv('submission_002.csv') test_masks['ships'] = masks['EncodedPixels'].map(lambda c_row: 1 if isinstance(c_row, str) else 0) test_masks.count() test_masks.query('ships>0').count() train_set.shape train_set.query('ship_num>0').shape
def calibrate(arguments): calibration_data_filename, groups_to_calibrate, ids_in_subwatershed, parameter_realz, objective_function, minimize_objective_function, cpu = arguments # Load calibration data calibration_data = pickle.load( open(os.path.join(parent_dir,'calibration_data',calibration_data_filename))) calibration_data = calibration_data[spinup_date:stop_date] N = len(parameter_realz) # for each parameter realization objs = [] best_fit = pd.DataFrame({'modeled':np.zeros(len(timestamps_hillslope))}, index=timestamps_hillslope).resample('D').mean() if minimize_objective_function: objs_curr = np.inf else: objs_curr = -np.inf best_index = -1 desc = "Core #%s"%(cpu) for i in tqdm_notebook(range(N), desc=desc): solved_groups = {} parameter_group_params = {} parameter_group_params = parameter_realz[i] solved_group_hillslopes_dict = {} for group_id in groups_to_calibrate: parameter_group_id = group_id[0] climate_group_id = group_id[1] vz = parameter_group_params[parameter_group_id]['vz'](**parameter_group_params[parameter_group_id]) gz = parameter_group_params[parameter_group_id]['gz'](**parameter_group_params[parameter_group_id]) rew = REW(vz, gz, **{'pet':climate_group_forcing[climate_group_id].pet, 'ppt':climate_group_forcing[climate_group_id].ppt, 'aspect':90}) # storageVZ = np.zeros(np.size(t)) # storageGZ = np.zeros(np.size(t)) discharge = np.zeros(np.size(t)) leakage = np.zeros(np.size(t)) # ET = np.zeros(np.size(t)) # Resample pet and ppt to integration timestep ppt = np.array(rew.ppt[start_date:stop_date].resample(resample_freq_hillslope).ffill()) pet = np.array(rew.pet[start_date:stop_date].resample(resample_freq_hillslope).ffill()) # Solve group hillslope for l in range(len(t)): rew.vz.update(dt,**{'ppt':ppt[l],'pet':pet[l]}) # storageVZ[l] = rew.vz.storageVZ leakage[l] = rew.vz.leakage # ET[l] = rew.vz.ET rew.gz.update(dt,**{'leakage':leakage[l]}) # storageGZ[l] = rew.gz.storageGZ discharge[l] = rew.gz.discharge # resample as daily data solved_groups[group_id] = pd.DataFrame({'discharge':discharge}, index=timestamps_hillslope).resample('D').mean() total_area = 0 for rew_id in ids_in_subwatershed: total_area += rew_config[rew_id]['area_sqkm'] name = str(i) + 'discharge' solved_subwatershed = pd.DataFrame({name:np.zeros(len(timestamps_hillslope))}, index=timestamps_hillslope).resample('D').mean() solved_subwatershed_array = np.zeros(int(len(solved_subwatershed))) for rew_id in ids_in_subwatershed: solved_subwatershed_array += rew_config[rew_id]['area_sqkm']/total_area*solved_groups[rew_config[rew_id]['group']]['discharge'] solved_subwatershed[name] = solved_subwatershed_array objs.append(objective_function(solved_subwatershed[name][spinup_date:stop_date],calibration_data['runoff'][spinup_date:stop_date])) if minimize_objective_function: if objs[i]<objs_curr: objs_curr = objs[i] best_index = i best_fit = solved_subwatershed[name].copy() print('Min objective function value so far is: ' + str(objs_curr)) else: if objs[i]>objs_curr: objs_curr = objs[i] best_index = i best_fit = solved_subwatershed[name].copy() return (best_fit, objs_curr, best_index)
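# Example (not in the original source): one possible objective_function that
# could be handed to calibrate(). Nash-Sutcliffe efficiency is used here purely
# as an illustration; the source does not specify which objective is passed in.
import numpy as np

def nse(modeled, observed):
    modeled = np.asarray(modeled, dtype=float)
    observed = np.asarray(observed, dtype=float)
    return 1.0 - np.sum((observed - modeled) ** 2) / np.sum(
        (observed - observed.mean()) ** 2)

# NSE is maximized, so it would be used with minimize_objective_function=False.
print(nse([1.0, 2.0, 3.0], [1.1, 1.9, 3.2]))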
import time import re import pickle import os from collections import defaultdict,OrderedDict import multiprocessing import collections import json from tqdm import tnrange, tqdm_notebook from tqdm import tqdm, tqdm_pandas tqdm_notebook().pandas() import numpy as np import pandas as pd pd.options.display.max_rows = 25 pd.options.display.max_columns = 999 #from datetime import datetime, timedelta, timezone import keras from IPython.display import display from keras.models import Sequential, Model from keras.layers import Input, Dense, LSTM, Dropout, Activation, GRU, Embedding from keras.layers import concatenate as Concatenate from keras.layers.core import Flatten, Reshape from keras.layers.convolutional import * from keras.layers.pooling import * from keras.layers.normalization import BatchNormalization from keras.layers.noise import GaussianDropout
df_test = df_test.iloc[test_idx].copy() del df_ratings # In[14]: df_test.iloc[1]["comment"], [ mapping.get(x, UNK) for x in df_test.iloc[1]["comment"].split(" ") ] # In[15]: results = [] tokens_train, tokens_val, tokens_test = [], [], [] for df, tokens in zip((df_train, df_val, df_test), (tokens_train, tokens_val, tokens_test)): for i, row in tqdm_notebook(df.iterrows(), total=df.shape[0]): tokens.append( np.array([BEG] + [mapping.get(x, UNK) for x in row["comment"].split(" ")])) # In[16]: assert len(tokens_train) == df_train.shape[0] # In[74]: tokens_val[0] # In[75]: df_val.iloc[0]
def run(data_dir: str = './env/data', vae_dir: str = './vae/model', mdnrnn_dir: str = './mdnrnn/model', epochs: int = 20) -> None: """ Train mdnrnn using saved environment rollouts. Parameters ---------- data_dir Directory with train and test data. vae_dir Directory to load VAE model from. mdnrnn_dir Directory to optionally load MDNRNN model from and save trained model to. epochs Number of training epochs. """ # set random seed and deterministic backend SEED = 123 np.random.seed(SEED) torch.manual_seed(SEED) torch.cuda.manual_seed(SEED) torch.backends.cudnn.deterministic = True # use GPU if available cuda = torch.cuda.is_available() device = torch.device("cuda" if cuda else "cpu") # define input transformations transform_train = transforms.Compose([ transforms.ToPILImage(), transforms.Resize((H, W)), transforms.ToTensor(), ]) transform_test = transforms.Compose([ transforms.ToPILImage(), transforms.Resize((H, W)), transforms.ToTensor(), ]) # define train and test datasets dir_train = os.path.join(data_dir, 'train/') dir_test = os.path.join(data_dir, 'test/') dataset_train = GymDataset(dir_train, seq_len=SEQ_LEN, transform=transform_train) dataset_test = GymDataset(dir_test, seq_len=SEQ_LEN, transform=transform_test) dataset_test.load_batch(0) # 1 batch of data used for test set dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn) # define and load VAE model vae = VAE(CHANNELS, LATENT_SIZE) load_vae_file = os.path.join(vae_dir, 'best.tar') state_vae = torch.load(load_vae_file) vae.load_state_dict(state_vae['state_dict']) vae.to(device) # set save and optional load directories for the MDNRNN model load_mdnrnn_file = os.path.join(mdnrnn_dir, 'best.tar') try: state_mdnrnn = torch.load(load_mdnrnn_file) except FileNotFoundError: state_mdnrnn = None # define and load MDNRNN model mdnrnn = MDNRNN(LATENT_SIZE, ACTION_SIZE, HIDDEN_SIZE, N_GAUSS, rewards_terminal=False) if state_mdnrnn is not None: mdnrnn.load_state_dict(state_mdnrnn['state_dict']) mdnrnn.zero_grad() mdnrnn.to(device) # optimizer params = [p for p in mdnrnn.parameters() if p.requires_grad] optimizer = RMSprop(params, lr=LR, alpha=.9) if state_mdnrnn is not None: optimizer.load_state_dict(state_mdnrnn['optimizer']) # learning rate scheduling lr_scheduler = StepLR(optimizer, step_size=3, gamma=0.1) if state_mdnrnn is not None: lr_scheduler.load_state_dict(state_mdnrnn['scheduler']) # helper function def img2latent(obs, batch_size): """ Function to go from image to latent space. """ with torch.no_grad(): obs = obs.view(-1, CHANNELS, H, W) _, mu, logsigma = vae(obs) latent = (mu + logsigma.exp() * torch.randn_like(mu)).view( batch_size, SEQ_LEN, LATENT_SIZE) return latent # define test fn def test(): """ One test epoch """ mdnrnn.eval() test_loss = 0 n_test = len(dataloader_test.dataset) with torch.no_grad(): for (obs, action, next_obs) in generate_obs(dataloader_test): batch_size = len(obs) # place on device try: obs = torch.stack(obs).to(device) next_obs = torch.stack(next_obs).to(device) action = torch.stack(action).to(device) except: print( 'Did not manage to stack test observations and actions.' 
) n_test -= batch_size continue # convert to latent space latent_obs = img2latent(obs, batch_size) next_latent_obs = img2latent(next_obs, batch_size) # need to flip dims to feed into LSTM from [batch, seq_len, dim] to [seq_len, batch, dim] latent_obs, action, next_latent_obs = [ arr.transpose(1, 0) for arr in [latent_obs, action, next_latent_obs] ] # forward pass model mus, sigmas, logpi = mdnrnn(action, latent_obs) # compute loss loss = gmm_loss(next_latent_obs, mus, sigmas, logpi) test_loss += loss.item() test_loss /= n_test return test_loss # train n_batch_train = len(dataset_train.batch_list) optimizer.zero_grad() cur_best = None tq_episode = tqdm_notebook(range(epochs)) for epoch in tq_episode: mdnrnn.train() loss_train = 0 n_batch = 0 tq_batch = tqdm_notebook(range(n_batch_train)) for i in tq_batch: # loop over training data for each epoch dataset_train.load_batch(i) dataloader_train = torch.utils.data.DataLoader( dataset_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn) tq_minibatch = tqdm_notebook(generate_obs(dataloader_train), total=len(dataloader_train), leave=False) for j, (obs, action, next_obs) in enumerate(tq_minibatch): n_batch += 1 # place on device batch_size = len(obs) try: obs = torch.stack(obs).to(device) next_obs = torch.stack(next_obs).to(device) action = torch.stack(action).to(device) except: print('Did not manage to stack observations and actions.') continue # convert to latent space latent_obs = img2latent(obs, batch_size) next_latent_obs = img2latent(next_obs, batch_size) # need to flip dims to feed into LSTM from [batch, seq_len, dim] to [seq_len, batch, dim] latent_obs, action, next_latent_obs = [ arr.transpose(1, 0) for arr in [latent_obs, action, next_latent_obs] ] # forward pass model mus, sigmas, logpi = mdnrnn(action, latent_obs) # compute loss loss = gmm_loss(next_latent_obs, mus, sigmas, logpi) # backward pass loss.backward() # store loss value loss_train += loss.item() loss_train_avg = loss_train / (n_batch * BATCH_SIZE) # apply gradients and learning rate scheduling with optional gradient accumulation if (j + 1) % GRAD_ACCUMULATION_STEPS == 0: optimizer.step() optimizer.zero_grad() tq_minibatch.set_postfix(loss_train=loss_train_avg) tq_batch.set_postfix(loss_train=loss_train_avg) lr_scheduler.step() # evaluate on test set loss_test_avg = test() # checkpointing best_filename = os.path.join(mdnrnn_dir, 'best.tar') filename = os.path.join(mdnrnn_dir, 'checkpoint.tar') is_best = not cur_best or loss_test_avg < cur_best if is_best: cur_best = loss_test_avg save_checkpoint( { 'epoch': epoch, 'state_dict': mdnrnn.state_dict(), 'precision': loss_test_avg, 'optimizer': optimizer.state_dict(), 'scheduler': lr_scheduler.state_dict() }, is_best, filename, best_filename) tq_episode.set_postfix(loss_train=loss_train_avg, loss_test=loss_test_avg)
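# Example (not in the original source): a self-contained sketch of the gradient
# accumulation pattern used in the MDNRNN training loop above (accumulate
# gradients over several minibatches, then apply one optimizer step).
import torch
import torch.nn as nn
from torch.optim import RMSprop

ACC_STEPS = 4                                   # stands in for GRAD_ACCUMULATION_STEPS
model = nn.Linear(8, 1)
optimizer = RMSprop(model.parameters(), lr=1e-3, alpha=0.9)
optimizer.zero_grad()

for j in range(16):                             # 16 dummy minibatches
    x = torch.randn(32, 8)
    y = torch.randn(32, 1)
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()                             # gradients accumulate in .grad
    if (j + 1) % ACC_STEPS == 0:
        optimizer.step()                        # apply the accumulated gradients
        optimizer.zero_grad()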
def on_train_begin(self, logs): self.progbar = tqdm_notebook(desc='', total=self.params['nb_steps'], leave=True, mininterval=0.5) self.train_start = timeit.default_timer() self.metrics_names = self.model.metrics_names print('Training for {} steps ...'.format(self.params['nb_steps']))
# ## BioRxiv -> Doc Embeddings biorxiv_xpath_str = "//abstract/p|//abstract/title|//body/sec//p|//body/sec//title" word_model = Word2Vec.load( str( Path( "../word_vector_experiment/output/word2vec_models/300/biorxiv_300.model" ))) biorxiv_document_map = { document: generate_doc_vector( word_model, document_path=str(Path("output/biorxiv_xml_files") / document), xpath=biorxiv_xpath_str, ) for document in tqdm_notebook(biorxiv_documents) } # + biorxiv_vec_df = (pd.DataFrame.from_dict( biorxiv_document_map, orient="index").rename( columns={col: f"feat_{col}" for col in range(int(300))}).rename_axis( "document").reset_index()) biorxiv_vec_df.to_csv("output/polka_et_al_biorxiv_embeddings.tsv", sep="\t", index=False) biorxiv_vec_df.head().T # -
from sentence where entity_types::text like '%%Gene%%' or entity_types::text like '%%Disease%%';
'''
sentence_df = pd.read_sql(sql, database_str)
sentence_df.head(2)


# In[8]:


entity_data = []
tagging_error_ids = set({})

# skip tagging errors
skip_tag_error = False

for index, row in tqdm_notebook(sentence_df.iterrows()):

    # create a dictionary for mapping entity types
    entity_mapper = {"sentence_id": row['sentence_id']}

    # keep track of the previous entity
    previous_entity = 'o'

    # for all entities in a given sentence decide what is tagged
    for entity in row['entity_types']:
        entity = entity.lower()

        # non-O tag
        if entity != 'o' and previous_entity == 'o':

            # if the entity has not been seen before, instantiate it
            if entity not in entity_mapper:
] f3list = [ 'Census_ProcessorCoreCount', 'Census_OEMNameIdentifier', 'CityIdentifier' ] f4list = [ 'GeoNameIdentifier', 'Census_OEMNameIdentifier', 'Census_OSBuildRevision' ] f5list = [ 'Census_OEMModelIdentifier', 'CityIdentifier', 'Census_FirmwareVersionIdentifier' ] flist = [f1list, f2list, f3list, f4list, f5list] for i in tqdm_notebook(range(5)): temp = all_data.groupby(flist[i]).size().reset_index(name='counts' + str(i)) all_data = pd.merge(all_data, temp, how='left', left_on=flist[i], right_on=flist[i]) col = 'counts' + str(i) all_data[col] = all_data[col].astype('int32') train = all_data[:train_shape[0]] test = all_data[train_shape[0]:] del all_data, temp gc.collect() cols = train.columns.tolist()
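# Example (not in the original source): the groupby-size "count encoding"
# pattern used in the loop above, shown on a tiny toy frame.
import pandas as pd

toy = pd.DataFrame({
    'Census_ProcessorCoreCount': [4, 4, 8, 4],
    'CityIdentifier': [1, 1, 2, 3],
})
counts = toy.groupby(['Census_ProcessorCoreCount', 'CityIdentifier']) \
            .size().reset_index(name='counts0')
toy = toy.merge(counts, how='left',
                on=['Census_ProcessorCoreCount', 'CityIdentifier'])
print(toy)
# rows sharing the same (core count, city) pair get counts0 == 2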
def worker(args): df, seg = args n_sample = len(df) output = [] for i, sample in tqdm_notebook(df.iterrows()): filename, ebird_code, duration = sample[[ 'filename', 'ebird_code', 'duration' ]] path_folder = sample['folder'] path_audio = os.path.join(path_folder, ebird_code, filename) try: signal, _ = librosa.load(path_audio, sr=sr, mono=True, res_type='kaiser_fast') except: print('file {} corrupted.'.format(filename)) continue signal = librosa.effects.trim(signal)[0] len_signal = len(signal) max_attemp = 100 cnt_attemp = 0 max_snr = -1 tmp_spec = None tmp_idx = None while cnt_attemp < max_attemp: cnt_attemp += 1 chunk = np.zeros(len_frame) if len_signal > len_frame: i_start = np.random.randint(len_signal - len_frame) chunk[:] = signal[i_start:i_start + len_frame] elif len_signal < len_frame: i_start = np.random.randint(len_frame - len_signal) chunk[i_start:i_start + len_signal] = signal else: chunk[:] = signal mel_spec = melspectrogram(chunk, sr, mel_filterbank, **paras_mel_spectrogram) mel_spec = librosa.power_to_db(mel_spec) mel_spec = to_image(mel_spec) snr = signal_noise_ratio(mel_spec) if (snr > snr_threshold) & (cnt_attemp < max_attemp): tmp_chunk = chunk break elif snr > max_snr: tmp_chunk = chunk max_snr = snr chunk = add_noise(chunk) mel_spec = melspectrogram(chunk, sr, mel_filterbank, **paras_mel_spectrogram) mel_spec = librosa.power_to_db(mel_spec) mel_spec = to_image(mel_spec) output.append((mel_spec * 255).astype(np.uint8)) gc.collect() output = np.array(output) np.save('spectrogram{}.npy'.format(seg), output) print('segment {} complete'.format(seg))
def __init__(self, **kwargs): self.bar = tqdm_notebook(**kwargs)
# Preprocessing libraries import nltk import csv from nltk.corpus import stopwords from nltk.tokenize import word_tokenize from nltk.stem import PorterStemmer import pandas as pd import os.path from tqdm import tqdm_notebook #Search Engine 1 dir_path = r"C:\Users\loren\Downloads\HW3\TSVFile" # Step 1 concatenates various path components for i in tqdm_notebook(range (len(os.listdir(dir_path)))): filename = os.path.join(dir_path, "article_{}.tsv".format(i)) df = pd.read_csv(filename, sep='\t', encoding = 'utf-8') # Creating a dataframe for each movie doc = 'article_{}.tsv'.format(i) col = [] col, message = information(df) if message == 'Continue': continue elif message == 'Pass': pass # Step 2 Taking all the info to_tokenize = col[0]+col[1]+col[2]+col[3]+col[4]+col[5]+col[6]+col[7]+col[8]+col[9]+col[10]+col[11]+col[12]+col[13]
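# Example (not in the original source): a sketch of how the nltk tools imported
# above are typically combined on the concatenated text, assuming the usual
# tokenize -> lowercase -> drop stopwords/punctuation -> stem pipeline and that
# the nltk 'punkt' and 'stopwords' data have been downloaded.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    return [stemmer.stem(t) for t in tokens
            if t.isalpha() and t not in stop_words]

print(preprocess("The Matrix is a 1999 science fiction action film."))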