def cache_download(url, filename=None): """ return downloaded filepath """ # check cache if not filename: filename = os.path.basename(url) storepath = os.path.join(appdir, hashlib.sha224(url.encode()).hexdigest(), filename) storedir = os.path.dirname(storepath) if not os.path.isdir(storedir): os.makedirs(storedir) if os.path.exists(storepath) and os.path.getsize(storepath) > 0: return storepath # download from url r = requests.get(url, stream=True) if r.status_code != 200: raise Exception(url, "status code", r.status_code) file_size = int(r.headers.get("Content-Length")) bar = DownloadBar(filename, max=file_size) with open(storepath + '.tmp', 'wb') as f: chunk_length = 16 * 1024 while 1: buf = r.raw.read(chunk_length) if not buf: break f.write(buf) bar.next(len(buf)) bar.finish() shutil.move(storepath + '.tmp', storepath) return storepath
def load_img_dataset(my_dir, downscaling, color):
    # Count files in directory
    data_files = [f for f in os.listdir(my_dir) if (f[0:5] == 'shape')]
    data_files = sorted(data_files)
    n_imgs = len(data_files)
    print('I found {} images'.format(n_imgs))

    # Check size of first image
    img = get_img(my_dir + '/' + data_files[0])
    height = img.shape[0]
    width = img.shape[1]

    # Declare n_channels
    if (color == 'bw'):
        n_channels = 1
    if (color == 'rgb'):
        n_channels = 3

    # Compute downscaling and allocate array
    height = math.floor(height / downscaling)
    width = math.floor(width / downscaling)
    imgs = np.zeros([n_imgs, height, width, n_channels])

    # Load all images
    bar = progress.bar.Bar('Loading imgs ', max=n_imgs)
    for i in range(0, n_imgs):
        imgs[i, :, :, :] = load_and_reshape_img(my_dir + '/' + data_files[i],
                                                height, width, color)
        bar.next()
    bar.finish()

    return imgs, n_imgs, height, width, n_channels
def run(self): """ Desc: run ekf Input(s): none Output(s): none """ t_odom_prev = 0.0 # initialize previous odom time # setup progress bar print("running kalman filter, please wait...") bar = progress.bar.IncrementalBar('Progress:', max=len(self.times)) for tt, timestep in enumerate(self.times): # predict step for odometry if self.odom_df['seconds of week [s]'].isin([timestep]).any(): dt_odom = timestep - t_odom_prev t_odom_prev = timestep if not self.initialized_odom: self.initialized_odom = True bar.next() else: odom_timestep = self.odom_df[self.odom_df['seconds of week [s]'] == timestep] odom_vel_x = odom_timestep['ECEF_vel_x'].values[0] odom_vel_y = odom_timestep['ECEF_vel_y'].values[0] odom_vel_z = odom_timestep['ECEF_vel_z'].values[0] self.predict_imu(np.array([[odom_vel_x,odom_vel_y,odom_vel_z]]).T,dt_odom) # update gnss step if self.sat_df['seconds of week [s]'].isin([timestep]).any(): sat_timestep = self.sat_df[self.sat_df['seconds of week [s]'] == timestep] if 'pr [m]' in self.sat_df.columns: pranges = sat_timestep['pr [m]'].to_numpy().reshape(-1,1) sat_x = sat_timestep['sat x ECEF [m]'].to_numpy().reshape(-1,1) sat_y = sat_timestep['sat y ECEF [m]'].to_numpy().reshape(-1,1) sat_z = sat_timestep['sat z ECEF [m]'].to_numpy().reshape(-1,1) sigmas = sat_timestep['Pr_sigma'].to_numpy().reshape(-1,1) time_correction = sat_timestep['idk wtf this is'].to_numpy().reshape(-1,1) self.update_gnss_raw(pranges,sat_x,sat_y,sat_z,sigmas,time_correction) else: lat_t = sat_timestep['Latitude'].to_numpy()[0] lon_t = sat_timestep['Longitude'].to_numpy()[0] alt_t = sat_timestep['Altitude'].to_numpy()[0] self.update_gnss(lat_t,lon_t,alt_t) # add values to history self.mu_history = np.hstack((self.mu_history,self.mu)) self.P_history.append(np.trace(self.P)) bar.next() # progress bar bar.finish() # end progress bar # if finish with different num of items, spoof it. if len(self.times) + 1 == self.mu_history.shape[1]: self.mu_history = self.mu_history[:,:-1] self.P_history = self.P_history[:-1]
def update_title_progress(tags_dictionary):
    count = len(tags_dictionary)
    bar = progress.bar.FillingSquaresBar(
        'Processing', max=count,
        suffix='%(index)d/%(max)d - %(percent).1f%%')
    for tag in range(count):
        update_title(tags_dictionary)
        count -= 1
        bar.next()
    bar.finish()
def send_emails(missed_students, teacher_objects, sacs_address):
    """Function that generates emails to each teacher who has students listed
    in their una attribute."""
    date_today = input(
        "Enter the date for attendance data uploaded:\nDate: ").strip()
    authenticated = True
    while authenticated:
        try:
            # try entering a correct username and password; will loop until
            # the user chooses to quit or is able to authenticate
            username = input("Username: ")
            password = input("Password: ")
            # ... (the SMTP login and per-teacher sending loop are redacted in
            # the source as "******"; the redacted code creates `smtpObj`,
            # builds a "Sending..." progress bar `bar`, and advances it per
            # email) ...
            # count += 1
            bar.finish()
            smtpObj.quit()
            for student in missed_students:
                print("Line {}. No teacher email on file for {} {} {}.".format(
                    student[3], student[0], student[1], student[2]))
            if missed_students != []:
                print(
                    "\nBe sure to follow up with this teacher or these teachers individually."
                )
            authenticated = False
            toc = time.time()  # end time for program execution
            print(
                "Program execution time:", round(toc - tic, 0), "seconds"
            )  # print the time taken to send all emails, rounded to the nearest second
        except smtplib.SMTPAuthenticationError:
            print("Looks like your username or password was incorrect.")
            smtpObj.quit()
def toCSV(self, filepath):
    with open(filepath, 'w', newline='') as fobj:
        writer = csv.writer(fobj)
        nodes = self.traverse(mode='in')
        bar = progress.bar.Bar('Exporting tree to CSV', max=len(nodes))
        for node in nodes:
            writer.writerow(node.data)
            bar.next()
        bar.finish()
def export_all(subjects, folder):
    assert folder[:1] == "/"
    logging.info("Exporting subjects...")
    suffix_format = "%(index)d/%(max)d [%(elapsed_td)s / %(eta_td)s]"
    bar = progress.bar.Bar("Exporting subjects", max=len(subjects),
                           suffix=suffix_format)
    for subject in subjects:
        export_subject(subject, folder)
        bar.next()
    bar.finish()
def fromJSON(self, filepath):
    with open(filepath, 'r') as fil:
        data = json.load(fil)
    bar = progress.bar.Bar('Inserting JSON', max=len(data))
    for key, value in data.items():
        cargo = (key, value)
        self.insert(data=cargo)
        bar.next()
    bar.finish()
def main(stack_name):
    client = docker.from_env()

    # Docker Python API doesn't seem to support listing services and stacks,
    # so we have to do some nasty shell parsing
    try:
        output = subprocess.check_output(
            ['/usr/bin/docker', 'stack', 'list', '--format', '{{.Name}}'])
    except subprocess.CalledProcessError as exc:
        # `output` is not bound if check_output raises, so report the
        # exception's captured output instead
        print("Cannot list stacks")
        print(exc.output)
        exit(1)

    if stack_name not in output.decode('utf-8').split('\n'):
        print("Stack {} not found".format(stack_name))

    try:
        output = subprocess.check_output([
            '/usr/bin/docker', 'stack', 'services', stack_name, '--format',
            '{{.Name}}'
        ])
    except subprocess.CalledProcessError as exc:
        print("Cannot find services for stack {}".format(stack_name))
        print(exc.output)
        exit(1)

    service_names = output.decode('utf-8').split('\n')
    services = [
        Service(x) for x in client.services.list() if x.name in service_names
    ]

    bar = progress.bar.Bar("Deploying", max=len(services))
    while True:
        for s in services:
            s.update()
        bar.index = len(list(filter(lambda x: x.is_complete(), services)))
        bar.update()
        if all([s.is_complete() for s in services]):
            bar.finish()
            for s in services:
                print("{} - {}".format(s.name, s.get_state()))
            if all([s.success for s in services]):
                exit(0)
            else:
                exit(1)
        time.sleep(1)
def download(url, target):
    print("Download", target)
    r = requests.get(url, stream=True)
    r.raise_for_status()
    bar = progress.bar.Bar()
    bar.max = int(r.headers.get("content-length"))
    with open(target, "wb") as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)
            bar.next(len(chunk))
    bar.finish()
def download(url: str, storepath: str):
    r = requests.get(url, stream=True)
    r.raise_for_status()
    file_size = int(r.headers.get("Content-Length"))
    bar = DownloadBar(storepath, max=file_size)
    chunk_length = 16 * 1024
    with open(storepath + '.part', 'wb') as f:
        for buf in r.iter_content(chunk_length):
            f.write(buf)
            bar.next(len(buf))
        bar.finish()
    shutil.move(storepath + '.part', storepath)
def checkRecall(UPC):
    '''
    Parses CFIA recalls for UPC. Returns recall link if found, else none.
    '''
    cfia_url = 'https://www.inspection.gc.ca/food-recall-warnings-and-allergy-alerts/eng/1351519587174/1351519588221'
    soup = bs4.BeautifulSoup(urllib.request.urlopen(cfia_url), 'html.parser')
    table = soup.find('tbody')

    recall_urls = []
    rows = table.findChildren('tr')
    for row in rows:
        recall_url = row.find('a')['href']
        recall_url = 'https://www.inspection.gc.ca/' + recall_url
        recall_urls.append(recall_url)

    bar = progress.bar.Bar('Searching CFIA', max=len(recall_urls))
    for recall_url in recall_urls:
        soup = bs4.BeautifulSoup(urllib.request.urlopen(recall_url),
                                 'html.parser')
        table = soup.find('table', attrs={
            'class': 'table table-bordered table-condensed'
        }).find('tbody')
        rows = table.findChildren('tr')
        for row in rows:
            if row.findChildren('th'):  # check if bolded first row header is found
                col = 2
            else:
                col = 3
            UPC_recall = row.findChildren('td')[col].text.strip().replace(
                u'\xa0', '').replace(' ', '')
            if UPC == UPC_recall:
                bar.finish()
                return recall_url
            elif 'Startswith' in UPC_recall:
                UPC_recall_trim = UPC_recall[10:]
                if UPC_recall_trim in UPC:
                    bar.finish()
                    return recall_url
            elif 'Noneor' in UPC_recall:
                UPC_recall = UPC_recall[6:]
                if UPC == UPC_recall:
                    bar.finish()
                    return recall_url
        bar.next()
    bar.finish()
    return None
def run(self): """ Desc: run ekf Input(s): none Output(s): none """ t_odom_prev = 0.0 # initialize previous odom time # setup progress bar print("running kalman filter, please wait...") bar = progress.bar.IncrementalBar('Progress:', max=len(self.times)) for tt, timestep in enumerate(self.times): # predict step for odometry # if self.odom_df['seconds of week [s]'].isin([timestep]).any(): # dt_odom = timestep - t_odom_prev # t_odom_prev = timestep # if tt == 0: # continue # odom_timestep = self.odom_df[self.odom_df['seconds of week [s]'] == timestep] # odom_vel_x = odom_timestep['ECEF_vel_x'].values[0] # odom_vel_y = odom_timestep['ECEF_vel_y'].values[0] # odom_vel_z = odom_timestep['ECEF_vel_z'].values[0] # self.predict_imu(np.array([[odom_vel_x,odom_vel_y,odom_vel_z]]).T,dt_odom) # update gnss step if self.sat_df['seconds of week [s]'].isin([timestep]).any(): sat_timestep = self.sat_df[self.sat_df['seconds of week [s]'] == timestep] pranges = sat_timestep['pr [m]'].to_numpy().reshape(-1,1) sat_x = sat_timestep['sat x ECEF [m]'].to_numpy().reshape(-1,1) sat_y = sat_timestep['sat y ECEF [m]'].to_numpy().reshape(-1,1) sat_z = sat_timestep['sat z ECEF [m]'].to_numpy().reshape(-1,1) self.predict_simple() self.update_gnss(pranges,sat_x,sat_y,sat_z) # add values to history self.mu_history = np.hstack((self.mu_history,self.mu)) self.P_history.append(np.trace(self.P)) bar.next() # progress bar bar.finish() # end progress bar self.mu_history = self.mu_history[:,:-1] self.mu_history[0,:] += self.x0 self.mu_history[1,:] += self.y0 self.mu_history[2,:] += self.z0 self.P_history = self.P_history[:-1]
def handle(self, *args, **options):
    prefix = options['prefix'][0]
    skip_hidden_dir = True
    l = ManagedFile.objects.all()
    bar = SlowBar(max=l.count())
    for file in l:
        bar.next()
        try:
            if not file.isTracked and file.names.count() < 1:
                file.delete()
        except:
            print("Skipped file id={}, names={}".format(
                file.id, list(file.names.all())))
    bar.finish()
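# `SlowBar` is used by this management command and the similar one below but
# is not defined in these snippets. A minimal sketch, assuming it is just a
# progress.bar.Bar subclass tuned for long-running jobs (the name of the
# suffix format here is a guess, not the original implementation):

import progress.bar


class SlowBar(progress.bar.Bar):
    # show position in the queryset plus a rough remaining-time estimate
    suffix = '%(index)d/%(max)d (%(percent)d%%) eta %(eta_td)s'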
def load_drag_lift_dataset(my_dir, n_outputs):
    sol_files = sorted(
        [f for f in os.listdir(my_dir) if f.startswith('shape')])
    n_sols = len(sol_files)
    sols = np.zeros([n_sols, n_outputs])

    bar = progress.bar.Bar('Loading labels', max=n_sols)
    for i in range(0, n_sols):
        y = np.loadtxt(my_dir + '/' + sol_files[i], skiprows=1)
        if (n_outputs == 1):
            sols[i, 0] = y[y.shape[0] - 1, 1]
        if (n_outputs == 2):
            sols[i, 0:2] = y[y.shape[0] - 1, 1:3]
        bar.next()
    bar.finish()

    return sols, n_sols
def cache_download(url, filename=None, timeout=None, storepath=None, logger=logger):
    """ return downloaded filepath """
    # check cache
    if not filename:
        filename = os.path.basename(url)
    if not storepath:
        storepath = gen_cachepath(url)
    storedir = os.path.dirname(storepath)
    if not os.path.isdir(storedir):
        os.makedirs(storedir)
    if os.path.exists(storepath) and os.path.getsize(storepath) > 0:
        logger.debug("Use cached assets: %s", storepath)
        return storepath

    logger.debug("Download %s", url)
    # download from url
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Origin': 'https://github.com',
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }  # yapf: disable
    # pass the caller's timeout through instead of hard-coding None
    r = requests.get(url, stream=True, headers=headers, timeout=timeout)
    r.raise_for_status()
    file_size = int(r.headers.get("Content-Length"))
    bar = DownloadBar(filename, max=file_size)
    with open(storepath + '.part', 'wb') as f:
        chunk_length = 16 * 1024
        while 1:
            buf = r.raw.read(chunk_length)
            if not buf:
                break
            f.write(buf)
            bar.next(len(buf))
        bar.finish()
    assert file_size == os.path.getsize(storepath + ".part")
    # may raise FileNotFoundError
    shutil.move(storepath + '.part', storepath)
    return storepath
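# `gen_cachepath` is not shown in these snippets. Judging from the other
# cache_download variants in this collection, which build the path as
# appdir/<sha224(url)>/<basename>, a plausible sketch (an assumption, not the
# original helper) would be:

import hashlib
import os


def gen_cachepath(url: str) -> str:
    # derive a stable per-URL cache location from a hash of the URL
    return os.path.join(appdir,
                        hashlib.sha224(url.encode()).hexdigest(),
                        os.path.basename(url))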
def mergeCSV(inpath, outpath):
    filenames = os.listdir(inpath)
    data = []
    bar = progress.bar.Bar('Merging CSV files', max=len(filenames))
    for filename in filenames:
        with open(inpath + filename, 'r') as fobj:
            reader = csv.reader(fobj)
            data.extend([row for row in reader])
        bar.next()
    bar.finish()

    # export
    with open(outpath, 'w', newline='') as fobj:
        writer = csv.writer(fobj)
        writer.writerows(data)
async def map(cls, data, *, concurrency: int, label: str):
    pool = cls(data, concurrency=concurrency)
    pool._start()
    bar = progress.bar.Bar(label[:15].ljust(15), max=len(data))
    stop_cnt = 0
    while True:
        piece = await pool._results.get()
        if piece is cls._STOP:
            stop_cnt += 1
            if stop_cnt == concurrency:
                bar.finish()
                return
        elif isinstance(piece, Exception):
            raise piece
        else:
            bar.next()
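# The function above reads like a @classmethod on a small worker-pool class
# whose decorator and class body are not shown here. A hedged usage sketch,
# assuming a hypothetical class `Pool` that exposes it as `Pool.map`:

import asyncio


async def crawl(urls):
    # drive the pool with 4 concurrent workers and a labelled progress bar;
    # the bar advances once per finished item and finishes when every worker
    # has reported its _STOP sentinel
    await Pool.map(urls, concurrency=4, label="Downloading")

# asyncio.run(crawl(["https://example.com/a", "https://example.com/b"]))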
def handle(self, *args, **options):
    prefix = options['prefix'][0]
    skip_hidden_dir = True
    crit = Q(size=None)
    l = ManagedFile.objects.filter(crit)
    bar = SlowBar(max=l.count())
    for file in l:
        bar.next()
        try:
            stat_t = file.robust_stat()
            file.size = stat_t[stat.ST_SIZE]
            # print("{} size={}".format(, file.size))
            file.save()
        except:
            print("Skipped file id={}, names={}".format(
                file.id, list(file.names.all())))
    bar.finish()
def cache_download(url, filename=None):
    """ return downloaded filepath """
    # check cache
    if not filename:
        filename = os.path.basename(url)
    storepath = os.path.join(appdir,
                             hashlib.sha224(url.encode()).hexdigest(),
                             filename)
    storedir = os.path.dirname(storepath)
    if not os.path.isdir(storedir):
        os.makedirs(storedir)
    if os.path.exists(storepath) and os.path.getsize(storepath) > 0:
        return storepath

    # download from url
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Origin': 'https://github.com',
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    r = requests.get(url, stream=True, headers=headers)
    if r.status_code != 200:
        raise Exception(url, "status code", r.status_code)
    file_size = int(r.headers.get("Content-Length"))
    bar = DownloadBar(filename, max=file_size)
    with open(storepath + '.tmp', 'wb') as f:
        chunk_length = 16 * 1024
        while 1:
            buf = r.raw.read(chunk_length)
            if not buf:
                break
            f.write(buf)
            bar.next(len(buf))
        bar.finish()
    shutil.move(storepath + '.tmp', storepath)
    return storepath
def download(arch: str, storepath: str):
    r = requests.get(
        "https://github.com/openatx/atx-agent/releases/download/{0}/atx-agent_{0}_linux_{1}.tar.gz"
        .format(__atx_agent_version__, arch),
        stream=True)
    r.raise_for_status()
    file_size = int(r.headers.get("Content-Length"))
    bar = DownloadBar(storepath, max=file_size)
    with open(storepath + '.tmp', 'wb') as f:
        chunk_length = 16 * 1024
        while 1:
            buf = r.raw.read(chunk_length)
            if not buf:
                break
            f.write(buf)
            bar.next(len(buf))
        bar.finish()
    shutil.move(storepath + '.tmp', storepath)
def run(self):
    """
    Desc: run ekf
    Input(s): none
    Output(s): none
    """
    t_odom_prev = 0.0  # initialize previous odom time

    # setup progress bar
    print("running kalman filter, please wait...")
    bar = progress.bar.IncrementalBar('Progress:', max=len(self.times))

    for tt, timestep in enumerate(self.times):
        # simple predict step
        self.predict_simple()

        # predict step for odometry
        if self.odom_df['seconds of week [s]'].isin([timestep]).any():
            dt_odom = timestep - t_odom_prev
            t_odom_prev = timestep
            if not self.initialized_odom:
                self.initialized_odom = True
                bar.next()
            else:
                odom_timestep = self.odom_df[self.odom_df['seconds of week [s]'] == timestep]
                baro_meas = odom_timestep['Normalized barometer:Raw[meters]'].values[0]
                self.update_barometer(baro_meas)

        self.mu = np.clip(self.mu, 0.0, np.inf)  # force to be above zero altitude

        # add values to history
        self.mu_history = np.hstack((self.mu_history, self.mu))
        self.P_history.append(np.trace(self.P))

        bar.next()  # progress bar
    bar.finish()  # end progress bar

    # if finish with different num of items, spoof it.
    if len(self.times) + 1 == self.mu_history.shape[1]:
        self.mu_history = self.mu_history[:, :-1]
        self.P_history = self.P_history[:-1]
def fromCSV(self, filepath, shuffle=True):
    start = time.time()  ### REmove AFTER TESTING
    with open(filepath) as fil:
        reader = csv.reader(fil)
        data = [row for row in reader]
    if shuffle:
        random.shuffle(data)
    # data = data[:1]  #### REMOVE AFTER TESTING
    bar = progress.bar.Bar('Inserting CSV "{}"'.format(filepath),
                           max=len(data))
    for row in data:
        self.insert(row)
        bar.next()
    bar.finish()
    end = time.time()
    elapsed = end - start
    print('ELAPSED: {}'.format(round(elapsed, 3)))
def run(self):
    """
    Desc: run ekf
    Input(s): none
    Output(s): none
    """
    # setup progress bar
    print("running kalman filter, please wait...")
    bar = progress.bar.IncrementalBar('Progress:', max=len(self.times))

    for tt, timestep in enumerate(self.times):
        # update gnss step
        if self.sat_df['seconds of week [s]'].isin([timestep]).any():
            sat_timestep = self.sat_df[self.sat_df['seconds of week [s]'] == timestep]
            pranges = sat_timestep['pr [m]'].to_numpy().reshape(-1, 1)
            sat_x = sat_timestep['sat x ECEF [m]'].to_numpy().reshape(-1, 1)
            sat_y = sat_timestep['sat y ECEF [m]'].to_numpy().reshape(-1, 1)
            sat_z = sat_timestep['sat z ECEF [m]'].to_numpy().reshape(-1, 1)
            sigmas = sat_timestep['Pr_sigma'].to_numpy().reshape(-1, 1)
            time_correction = sat_timestep['idk wtf this is'].to_numpy().reshape(-1, 1)
            self.predict_simple()
            self.update_gnss(pranges, sat_x, sat_y, sat_z, sigmas,
                             time_correction)

        # add values to history
        self.mu_history = np.hstack((self.mu_history, self.mu))
        self.P_history.append(np.trace(self.P))

        bar.next()  # progress bar
    bar.finish()  # end progress bar

    if len(self.times) + 1 == self.mu_history.shape[1]:
        self.mu_history = self.mu_history[:, :-1]
        self.P_history = self.P_history[:-1]
def main():
    args = parse_arguments()

    if sys.platform == "linux":
        host = "linux-x86_64"
    elif sys.platform == "darwin":
        host = "darwin-x86_64"
    else:
        print("Unsupported platform: {}".format(sys.platform))
        sys.exit(1)

    ndk_version = "r16b"
    ndk_package_base = "android-ndk-{}".format(ndk_version)
    ndk_package_archive = "{}-{}.zip".format(ndk_package_base, host)
    ndk_url = "https://dl.google.com/android/repository/{}".format(ndk_package_archive)
    ndk_download_path = "{}/{}".format(args.download_prefix, ndk_package_archive)
    ndk_tmp_unzip = "/tmp/android-ndk-unzip"
    ndk_unzip_path = "{}/android-ndk".format(args.download_prefix)

    if has_progress_bar:
        bar = ProgressBar("Downloading NDK")
        urllib.request.urlretrieve(ndk_url, ndk_download_path,
                                   bar.urllib_reporthook)
        bar.finish()
        bar = ProgressBar("Unzipping NDK")
        unzip(ndk_download_path, ndk_tmp_unzip, bar.unzip_reporthook)
        bar.finish()
    else:
        print("Downloading NDK...")
        urllib.request.urlretrieve(ndk_url, ndk_download_path)
        print("Done.")
        print("Unzipping NDK...")
        unzip(ndk_download_path, ndk_tmp_unzip)
        print("Done.")

    print("Moving ndk to {}".format(ndk_unzip_path))
    shutil.move(ndk_tmp_unzip + "/android-ndk-r16b", ndk_unzip_path)
    print("Done.")
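# The `ProgressBar` used above (with urllib_reporthook / unzip_reporthook) is
# not defined in these snippets. A minimal sketch of the urllib side, assuming
# it wraps progress.bar.Bar and adapts urlretrieve's
# (block_num, block_size, total_size) reporthook signature; the unzip hook
# would depend on the (unshown) `unzip` helper:

import progress.bar


class ProgressBar(progress.bar.Bar):
    def urllib_reporthook(self, block_num, block_size, total_size):
        # urlretrieve calls this after each block; total_size may be -1 when
        # the server does not send a Content-Length header
        if total_size > 0:
            self.max = total_size
            self.goto(min(block_num * block_size, total_size))
        else:
            self.next(block_size)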
def _wait_install_finished(self, id, installing_callback):
    bar = None
    downloaded = True

    while True:
        resp = self._reqsess.get(self.path2url('/install/' + id))
        resp.raise_for_status()
        jdata = resp.json()
        message = jdata['message']
        pg = jdata.get('progress')

        def notty_print_progress(pg):
            written = pg['copiedSize']
            total = pg['totalSize']
            print(
                time.strftime('%H:%M:%S'),
                'downloading %.1f%% [%s/%s]' %
                (100.0 * written / total if total != 0 else 0,
                 humanize.naturalsize(written, gnu=True),
                 humanize.naturalsize(total, gnu=True)))

        if message == 'downloading':
            downloaded = False
            if pg:  # if there is a progress
                if hasattr(sys.stdout, 'isatty'):
                    if sys.stdout.isatty():
                        if not bar:
                            bar = _ProgressBar(time.strftime('%H:%M:%S') +
                                               ' downloading',
                                               max=pg['totalSize'])
                        written = pg['copiedSize']
                        bar.next(written - bar.index)
                    else:
                        notty_print_progress(pg)
                else:
                    pass
            else:
                print(time.strftime('%H:%M:%S'), "download initializing")
        else:
            if not downloaded:
                downloaded = True
                if bar:  # bar only set in atty
                    bar.next(pg['copiedSize'] - bar.index) if pg else None
                    bar.finish()
                else:
                    print(time.strftime('%H:%M:%S'), "download 100%")
            print(time.strftime('%H:%M:%S'), message)

        if message == 'installing':
            if callable(installing_callback):
                installing_callback(self)

        if message == 'success installed':
            return jdata.get('packageName')

        if jdata.get('error'):
            raise RuntimeError("error", jdata.get('error'))

        try:
            time.sleep(1)
        except KeyboardInterrupt:
            bar.finish() if bar else None
            print("keyboard interrupt caught, cancel install id", id)
            self._reqsess.delete(self.path2url('/install/' + id))
            raise
def run(data_path, db_config, index1_table_name, index2_table_names, ske_config):
    start = datetime.datetime.now()

    log_manager.info_global("--------------------------------")
    log_manager.info_global(
        f"{start.strftime('[%y-%m-%d %H:%M:%S]')} START INDEXING\n")

    log_manager.info_global("Creating DB tables ...")
    create_tables(db_config, index1_table_name, index2_table_names)

    log_manager.info_global("Creating DataFrames from original CSV files ...")

    # 1. set up the keywords dataframe
    log_manager.debug_global("Creating DataFrame for keywords ...")
    keyword_df = read_keyword_df(data_path)

    # store the keywords df to the database
    log_manager.debug_global("Writing keywords DF to DB ...")
    write_df_to_db(
        keyword_df.drop(columns=['csv_tokens', 'csv_types'], inplace=False),
        index2_table_names['keywords'], db_config)

    # 2. set up the text token counts dataframe
    log_manager.debug_global("Creating DataFrame for token counts ...")
    token_df = pd.DataFrame()

    # for each keyword, we collect that keyword's token count per document
    bar = create_progress_bar('Calculating total of tokens per text',
                              keyword_df.shape[0])
    for kw in keyword_df.itertuples():
        # kw is a Pandas object representing the row

        # we find the token counts in the CSV file stored in the column
        # 'csv_tokens' of keyword_df
        temp_df = pd.read_csv(f'{data_path}/CSV/{kw.csv_tokens}',
                              sep='\t',
                              skiprows=8,
                              names=['docid', 'token', 'token_count'],
                              usecols=['docid', 'token_count'])

        # we need to group by doc id and sum all the token counts
        # for the various shapes of the token
        temp_df = temp_df.groupby(['docid'], as_index=False).sum()

        # add a column
        temp_df['keyword_id'] = kw.Index

        # 1st index: keyword_id, because this allows for fewer lookups
        # when calculating the scores
        temp_df = temp_df.set_index(['keyword_id', 'docid'],
                                    verify_integrity=True)

        # we append the rows to token_df
        token_df = token_df.append(temp_df, verify_integrity=True)

        bar.next()
    bar.finish()

    # Don't write token_df to the DB yet because it has a FK constraint to doc_df.

    # 3. set up the texts dataframe
    log_manager.debug_global("Creating DataFrame for texts ...")

    # we use this file only to get a complete list of doc ids
    doc_df = pd.read_csv(f'{data_path}/mara002_kvr_all.docids.counts.csv',
                         sep='\t',
                         names=['types_count', 'docid'],
                         usecols=['docid'])
    doc_df['score_rarity_diversity'] = 0.0
    doc_df['already_annotated'] = False
    doc_df['selected_on'] = None
    doc_df = doc_df.set_index('docid')

    # Calculate scores
    log_manager.debug_global("Calculating scores for texts ...")
    doc_df = score_rarity_diversity(doc_df, keyword_df, token_df)

    # Write doc_df to DB
    log_manager.debug_global("Writing DF for texts to DB ...")
    write_df_to_db(doc_df, index2_table_names['scores'], db_config)

    # Now we can write token_df to the DB.
    log_manager.debug_global("Writing DF for tokens to DB ...")
    write_df_to_db(token_df, index2_table_names['tokens'], db_config)

    # all done!
    end = datetime.datetime.now()
    log_manager.info_global(
        f"{end.strftime('[%y-%m-%d %H:%M:%S]')} DONE INDEXING, duration: {end-start}"
    )

    return  # TODO: Is this empty return on purpose?
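# `create_progress_bar` is a project helper that is not shown here. A minimal
# sketch, assuming it simply wraps progress.bar.Bar with a label and a total
# count (the original may configure a different bar class or suffix):

import progress.bar


def create_progress_bar(label, max_count):
    # label: text shown left of the bar; max_count: expected number of
    # bar.next() calls before bar.finish()
    return progress.bar.Bar(label, max=max_count)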
def score_rarity_diversity(doc_df, keyword_df, token_df):
    # This algorithm favors rare keywords over frequent keywords,
    # and many types over many tokens,
    # but also many tokens over few tokens.
    #
    # score(text) =
    #   sum for each keyword k:
    #     sum for n from 1 to the token count of k in text:
    #       (1/corpus token count of k) * (1/n)
    #
    # A keyword with a high token count in the corpus will yield a smaller
    # coefficient, and vice versa, thus favoring rarity.
    # A text t1 where keyword k appears n times will have a lower score
    # than a text t2 where k appears n+1 times, if t1 and t2 are otherwise
    # identical, thus favoring higher token counts.
    # A text t1 where keyword k1 appears n times and keyword k2 appears m times,
    # where k1 and k2 have the same corpus token count, will have a higher score
    # than a text t2 where k1 appears n+l times and k2 appears m-l times,
    # thus favoring diversity.

    log_manager.debug_global("Calculating rarity/diversity scores ...")

    # We select the column 'score_rarity_diversity', which as of now contains only 0s.
    # This returns a Series object whose index is the docids (the index of doc_df).
    scores = doc_df['score_rarity_diversity']

    bar = create_progress_bar('Computing scores per keyword',
                              keyword_df.shape[0])

    # iterate over rows in keyword_df
    for kw, data in keyword_df.iterrows():
        # kw is the label of the row (the keyword_id)
        # data is a Series of the values in this row

        # get this keyword's corpus token count;
        # we will use this to calculate its inverse frequency
        kw_freq = data.corpus_count

        # get this keyword's token count per text
        try:
            # token_df has a MultiIndex: 1st the keyword_id, 2nd the docid.
            # We select all rows with keyword_id = kw. This returns a DataFrame.
            # Then we select only the column 'token_count'. This returns a Series.
            tokencounts = token_df.loc[kw]['token_count']
            # tokencounts is a Series, indexed with docid,
            # containing as values the token counts of kw in the given docid
        except KeyError as e:
            tokencounts = pd.Series(index=doc_df.index, data=0)

        # This is the formula:
        def calculate_score(token_count, kw_freq):
            return sum(
                map(lambda x: pow(kw_freq, -1) * pow(x, -1),
                    range(1, int(token_count) + 1)))

        # Apply this function to the token counts of the current keyword.
        scores = scores.add(tokencounts.apply(calculate_score,
                                              args=(kw_freq, )),
                            fill_value=0.0)

        bar.next()
    bar.finish()

    # feed the temporary Series back into the table
    doc_df['score_rarity_diversity'] = scores

    # sort by highest score
    doc_df = doc_df.sort_values(by='score_rarity_diversity', ascending=False)

    return doc_df
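# A quick standalone check of the scoring formula described in the comments of
# score_rarity_diversity, with made-up numbers (hypothetical counts, not
# project data): a rare keyword (corpus count 10) appearing 3 times in a text
# contributes (1/10)*(1/1 + 1/2 + 1/3) ~= 0.183 to that text's score, while a
# frequent keyword (corpus count 1000) appearing 3 times contributes ~0.00183.

def keyword_contribution(token_count, corpus_count):
    # sum_{n=1..token_count} (1/corpus_count) * (1/n)
    return sum(1.0 / corpus_count * 1.0 / n
               for n in range(1, token_count + 1))


print(keyword_contribution(3, 10))    # ~0.1833
print(keyword_contribution(3, 1000))  # ~0.001833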
def test(opts, model, test_data, which_epoch='best', batch_size=1,
         expdir=None, save_loss=False, save_images=True):
    test_loader = DataLoader(test_data,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=opts.dataloader_workers,
                             pin_memory=True)

    model.load_checkpoint(which_epoch)
    model.set_mode('eval')

    output_dir = os.path.join(
        opts.results_dir,
        opts.experiment_name if expdir is None else expdir,
        'test_{}'.format(which_epoch))
    os.makedirs(output_dir, exist_ok=True)

    test_start = time.perf_counter()
    test_loss = None

    bar = progress.bar.Bar('Test', max=len(test_loader))
    for idx, data in enumerate(test_loader):
        model.set_data(data)
        model.test(compute_loss=save_loss)

        if save_loss:
            if test_loss is None:
                test_loss = model.get_data()
            else:
                test_loss = utils.concatenate_dicts(test_loss,
                                                    model.get_data())

        if save_images:
            output = model.get_images()
            for img_label, img in output.items():
                this_output_dir = os.path.join(output_dir, img_label)
                os.makedirs(this_output_dir, exist_ok=True)
                output_file = os.path.join(this_output_dir,
                                           '{:05}.png'.format(idx))
                # print("Saving to {}".format(output_file))
                img.save(output_file)

        bar.next()
    bar.finish()

    test_end = time.perf_counter()
    test_fps = len(test_data) / (test_end - test_start)
    print('Processed {} images | time: {:.3f} s | test: {:.3f} fps'.format(
        len(test_data), test_end - test_start, test_fps))

    if save_loss:
        loss_file = os.path.join(output_dir, 'loss.csv')
        header = [key for key in test_loss]
        entries = [test_loss[key] for key in test_loss]
        entries = np.atleast_2d(np.array(entries)).T.tolist()

        print("Saving test loss to {}".format(loss_file))
        with open(loss_file, 'wt') as file:
            file.write(','.join(header) + '\n')
            for entry in entries:
                line = ','.join([str(val) for val in entry]) + '\n'
                file.write(line)
def train(opts, model, train_data, val_data, num_epochs, resume_from_epoch=None):
    train_loader = DataLoader(train_data,
                              batch_size=opts.batch_size,
                              shuffle=True,
                              num_workers=opts.dataloader_workers,
                              pin_memory=True)
    val_loader = DataLoader(val_data,
                            batch_size=opts.batch_size,
                            shuffle=False,
                            num_workers=opts.dataloader_workers,
                            pin_memory=True)

    print('Training images: {}'.format(len(train_data)))
    print('Validation images: {}'.format(len(val_data)))

    log_dir = os.path.join(opts.results_dir, opts.experiment_name)
    writer = SummaryWriter(log_dir)

    ### LOAD FROM CHECKPOINT ###
    if resume_from_epoch is not None:
        try:
            initial_epoch = model.load_checkpoint(resume_from_epoch) + 1
            iterations = (initial_epoch - 1) * opts.batch_size
        except FileNotFoundError:
            print('No model available for epoch {}, starting fresh'.format(
                resume_from_epoch))
            initial_epoch = 1
            iterations = 0
    else:
        initial_epoch = 1
        iterations = 0

    ### TRAIN AND VALIDATE ###
    best_total_val_loss = 1e12
    for epoch in range(initial_epoch, num_epochs + 1):
        epoch_start = time.perf_counter()

        # TRAIN
        epoch_train_loss = None
        model.set_mode('train')

        bar = progress.bar.Bar('Epoch {} train'.format(epoch),
                               max=len(train_loader))
        for data in train_loader:
            model.set_data(data)
            model.optimize()

            if epoch_train_loss is None:
                epoch_train_loss = model.get_errors()
            else:
                epoch_train_loss = utils.concatenate_dicts(
                    epoch_train_loss, model.get_errors())

            iterations += 1
            bar.next()
        bar.finish()

        # VISUALIZE
        for label, image in model.get_images().items():
            image = np.array(image).transpose([2, 0, 1])
            writer.add_image('train/' + label, image, epoch)

        train_end = time.perf_counter()

        # VALIDATE
        epoch_val_loss = None
        model.set_mode('eval')

        bar = progress.bar.Bar('Epoch {} val '.format(epoch),
                               max=len(val_loader))
        for data in val_loader:
            model.set_data(data)
            model.test(compute_loss=True)

            if epoch_val_loss is None:
                epoch_val_loss = model.get_errors()
            else:
                epoch_val_loss = utils.concatenate_dicts(
                    epoch_val_loss, model.get_errors())

            bar.next()
        bar.finish()

        for label, image in model.get_images().items():
            image = np.array(image).transpose([2, 0, 1])
            writer.add_image('val/' + label, image, epoch)

        epoch_end = time.perf_counter()

        epoch_avg_val_loss = utils.compute_dict_avg(epoch_val_loss)
        epoch_avg_train_loss = utils.compute_dict_avg(epoch_train_loss)

        train_fps = len(train_data) / (train_end - epoch_start)
        val_fps = len(val_data) / (epoch_end - train_end)

        print(
            'End of epoch {}/{} | iter: {} | time: {:.3f} s | train: {:.3f} fps | val: {:.3f} fps'
            .format(epoch, num_epochs, iterations, epoch_end - epoch_start,
                    train_fps, val_fps))

        # LOG ERRORS
        errors = utils.tag_dict_keys(epoch_avg_train_loss, 'train')
        errors.update(utils.tag_dict_keys(epoch_avg_val_loss, 'val'))
        for key, value in sorted(errors.items()):
            writer.add_scalar(key, value, epoch)
            print('{:20}: {:.3e}'.format(key, value))

        writer.add_scalar('fps/train', train_fps, epoch)
        writer.add_scalar('fps/val', val_fps, epoch)

        # SAVE MODELS
        model.save_checkpoint(epoch, 'latest')
        if epoch % opts.checkpoint_interval == 0:
            model.save_checkpoint(epoch, epoch)

        curr_total_val_loss = 0
        for key, val in epoch_avg_val_loss.items():
            if 'eval_loss' in key:
                try:
                    curr_total_val_loss += val[-1]
                except IndexError:
                    curr_total_val_loss += val

        if epoch == 1 or curr_total_val_loss < best_total_val_loss:
            model.save_checkpoint(epoch, 'best')
            best_total_val_loss = curr_total_val_loss