def create_BOW(root_directory='./preprocessed_texts/'):
    """
    :type root_directory: str
    """
    training_path = os.path.join(root_directory, "training")
    training_bag_of_author = {}
    # super_counter = Counter()
    doc_count_of_author = {}
    authors = list_dirs(training_path)
    # total_doc_count = 0
    for author in authors:
        bag = Counter()
        author_path = os.path.join(training_path, author)
        files_of_author = list_files(author_path)
        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            bag += Counter(tokens)
        training_bag_of_author[author] = bag
        doc_count = len(files_of_author)
        doc_count_of_author[author] = doc_count
        # total_doc_count += doc_count
        # super_counter += bag
    # print(super_counter.most_common(10))
    return training_bag_of_author, doc_count_of_author
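# The list_dirs()/list_files() helpers used above (and in the snippets below)
# are not shown. A minimal sketch, assuming they simply return the names of
# immediate subdirectories and regular files of a path; the actual project
# utilities may behave differently.
import os

def list_dirs(path):
    # Names of immediate subdirectories of 'path'.
    return [d for d in os.listdir(path)
            if os.path.isdir(os.path.join(path, d))]

def list_files(path):
    # Names of regular files directly inside 'path'.
    return [f for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))]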
def copy_supporting_files(start_path, destination):
    for file in list_files(start_path):
        if not (file.startswith("_") or file.startswith(".")):
            print("copying: %s to: %s" % (file, destination))
            copy_file(path.join(start_path, file), path.join(destination, file))
    for dir in list_dirs(start_path):
        if not (dir.startswith("_") or dir.startswith(".")):
            print("copying: %s to: %s" % (dir, destination))
            copy_tree(path.join(start_path, dir), path.join(destination, dir))
def init_graphics(self):
    self.title = draw.create_text(x=self.gw.screen_width - 80, y=376,
                                  align='left', t='map editor',
                                  c=(255, 255, 255), s=10,
                                  group=groups.menu_text)
    self.add_obj(self.title, draw.go.TEXT)
    self.text_load = draw.create_text(x=20, y=340, align='left', t='load map',
                                      c=(255, 255, 255), s=10,
                                      group=groups.menu_text)
    self.add_obj(self.text_load, draw.go.TEXT)
    self.topbar = draw.create_rect(x=0, y=self.gw.screen_width - 30,
                                   w=self.gw.screen_width, h=30,
                                   color=(50, 50, 50, 255),
                                   group=groups.menu_rect_back)
    self.add_obj(self.topbar, draw.go.RECT)
    self.button_new = button.Button(0, 370, 80, 30, text="new map")
    self.add_obj(self.button_new, draw.go.BUTTON)
    utils.mkdir2("maps")
    # create buttons for every map
    map_names = utils.list_dirs("./maps")
    if len(map_names) == 0:
        draw.change_text(self.text_load, "no maps found")
    map_names = sorted(map_names)
    print(map_names)
    offset = 0
    grid = 4
    offx = 20
    offy = 300
    # TODO use utils.grid_coords() method!
    for mn in map_names:
        x = int(math.fmod(offset, grid))
        y = math.floor(offset / grid)
        mb = button.Button(offx + (x * 90), offy - (y * 40), 80, 30, text=mn)
        self.add_obj(mb, draw.go.BUTTON)
        self.buttons_maps.append(mb)
        offset += 1
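# The TODO above refers to a utils.grid_coords() helper that is not shown.
# A minimal sketch of what such a helper could look like, assuming it maps a
# flat index onto (column, row) positions in a fixed-width grid; this is a
# hypothetical name and implementation, not the project's actual code.
def grid_coords(index, columns):
    # Column cycles 0..columns-1; the row increases every 'columns' items.
    return index % columns, index // columns

# Example: with columns=4, indices 0..5 map to
# (0, 0), (1, 0), (2, 0), (3, 0), (0, 1), (1, 1)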
def get_gens_dir(self, gens_to_remove):
    all_gens_dir = ut.list_dirs(os.path.join(self.dir, '*'))
    if not gens_to_remove:
        return all_gens_dir
    gens_dir = list()
    for index, gen_dir in enumerate(all_gens_dir):
        if not any(gen_to_remove in gen_dir for gen_to_remove in gens_to_remove):
            gens_dir.append(gen_dir)
    return gens_dir
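# Hypothetical usage sketch of get_gens_dir(); the directory names and
# substrings below are invented for illustration only.
#
#   Given generation directories such as ['gen_001', 'gen_002', 'gen_003'],
#   calling get_gens_dir(gens_to_remove=['002']) keeps every path whose name
#   does not contain '002', i.e. ['gen_001', 'gen_003'].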
def migrate_csv_data_to_db():
    for mkt in utils.list_dirs(join(base_path, 'data/csv')):
        dir_path = join(base_path, f'data/csv/{mkt}')
        files = utils.list_files(dir_path)
        for filename in files:
            ticker = utils.remove_filename_ext(filename)
            # for index tickers, change ^ to _
            collection = utils.convert_ticker_to_coll(ticker)
            print(f'ticker={ticker}, collection={collection}')
            with open(join(dir_path, filename)) as csv_file:
                documents = utils.parse_ohlcv_csv(csv_file)
                resp = mdb.write_many_records(mdb.db_to_use(mkt), collection, documents)
                added_index = mdb.add_index_on_date(mdb.db_to_use(mkt), collection)
                print(f'Inserted: {len(resp.inserted_ids)} records')
                print(f'Index created: {added_index}')
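# The real utils.convert_ticker_to_coll() is not shown. A minimal sketch,
# assuming (per the inline comment above) that its only job is to turn a
# ticker into a collection name by replacing '^' with '_'; this is an
# assumption, not the project's actual implementation.
def convert_ticker_to_coll(ticker):
    # Index tickers such as '^GSPC' become '_GSPC'; plain tickers pass through.
    return ticker.replace('^', '_')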
def __init__(self, solid_run_dir):
    """Create and populate a new SolidRun instance.

    Arguments:
      solid_run_dir: path to the top-level directory holding the files
        generated by the SOLiD sequencer run e.g.
        /path/to/SOLiD/data/solid0123_20130426_FRAG_BC
    """
    # Initialise
    self.run_dir = None
    self.run_name = None
    self.run_info = None
    self.run_definition = None
    self.samples = []
    # Basic data from the supplied directory name
    if not os.path.isdir(os.path.abspath(solid_run_dir)):
        # Directory not found
        logging.info("SOLiD data dir '%s' not found" % solid_run_dir)
        return
    self.run_dir = os.path.abspath(solid_run_dir)
    # Locate and process the run definition file
    self.run_name = self.run_dir.strip(os.sep).split(os.sep)[-1]
    self.run_defn_filn = os.path.join(
        self.run_dir, self.run_name + "_run_definition.txt")
    if not os.path.isfile(self.run_defn_filn):
        # Unable to find run definition
        logging.warning("Unable to find run definition file for %s" %
                        self.run_dir)
        # Attempt to recover: look for other possible candidates
        self.run_defn_filn = None
        for f in os.listdir(self.run_dir):
            if f.endswith("_run_definition.txt"):
                self.run_defn_filn = os.path.join(self.run_dir, f)
                logging.warning(
                    "%s: using run definition file %s" %
                    (os.path.basename(self.run_dir), self.run_defn_filn))
                break
    if self.run_defn_filn:
        # Populate run definition object
        self.run_definition = SolidRunDefinition(self.run_defn_filn)
        # Get run name and info
        self.run_name = self.run_definition.runName
        self.run_info = SolidRunInfo(self.run_name)
        # Populate libraries
        for i in range(0, self.run_definition.nSamples()):
            sample_name = self.run_definition.getDataItem('sampleName', i)
            library_name = self.run_definition.getDataItem('library', i)
            # Barcoded samples
            #
            # Look for content in the "barcodes" column for the library
            # in the run definition file
            #
            # There may be several barcoded samples
            # Example barcode items:
            # --> "1"
            # --> "1,2,3,4,5,6,7,8"
            # (or could be empty)
            try:
                barcodes = self.run_definition.getDataItem('barcodes', i)
            except IndexError:
                barcodes = ''
            logging.debug("%s: barcodes: %s" % (library_name, barcodes))
            library_is_barcoded = (barcodes != '' and barcodes)
            if library_is_barcoded:
                barcodes = barcodes.strip('"').split(',')
            # Look for the directory with the results
            #
            # There should be a symlink "results" that will
            # point to the actual results directory
            results = os.path.join(self.run_dir, sample_name, 'results')
            if os.path.islink(results):
                libraries_dir = os.path.join(self.run_dir,
                                             sample_name,
                                             os.readlink(results),
                                             'libraries')
            else:
                libraries_dir = None
            self.add_library(sample_name, library_name,
                             libraries_dir, library_is_barcoded)
    else:
        logging.warning("No run definition file found for %s" % self.run_dir)
        # Improvise run name and info
        self.run_name = os.path.basename(self.run_dir)
        self.run_info = SolidRunInfo(self.run_name)
        # Try to guess samples and libraries
        samples = []
        for s in utils.list_dirs(self.run_dir):
            logging.debug("Examining subdir %s" % s)
            # Look for 'results' subdir
            results = os.path.join(self.run_dir, s, 'results')
            if not os.path.isdir(results):
                continue
            # Look for 'libraries' subdir
            if os.path.islink(results):
                libraries_dir = os.path.join(self.run_dir, s,
                                             os.readlink(results),
                                             'libraries')
            else:
                continue
            # Look for possible libraries
            for d in utils.list_dirs(libraries_dir):
                logging.debug("Examining putative library subdir %s" % d)
                self.add_library(s, d, libraries_dir, False)
def train_model(self, train_dataloader, test_dataloader,
                train_params: TrainParams, resume=False, new_train=False):
    """
    Start training the model with the specified parameters.
    """
    print("####### Training The model...")
    self.params = train_params
    self.optimizer = train_params.optimizer
    # Get the device (GPU/CPU) and migrate the model to it
    device = train_params.device
    print("\t Setting up model on ", device.type, "...")
    if not os.path.exists(self.checkpoints_dir):
        os.mkdir(self.checkpoints_dir)
    target_repo = self.git_manager.get_repo('checkpoints')
    # Initialize training variables
    print("\t Initializing ", "...")
    self.min_MAE = 10000
    self.min_epoch = 0
    train_loss_list = []
    test_error_list = []
    start_epoch = 0
    dirs = utils.list_dirs(self.checkpoints_dir)
    train_dirs = re.findall('Train_[0-9]+', ' '.join(dirs))
    if len(train_dirs) == 0:
        last_train = 1
    else:
        last_train = max(sorted([int(re.sub('Train_', '', dirname))
                                 for dirname in train_dirs]))
    # If the resume option is specified, restore the model state and resume training
    if new_train or (not resume):
        if len(train_dirs) == 0:
            self.checkpoints_dir = os.path.join(self.checkpoints_dir, 'Train_1')
        else:
            self.checkpoints_dir = os.path.join(self.checkpoints_dir,
                                                'Train_' + str(last_train + 1))
    else:
        self.checkpoints_dir = os.path.join(self.checkpoints_dir,
                                            'Train_' + str(last_train))
        params_hist = [utils.extract_number(file_path) for file_path in
                       glob.glob(os.path.join(self.checkpoints_dir, '*.pth'))]
        if len(params_hist) > 0:
            print("\t Restore checkpoints found! Resuming training...")
            sorted_hist = sorted(params_hist)
            start_epoch = max(sorted_hist)
            last_epoch = glob.glob(os.path.join(
                self.checkpoints_dir, 'epoch_' + str(start_epoch) + '.pth'))[0]
            _, self.min_MAE, self.min_epoch = self.load_chekpoint(last_epoch)
            files_to_push = []
            for epoch in sorted_hist:
                if (epoch != self.min_epoch and epoch != start_epoch
                        and epoch != train_params.maxEpochs):
                    path = glob.glob(os.path.join(
                        self.checkpoints_dir, 'epoch_' + str(epoch) + '.pth'))[0]
                    obj = torch.load(path, map_location=device)
                    if (obj['model_state_dict'] is not None
                            or obj['optimizer_state_dict'] is not None):
                        # Strip heavy state dicts from old checkpoints before pushing
                        obj['model_state_dict'] = None
                        obj['optimizer_state_dict'] = None
                        self.save_checkpoint(obj, path)
                        files_to_push.append(path)
            if len(files_to_push) > 0:
                res = self.git_manager.push_files(
                    target_repo, files_to_push, 'checkpoints migration',
                    branch=self.__class__.__name__,
                    dir=os.path.basename(self.checkpoints_dir))
                if isinstance(res, int) and res == len(files_to_push):
                    print('\t Successfully committed previous checkpoints (',
                          res, ' files).')
                else:
                    raise RuntimeError('Couldn\'t push all files')
    self.to(device)
    start_epoch += 1
    # Start training
    for epoch in range(start_epoch, train_params.maxEpochs + 1):
        start = time.time()
        # Set the model to training mode
        self.train()
        epoch_loss = 0
        # Run the training pass (feedforward, backpropagation, ...) for each batch
        for i, (img, gt_dmap) in enumerate(train_dataloader):
            torch.cuda.empty_cache()
            img = img.to(device).detach()
            gt_dmap = gt_dmap.to(device).detach()
            # forward propagation
            try:
                est_dmap = self(img)
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    torch.cuda.empty_cache()
                    est_dmap = self(img)
                else:
                    raise  # re-raise anything that is not an OOM error
            if not est_dmap.size() == gt_dmap.size():
                est_dmap = F.interpolate(est_dmap, size=(
                    gt_dmap.size()[2], gt_dmap.size()[3]), mode='bilinear')
            # est_dmap = F.interpolate(est_dmap, size=(
            #     gt_dmap.size()[1], gt_dmap.size()[2]), mode='bilinear')
            # if torch.isnan(est_dmap): print('Estimated is nan')
            # if torch.isnan(gt_dmap): print('Ground truth is nan')
            # calculate loss
            loss = train_params.criterion(est_dmap, gt_dmap)
            epoch_loss += loss.item()
            # if i % 5 == 0: print(est_dmap.data.sum(), gt_dmap.data.sum())
            torch.cuda.empty_cache()
            # Zero the gradients (in PyTorch, backward() accumulates gradients)
            self.optimizer.zero_grad()
            # Backpropagation
            loss.backward()
            self.optimizer.step()
            del img, gt_dmap, est_dmap
        print("\t epoch:" + str(epoch) + "\n",
              "\t\t loss:", epoch_loss / len(train_dataloader))
        train_loss_list.append(epoch_loss / len(train_dataloader))
        # Set the model to validation mode
        self.eval()
        MAE = 0
        MSE = 0
        for i, (img, gt_dmap) in enumerate(test_dataloader):
            img = img.to(device)
            gt_dmap = gt_dmap.to(device)
            # forward propagation
            try:
                est_dmap = self(img)
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    torch.cuda.empty_cache()
                    est_dmap = self(img)
                else:
                    raise  # re-raise anything that is not an OOM error
            if not est_dmap.size() == gt_dmap.size():
                est_dmap = F.interpolate(est_dmap, size=(
                    gt_dmap.size()[2], gt_dmap.size()[3]), mode='bilinear')
            # est_dmap = F.interpolate(est_dmap, size=(
            #     gt_dmap.size()[1], gt_dmap.size()[2]), mode='bilinear')
            mae = abs(est_dmap.data.sum() - gt_dmap.data.sum()).item()
            MAE += mae
            MSE += mae ** 2
            del img, gt_dmap, est_dmap
            torch.cuda.empty_cache()
        MAE = MAE / len(test_dataloader)
        MSE = np.math.sqrt(MSE / len(test_dataloader))
        if MAE < self.min_MAE:
            self.min_MAE = MAE
            self.min_epoch = epoch
        test_error_list.append(MAE)
        print("\t\t error:" + str(MAE) + " min_MAE:" +
              str(self.min_MAE) + " min_epoch:" + str(self.min_epoch))
        end = time.time()
        check_point = {
            'model_state_dict': self.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': epoch_loss / len(train_dataloader),
            'mae': MAE,
            'min_MAE': self.min_MAE,
            'min_epoch': self.min_epoch,
            'duration': str(datetime.timedelta(seconds=end - start))
        }
        # Save checkpoint
        self.save_checkpoint(check_point, os.path.join(
            self.checkpoints_dir, 'epoch_' + str(epoch) + '.pth'))
    # Save training summary to disk
    self.make_summary(finished=True)
    print('Training finished.')
    return (train_loss_list, test_error_list, self.min_epoch, self.min_MAE)
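# train_model() above reads optimizer, criterion, device and maxEpochs off its
# train_params argument. The TrainParams class itself is not shown; a minimal
# sketch of what it might contain (an assumption, not the project's actual
# definition):
from dataclasses import dataclass
import torch

@dataclass
class TrainParams:
    optimizer: torch.optim.Optimizer  # optimizer already bound to the model's parameters
    criterion: torch.nn.Module        # loss, e.g. torch.nn.MSELoss() for density maps
    device: torch.device              # torch.device('cuda') or torch.device('cpu')
    maxEpochs: int = 100              # last epoch index used by the training loop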
# Process directories
seqdirs = []
for d in args:
    print "Processing %s..." % d
    for year in years:
        print "- Year %s" % year
        dirn = os.path.join(d, "%s" % year)
        if not os.path.isdir(dirn):
            continue
        for platform in platform_list:
            platform_dirn = os.path.join(dirn, platform)
            if not os.path.isdir(platform_dirn):
                continue
            print "-- Platform %s" % platform
            # Get all the directories for this year/platform combination
            for run in utils.list_dirs(platform_dirn):
                print "--- Run %s" % run
                run_dir = os.path.join(platform_dirn, run)
                if os.path.islink(run_dir):
                    logging.warning("%s: is link, ignoring" % run_dir)
                else:
                    seqdir = SeqDataSizes(run, run_dir, year=year,
                                          platform=platform,
                                          include_subdirs=options.include_subdirs)
                    seqdir.get_disk_usage()
                    seqdirs.append(seqdir)
# Calculate totals for each year
usage = dict()
for year in years:
    usage[year] = dict()
def get_addons(repo):
    '''Returns a list of addon objects for a given repo object.'''
    addon_paths = list_dirs(repo.path)
    return [Addon(repo, addon_path) for addon_path in addon_paths]
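# Hypothetical usage sketch of get_addons(); the Repo construction and path
# below are invented for illustration, not the project's actual API.
#
#   repo = Repo(path='/path/to/addons')   # assumed constructor
#   for addon in get_addons(repo):
#       print(addon)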