def create_BOW(root_directory='./preprocessed_texts/'):
    """Build a bag-of-words Counter per author from the training set.

    :type root_directory: str
    :return: (training_bag_of_author, doc_count_of_author), mapping each
        author to a token Counter and to their number of training documents
    """
    training_path = os.path.join(root_directory, "training")

    training_bag_of_author = {}
    doc_count_of_author = {}

    authors = list_dirs(training_path)

    for author in authors:
        bag = Counter()

        author_path = os.path.join(training_path, author)
        files_of_author = list_files(author_path)

        # Accumulate token counts over every document by this author
        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            bag += Counter(tokens)

        training_bag_of_author[author] = bag
        doc_count_of_author[author] = len(files_of_author)

    return training_bag_of_author, doc_count_of_author
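
This example assumes `import os`, `from collections import Counter`, and three project-local helpers: `list_dirs`, `list_files`, and `tokenize_file`. A minimal sketch of what those helpers might look like; the tokenizer in particular is a guess, standing in for whatever preprocessing the project actually performs:

import os
from collections import Counter  # used by create_BOW above

def list_dirs(path):
    """Return the names of the subdirectories of path."""
    return [d for d in os.listdir(path)
            if os.path.isdir(os.path.join(path, d))]

def list_files(path):
    """Return the names of the regular files in path."""
    return [f for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))]

def tokenize_file(file_path):
    """Hypothetical tokenizer: lowercased whitespace tokens."""
    with open(file_path, encoding='utf-8') as fh:
        return fh.read().lower().split()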
Example #2
def copy_supporting_files(start_path, destination):
    # Copy regular files, skipping underscore- and dot-prefixed names
    for file in list_files(start_path):
        if not (file.startswith("_") or file.startswith(".")):
            print("copying: %s to: %s" % (file, destination))
            copy_file(path.join(start_path, file), path.join(destination, file))

    # Recursively copy subdirectories under the same naming rule
    for dirname in list_dirs(start_path):
        if not (dirname.startswith("_") or dirname.startswith(".")):
            print("copying: %s to: %s" % (dirname, destination))
            copy_tree(path.join(start_path, dirname), path.join(destination, dirname))
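
The snippet does not show its imports. Assuming `copy_file` and `copy_tree` are the distutils utilities of the same names (an assumption; the project may define its own equivalents), the header would be:

from os import path

# Assumed: copy_file/copy_tree behave like the distutils helpers of
# the same names; the project may provide its own implementations.
from distutils.file_util import copy_file
from distutils.dir_util import copy_tree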
Example #3
    def init_graphics(self):
        self.title = draw.create_text(x=self.gw.screen_width - 80,
                                      y=376,
                                      align='left',
                                      t='map editor',
                                      c=(255, 255, 255),
                                      s=10,
                                      group=groups.menu_text)
        self.add_obj(self.title, draw.go.TEXT)

        self.text_load = draw.create_text(x=20,
                                          y=340,
                                          align='left',
                                          t='load map',
                                          c=(255, 255, 255),
                                          s=10,
                                          group=groups.menu_text)
        self.add_obj(self.text_load, draw.go.TEXT)

        self.topbar = draw.create_rect(x=0,
                                       y=self.gw.screen_width - 30,
                                       w=self.gw.screen_width,
                                       h=30,
                                       color=(50, 50, 50, 255),
                                       group=groups.menu_rect_back)
        self.add_obj(self.topbar, draw.go.RECT)

        self.button_new = button.Button(0, 370, 80, 30, text="new map")
        self.add_obj(self.button_new, draw.go.BUTTON)

        utils.mkdir2("maps")
        # Create a button for every map directory
        map_names = utils.list_dirs("./maps")

        if len(map_names) == 0:
            draw.change_text(self.text_load, "no maps found")
        map_names = sorted(map_names)
        print(map_names)

        offset = 0
        grid = 4
        offx = 20
        offy = 300
        # TODO use utils.grid_coords() method!
        for mn in map_names:
            x = int(math.fmod(offset, grid))
            y = math.floor(offset / grid)
            mb = button.Button(offx + (x * 90),
                               offy - (y * 40),
                               80,
                               30,
                               text=mn)
            self.add_obj(mb, draw.go.BUTTON)
            self.buttons_maps.append(mb)
            offset += 1
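
The TODO above points at a utils.grid_coords() helper. A minimal sketch of what such a helper might look like (hypothetical; inferred from the fmod/floor arithmetic it would replace):

def grid_coords(index, columns):
    """Map a flat index to (col, row) in a grid with `columns` columns.

    Hypothetical reconstruction: for non-negative integers this is
    equivalent to int(math.fmod(index, columns)) and
    math.floor(index / columns) in the loop above.
    """
    return index % columns, index // columns

With this helper the loop body would reduce to x, y = grid_coords(offset, grid).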
Example #4
    def get_gens_dir(self, gens_to_remove):
        """Return generation directories, skipping any whose path contains
        one of the substrings in gens_to_remove."""
        all_gens_dir = ut.list_dirs(os.path.join(self.dir, '*'))
        if not gens_to_remove:
            return all_gens_dir

        gens_dir = []
        for gen_dir in all_gens_dir:
            if not any(gen_to_remove in gen_dir for gen_to_remove in gens_to_remove):
                gens_dir.append(gen_dir)

        return gens_dir
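
The exclusion is a plain substring test, which is easy to misread as an exact name match. A self-contained illustration of its semantics (the directory names are made up):

all_gens_dir = ['runs/gen_001', 'runs/gen_002', 'runs/gen_002_retry']
gens_to_remove = ['gen_002']

kept = [d for d in all_gens_dir
        if not any(g in d for g in gens_to_remove)]
# Note: 'runs/gen_002_retry' is also dropped, because the match is a
# substring test over the whole path, not an exact name comparison.
assert kept == ['runs/gen_001']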
Example #5
def migrate_csv_data_to_db():
    for mkt in utils.list_dirs(join(base_path, 'data/csv')):
        dir_path = join(base_path, f'data/csv/{mkt}')
        files = utils.list_files(dir_path)

        for filename in files:
            ticker = utils.remove_filename_ext(filename)
            # for index tickers, change ^ to _
            collection = utils.convert_ticker_to_coll(ticker)
            print(f'ticker={ticker}, collection={collection}')

            with open(join(dir_path, filename)) as csv_file:
                documents = utils.parse_ohlcv_csv(csv_file)
                resp = mdb.write_many_records(mdb.db_to_use(mkt), collection,
                                              documents)
                added_index = mdb.add_index_on_date(mdb.db_to_use(mkt),
                                                    collection)
                print(f'Inserted: {len(resp.inserted_ids)} records')
                print(f'Index created: {added_index}')
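
The inline comment describes the ticker-to-collection mapping; a minimal sketch of what utils.convert_ticker_to_coll might do, assuming the comment tells the whole story:

def convert_ticker_to_coll(ticker):
    """Hypothetical reconstruction: index tickers such as '^GSPC'
    contain '^', which is swapped for '_' in the collection name."""
    return ticker.replace('^', '_')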
Example #6
    def __init__(self, solid_run_dir):
        """Create and populate a new SolidRun instance.

        Arguments:
          solid_run_dir: path to the top-level directory holding the files
          generated by the SOLiD sequencer run e.g.
          /path/to/SOLiD/data/solid0123_20130426_FRAG_BC
        """

        # Initialise
        self.run_dir = None
        self.run_name = None
        self.run_info = None
        self.run_definition = None
        self.samples = []

        # Basic data from the supplied directory name
        if not os.path.isdir(os.path.abspath(solid_run_dir)):
            # Directory not found
            logging.info("SOLiD data dir '%s' not found" % solid_run_dir)
            return
        self.run_dir = os.path.abspath(solid_run_dir)

        # Locate and process the run definition file
        self.run_name = self.run_dir.strip(os.sep).split(os.sep)[-1]
        self.run_defn_filn = os.path.join(
            self.run_dir, self.run_name + "_run_definition.txt")
        if not os.path.isfile(self.run_defn_filn):
            # Unable to find run definition
            logging.warning("Unable to find run definition file for %s" %
                            self.run_dir)
            # Attempt to recover: look for other possible candidates
            self.run_defn_filn = None
            for f in os.listdir(self.run_dir):
                if f.endswith("_run_definition.txt"):
                    self.run_defn_filn = os.path.join(self.run_dir, f)
                    logging.warning(
                        "%s: using run definition file %s" %
                        (os.path.basename(self.run_dir), self.run_defn_filn))
                    break

        if self.run_defn_filn:
            # Populate run definition object
            self.run_definition = SolidRunDefinition(self.run_defn_filn)
            # Get run name and info
            self.run_name = self.run_definition.runName
            self.run_info = SolidRunInfo(self.run_name)
            # Populate libraries
            for i in range(0, self.run_definition.nSamples()):
                sample_name = self.run_definition.getDataItem('sampleName', i)
                library_name = self.run_definition.getDataItem('library', i)
                # Barcoded samples
                #
                # Look for content in the "barcodes" column for the library
                # in the run definition file
                #
                # There may be several barcoded samples
                # Example barcode items:
                # --> "1"
                # --> "1,2,3,4,5,6,7,8"
                # (or could be empty)
                try:
                    barcodes = self.run_definition.getDataItem('barcodes', i)
                except IndexError:
                    barcodes = ''
                logging.debug("%s: barcodes: %s" % (library_name, barcodes))
                library_is_barcoded = (barcodes != '' and barcodes)
                if library_is_barcoded:
                    barcodes = barcodes.strip('"').split(',')

                # Look for the directory with the results
                #
                # There should be a symlink "results" that will
                # point to the actual results directory
                results = os.path.join(self.run_dir, sample_name, 'results')
                if os.path.islink(results):
                    libraries_dir = os.path.join(self.run_dir, sample_name,
                                                 os.readlink(results),
                                                 'libraries')
                else:
                    libraries_dir = None
                self.add_library(sample_name, library_name, libraries_dir,
                                 library_is_barcoded)
        else:
            logging.warning("No run definition file found for %s" %
                            self.run_dir)
            # Improvise run name and info
            self.run_name = os.path.basename(self.run_dir)
            self.run_info = SolidRunInfo(self.run_name)
            # Try to guess samples and libraries
            samples = []
            for s in utils.list_dirs(self.run_dir):
                logging.debug("Examining subdir %s" % s)
                # Look for 'results' subdir
                results = os.path.join(self.run_dir, s, 'results')
                if not os.path.isdir(results):
                    continue
                # Look for 'libraries' subdir
                if os.path.islink(results):
                    libraries_dir = os.path.join(self.run_dir, s,
                                                 os.readlink(results),
                                                 'libraries')
                else:
                    continue
                # Look for possible libraries
                for d in utils.list_dirs(libraries_dir):
                    logging.debug("Examining putative library subdir %s" % d)
                    self.add_library(s, d, libraries_dir, False)
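
A short usage sketch built from the docstring's example path (the path is illustrative; note that the constructor returns early, leaving run_dir as None, when the directory is missing):

run = SolidRun('/path/to/SOLiD/data/solid0123_20130426_FRAG_BC')
if run.run_dir is None:
    print("Run directory not found")
else:
    print(run.run_name, "with", len(run.samples), "samples")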
Example #7
    def train_model(self, train_dataloader, test_dataloader, train_params: TrainParams, resume=False, new_train=False):
        """
        Start training the model with the specified parameters.
        """
        print("####### Training The model...")
        self.params = train_params
        self.optimizer = train_params.optimizer
        # Get the device (GPU/CPU) and migrate the model to it
        device = train_params.device
        print("\t Setting up model on ", device.type, "...")
        if not os.path.exists(self.checkpoints_dir):
            os.mkdir(self.checkpoints_dir)

        target_repo = self.git_manager.get_repo('checkpoints')
        # Initialize training variables
        print("\t Initializing ...")
        self.min_MAE = 10000
        self.min_epoch = 0
        train_loss_list = []
        test_error_list = []
        start_epoch = 0

        # Find the highest-numbered existing Train_<n> checkpoint directory
        dirs = utils.list_dirs(self.checkpoints_dir)
        train_dirs = re.findall('Train_[0-9]+', ' '.join(dirs))
        if len(train_dirs) == 0:
            last_train = 1
        else:
            last_train = max(int(re.sub('Train_', '', dirname)) for dirname in train_dirs)

        # Start a fresh numbered run, or reuse the latest one when resuming
        if new_train or (not resume):
            if len(train_dirs) == 0:
                self.checkpoints_dir = os.path.join(self.checkpoints_dir, 'Train_1')
            else:
                self.checkpoints_dir = os.path.join(self.checkpoints_dir, 'Train_' + str(last_train + 1))
        else:
            self.checkpoints_dir = os.path.join(self.checkpoints_dir, 'Train_' + str(last_train))
            # Collect the epoch numbers of the saved checkpoint files
            params_hist = [utils.extract_number(file_path) for file_path in glob.glob(
                os.path.join(self.checkpoints_dir, '*.pth'))]

            if len(params_hist) > 0:
                print("\t Checkpoints found! Resuming training...")
                sorted_hist = sorted(params_hist)
                start_epoch = max(sorted_hist)
                last_epoch = glob.glob(os.path.join(
                    self.checkpoints_dir, 'epoch_' + str(start_epoch) + '.pth'))[0]

                _, self.min_MAE, self.min_epoch = self.load_chekpoint(
                    last_epoch)

                # Strip the heavy state dicts from older checkpoints so only
                # their metadata is kept and pushed to the checkpoints repo
                files_to_push = []
                for epoch in sorted_hist:
                    if epoch != self.min_epoch and epoch != start_epoch and epoch != train_params.maxEpochs:
                        path = glob.glob(os.path.join(
                            self.checkpoints_dir, 'epoch_' + str(epoch) + '.pth'))[0]
                        obj = torch.load(path, map_location=device)
                        if obj['model_state_dict'] is not None or obj['optimizer_state_dict'] is not None:
                            obj['model_state_dict'] = None
                            obj['optimizer_state_dict'] = None
                            self.save_checkpoint(obj, path)
                            files_to_push.append(path)

                if len(files_to_push) > 0:
                    res = self.git_manager.push_files(
                        target_repo, files_to_push, 'checkpoints migration',
                        branch=self.__class__.__name__,
                        dir=os.path.basename(self.checkpoints_dir))
                    if isinstance(res, int) and res == len(files_to_push):
                        print('\t Successfully committed previous checkpoints (', res, ' files).')
                    else:
                        raise RuntimeError("Couldn't push all files")
        self.to(device)

        start_epoch += 1

        # Start training
        for epoch in range(start_epoch, train_params.maxEpochs + 1):
            start = time.time()
            # Set the model to training mode
            self.train()
            epoch_loss = 0
            # Run a training pass (forward, backward, optimizer step) for each batch
            for i, (img, gt_dmap) in enumerate(train_dataloader):
                torch.cuda.empty_cache()
                img = img.to(device).detach()
                gt_dmap = gt_dmap.to(device).detach()
                # Forward propagation; on CUDA OOM, clear the cache and retry once
                try:
                    est_dmap = self(img)
                except RuntimeError as e:
                    if 'out of memory' in str(e):
                        torch.cuda.empty_cache()
                        est_dmap = self(img)
                    else:
                        raise

                # Resize the estimate to match the ground-truth density map
                if est_dmap.size() != gt_dmap.size():
                    est_dmap = F.interpolate(est_dmap, size=(
                        gt_dmap.size()[2], gt_dmap.size()[3]), mode='bilinear')
                # Calculate the loss
                loss = train_params.criterion(est_dmap, gt_dmap)
                epoch_loss += loss.item()

                torch.cuda.empty_cache()

                # Zero the gradients (PyTorch accumulates them across backward() calls)
                self.optimizer.zero_grad()
                # Backpropagation and parameter update
                loss.backward()
                self.optimizer.step()
                del img, gt_dmap, est_dmap
                
            print("\t epoch:"+str(epoch)+"\n", "\t\t loss:",
                  epoch_loss/len(train_dataloader))
            train_loss_list.append(epoch_loss/len(train_dataloader))      

           

                # Set the Model on validation mode
            self.eval()
            MAE = 0
            MSE = 0
            for i, (img, gt_dmap) in enumerate(test_dataloader):
                img = img.to(device)
                gt_dmap = gt_dmap.to(device)
                # Forward propagation; on CUDA OOM, clear the cache and retry once
                try:
                    est_dmap = self(img)
                except RuntimeError as e:
                    if 'out of memory' in str(e):
                        torch.cuda.empty_cache()
                        est_dmap = self(img)
                    else:
                        raise

                if est_dmap.size() != gt_dmap.size():
                    est_dmap = F.interpolate(est_dmap, size=(
                        gt_dmap.size()[2], gt_dmap.size()[3]), mode='bilinear')
                # Accumulate absolute and squared count errors
                mae = abs(est_dmap.data.sum() - gt_dmap.data.sum()).item()
                MAE += mae
                MSE += mae ** 2
                del img, gt_dmap, est_dmap
                torch.cuda.empty_cache()
            MAE = MAE / len(test_dataloader)
            # Root mean squared error (avoids the deprecated np.math alias)
            MSE = (MSE / len(test_dataloader)) ** 0.5

            if MAE < self.min_MAE:
                self.min_MAE = MAE
                self.min_epoch = epoch
            test_error_list.append(MAE)
            print("\t\t error:"+str(MAE)+" min_MAE:" +
                  str(self.min_MAE)+" min_epoch:"+str(self.min_epoch))

            end = time.time()
            check_point = {
                'model_state_dict': self.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': epoch_loss / len(train_dataloader),
                'mae': MAE,
                'min_MAE': self.min_MAE,
                'min_epoch': self.min_epoch,
                'duration': str(datetime.timedelta(seconds=end - start))
            }
            # Save checkpoint
            self.save_checkpoint(check_point, os.path.join(
                self.checkpoints_dir, 'epoch_' + str(epoch) + '.pth'))

        # Save the training summary to disk
        self.make_summary(finished=True)

        print('Training finished.')
        return (train_loss_list, test_error_list, self.min_epoch, self.min_MAE)
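
A hedged usage sketch. The TrainParams attributes (device, optimizer, criterion, maxEpochs) are the ones train_model reads above; how TrainParams is actually constructed, and the model and dataloader objects, are assumptions:

import torch

# Assumed: `model` is an instance of the class above, and the
# train/test dataloaders yield (image, ground-truth density map) pairs
params = TrainParams()  # construction signature is an assumption
params.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
params.optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
params.criterion = torch.nn.MSELoss()
params.maxEpochs = 100

losses, errors, best_epoch, best_mae = model.train_model(
    train_dataloader, test_dataloader, params, resume=True)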
Example #10
def get_addons(repo):
    '''Returns a list of addon objects for a given repo object.'''
    addon_paths = list_dirs(repo.path)
    return [Addon(repo, addon_path) for addon_path in addon_paths]
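
A quick usage sketch (the repo object and Addon class are defined elsewhere in the project):

# Hypothetical: `repo` has a .path attribute pointing at a directory
# whose immediate subdirectories are the add-ons
for addon in get_addons(repo):
    print(addon)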
Example #11
    # Process directories
    seqdirs = []
    for d in args:
        print("Processing %s..." % d)
        for year in years:
            print("- Year %s" % year)
            dirn = os.path.join(d, "%s" % year)
            if not os.path.isdir(dirn):
                continue
            for platform in platform_list:
                platform_dirn = os.path.join(dirn, platform)
                if not os.path.isdir(platform_dirn):
                    continue
                print("-- Platform %s" % platform)
                # Get all the directories for this year/platform combination
                for run in utils.list_dirs(platform_dirn):
                    print("--- Run %s" % run)
                    run_dir = os.path.join(platform_dirn, run)
                    if os.path.islink(run_dir):
                        logging.warning("%s: is link, ignoring" % run_dir)
                    else:
                        seqdir = SeqDataSizes(
                            run,
                            run_dir,
                            year=year,
                            platform=platform,
                            include_subdirs=options.include_subdirs)
                        seqdir.get_disk_usage()
                        seqdirs.append(seqdir)

    # Calculate totals for each year
    usage = dict()
    for year in years:
        usage[year] = dict()