def estimate_current_remote_usage(remote, username, r_dir, l_dir, fastq2rsem_ratio):
    """
    Estimate the space that has already been or will be consumed by
    rsem output on the remote, by walking through each GSM directory and
    summing their estimated usage.

    A GSM is ignored when its rsem.COMPLETE file is present in the remote
    file list, or when misc.is_empty_dir reports its directory as empty.

    mechanism: fetch the list of files in r_dir, pick the entries whose
    basename ends with a GSM id, map each of them to the corresponding
    directory under l_dir, and let estimate_rsem_usage size it
    (presumably from the local fastq.gz files -- confirm against
    estimate_rsem_usage).

    :param remote: remote host, passed through to fetch_remote_file_list
    :param username: remote user, passed through to fetch_remote_file_list
    :param r_dir: remote rsem output directory
    :param l_dir: local rsem output directory
    :param fastq2rsem_ratio: passed through to estimate_rsem_usage
    :return: the sum of estimate_rsem_usage over all counted GSMs
        (0 when none is counted)
    """
    files = fetch_remote_file_list(remote, username, r_dir)
    usage = 0
    for dir_ in sorted(files):
        # only entries whose basename ends with a GSM id are GSM directories
        if not re.search(r'(GSM\d+$)', os.path.basename(dir_)):
            continue

        rsem_comp = os.path.join(dir_, 'rsem.COMPLETE')
        # only count the disk spaces used by those GSMs that are being
        # processed: skip the finished ones and the still-empty ones
        if rsem_comp in files or misc.is_empty_dir(dir_, files):
            continue

        # Swap only the first occurrence of r_dir (the leading prefix);
        # replacing every occurrence would corrupt paths in which the
        # r_dir string happens to appear again further down.
        gsm_dir = dir_.replace(r_dir, l_dir, 1)
        usage += estimate_rsem_usage(gsm_dir, fastq2rsem_ratio)
    return usage
def test_is_empty_dir(self):
    """Check misc.is_empty_dir against flat listings of paths."""
    # '/p' has an entry beneath it in this listing -> expected non-empty
    populated_listing = ['/p', '/p/a.txt']
    result_populated = misc.is_empty_dir('/p', populated_listing)
    self.assertFalse(result_populated)

    # here only '/s' has an entry beneath it -> '/p' is expected empty
    sparse_listing = ['/p', '/s', '/s/a.txt']
    result_sparse = misc.is_empty_dir('/p', sparse_listing)
    self.assertTrue(result_sparse)