def rep_combine(db_fn, gathered_fn, group_size):
    """Combine the partial rep-track files into a single track.

    Symlinks the DB and every scattered .rep track-file into the CWD,
    then runs Catrack to concatenate them into one rep{group_size} track.

    Args:
        db_fn: path to the Dazzler DB file (e.g. raw_reads.db).
        gathered_fn: file whose first line is a bracketed, comma-separated
            list of done-file paths from the scattered rep jobs.
        group_size: the rep group-size; appears in the track-file names.
    """
    db = symlink_db(db_fn)  # e.g. "raw_reads"
    # The gathered file holds one line like "[path1, path2, ...]"; strip the
    # surrounding brackets and spaces, then split on commas.
    # NOTE(review): presumably this was serialized by a sibling task and
    # io_io.deserialize() would be the more robust reader -- confirm format.
    with open(gathered_fn) as stream:
        first_line = stream.readlines()[0][1:-1]
    gathered = first_line.replace(' ', '').split(',')
    LOG.info('Combining {} gathered rep jobs.'.format(len(gathered)))
    # Create symlinks for all track-files.
    for job in gathered:
        job_path = os.path.dirname(job)
        if not os.path.isabs(job_path):
            LOG.info('Found relative done-file: {!r}'.format(job_path))
        annos = glob.glob('{}/.{}.*.rep{}.anno'.format(job_path, db, group_size))
        datas = glob.glob('{}/.{}.*.rep{}.data'.format(job_path, db, group_size))
        # Every .anno must have a matching .data, or the track is corrupt.
        assert len(annos) == len(
            datas), 'Mismatched globs:\n{!r}\n{!r}'.format(annos, datas)
        for fn in annos + datas:
            symlink(fn, force=False)
    cmd = 'Catrack -vdf {} rep{}'.format(db, group_size)
    io_io.syscall(cmd)
def rep_apply(db_fn, script_fn):
    """Run one rep-masking script in the CWD.

    daligner would put track-files in the DB-directory, not '.',
    so we symlink the DB and the script here first.
    """
    symlink_db(db_fn)  # side effect: DB + hidden files symlinked into CWD
    symlink(script_fn)
    script_name = os.path.basename(script_fn)
    io_io.syscall('bash -vex {}'.format(script_name))
def merge_apply(las_paths_fn, las_fn):
    """Merge the las files into one, a few at a time.

    This replaces the logic of HPC.daligner.

    Args:
        las_paths_fn: serialized list of input .las paths.
        las_fn: file whose first line names the final merged .las.
    """
    with open(las_fn) as stream:
        las_name = stream.readlines()[0].strip()
    io_io.rm_force(las_name)
    LOG.info('Merging las files into {!r}'.format(las_name))
    all_las_paths = io_io.deserialize(las_paths_fn)
    # Create symlinks, so system calls will be shorter.
    all_syms = list()
    for fn in all_las_paths:
        symlink(fn)
        all_syms.append(os.path.basename(fn))
    curr_paths = sorted(all_syms)
    # Merge a few at-a-time, building a tree of intermediate L<level>.<i>.las
    # files until a single file remains.
    at_a_time = 250  # max is 252 for LAmerge
    level = 1
    while len(curr_paths) > 1:
        level += 1
        next_paths = list()
        for i, paths in enumerate(ichunked(curr_paths, at_a_time)):
            tmp_las = 'L{}.{}.las'.format(level, i + 1)
            cmd = 'LAmerge -v {} {}'.format(tmp_las, ' '.join(paths))
            io_io.syscall(cmd)
            next_paths.append(tmp_las)
        curr_paths = next_paths
    # Two-step rename; presumably guards against las_name colliding with
    # one of the inputs/intermediates -- confirm before simplifying.
    io_io.syscall('mv -f {} {}'.format(curr_paths[0], 'keep-this'))
    io_io.syscall('mv -f {} {}'.format('keep-this', las_name))
def _get_rep_daligner_split_scripts(REPmask_opt, db_fn, group_size, coverage_limit):
    """Run HPC.REPmask and return one bash script per daligner job.

    Args:
        REPmask_opt: option string passed through to HPC.REPmask.
        db_fn: path to the Dazzler DB file.
        group_size: rep group-size for HPC.REPmask.
        coverage_limit: coverage limit for HPC.REPmask.

    Returns:
        List of bash scripts (strings), each prefixed with 'set -uex' and
        suffixed with an LAcheck sanity-check of the produced .las files.
    """
    db = os.path.splitext(db_fn)[0]
    dbname = os.path.basename(db)
    tracks = get_tracks(db_fn)
    # First, run HPC.REPmask immediately.
    script = script_HPC_REPmask(
        REPmask_opt, db, tracks, prefix='rep-jobs',
        group_size=group_size, coverage_limit=coverage_limit)
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))
    # We now have files like rep-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    with open('rep-jobs.01.OVL') as stream:
        lines = stream.readlines()
    # Keep only the actual job lines (skip comments and blanks).
    scripts = [line for line in lines
               if line.strip() and not line.startswith('#')]
    # A single job means HPC.REPmask wrote a "moved" variant; see the
    # fake_rep_* helpers for why the two cases differ.
    if len(scripts) == 1:
        scripts = [fake_rep_as_daligner_script_moved(s, dbname) for s in scripts]
    else:
        scripts = [fake_rep_as_daligner_script_unmoved(s, dbname) for s in scripts]
    # The LAcheck command is identical for every job, so build it once.
    LAcheck = 'LAcheck -vS {} *.las'.format(db)
    return ['set -uex\n{}\n{}\n'.format(script, LAcheck) for script in scripts]
def daligner_split(daligner_opt, db_fn, length_cutoff_fn):
    """Run HPC.daligner and write one run-script per daligner job.

    Scripts are written to ./daligner-scripts/j_NNNN/run_daligner.sh.

    Args:
        daligner_opt: option string passed through to HPC.daligner.
        db_fn: path to the Dazzler DB file.
        length_cutoff_fn: file holding the length-cutoff value.
    """
    db = os.path.splitext(db_fn)[0]
    dbname = os.path.basename(db)
    tracks = get_tracks(db_fn)
    script = script_HPC_daligner(
        daligner_opt, db, length_cutoff_fn, tracks, prefix='daligner-jobs')
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))
    # We now have files like daligner-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    with open('daligner-jobs.01.OVL') as stream:
        lines = stream.readlines()
    preads_aln = (dbname == 'preads')
    xformer = functional.get_script_xformer(preads_aln)
    LOG.debug('preads_aln={!r} (True => use daligner_p)'.format(preads_aln))
    scripts = list()
    for line in lines:
        if line.startswith('#'):
            continue
        if not line.strip():
            continue
        line = xformer(line)  # Use daligner_p for preads.
        scripts.append(line)
    # Special case:
    #   # Daligner jobs (1)
    #   daligner raw_reads raw_reads && mv raw_reads.raw_reads.las raw_reads.las
    # In that case, the "block" name is empty. (See functional.py)
    # We will rename the file.
    # (LAmerge on a single input is a no-op, which is fine.)
    if len(scripts) == 1:
        script = scripts[0]
        re_script = re.compile(
            r'(mv\b.*\S+\s+)(\S+)$')  # no trailing newline, for now
        mo = re_script.search(script)
        if not mo:
            msg = 'Only 1 line in daligner-jobs.01.OVL, but\n {!r} did not match\n {!r}.'.format(
                re_script.pattern, script)
            LOG.warning(msg)
        else:
            new_script = re_script.sub(
                r'\1{dbname}.1.{dbname}.1.las'.format(dbname=dbname), script, 1)
            msg = 'Only 1 line in daligner-jobs.01.OVL:\n {!r} matches\n {!r}. Replacing with\n {!r}.'.format(
                re_script.pattern, script, new_script)
            LOG.warning(msg)
            scripts = [new_script]
    # The LAcheck command is identical for every job; build it once and
    # write each finished script to its own job directory.
    LAcheck = 'LAcheck -vS {} *.las'.format(db)
    for i, script in enumerate(scripts):
        full_script = 'set -vex\n{}\n{}\n'.format(script, LAcheck)
        job_id = 'j_{:04d}'.format(i)
        script_dir = os.path.join('.', 'daligner-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_daligner.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write(full_script)
def tan_split(tanmask_opt, db_fn, uows_fn, bash_template_fn):
    """Run HPC.TANmask and write one run-script per datander job.

    Scripts are written to ./tan-scripts/tan_NNN/run_datander.sh.

    Args:
        tanmask_opt: option string passed through to HPC.TANmask.
        db_fn: path to the Dazzler DB file.
        uows_fn: unused here -- presumably kept for interface parity; confirm.
        bash_template_fn: path to write the snakemake-style bash template.
    """
    with open(bash_template_fn, 'w') as stream:
        stream.write(
            "python -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} tan-split --split={output.split} --bash-template={output.bash_template}"
        )
    # TANmask would put track-files in the DB-directory, not '.',
    # so we need to symlink everything first.
    db = symlink_db(db_fn)
    script = script_HPC_TANmask(tanmask_opt, db, prefix='tan-jobs')
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))
    # We now have files like tan-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    with open('tan-jobs.01.OVL') as stream:
        lines = stream.readlines()
    # re.escape() so a DB name containing regex metacharacters cannot
    # corrupt the block-number pattern.
    re_block = re.compile(r'{}(\.\d+|)'.format(re.escape(db)))

    def get_blocks(line):
        """Return ['.1', '.2', ...]
        """
        return [mo.group(1) for mo in re_block.finditer(line)]
    scripts = list()
    for line in lines:
        if line.startswith('#'):
            continue
        if not line.strip():
            continue
        blocks = get_blocks(line)
        assert blocks, 'No blocks found in {!r} from {!r}'.format(
            line, 'tan-jobs.01.OVL')
        las_files = ' '.join('TAN.{db}{block}.las'.format(db=db, block=block)
                             for block in blocks)
        script_lines = [
            line,
            'LAcheck {} {}\n'.format(db, las_files),
            'TANmask {} {}\n'.format(db, las_files),
            'rm -f {}\n'.format(las_files),
        ]
        if [''] == blocks:
            # special case -- If we have only 1 block, then HPC.TANmask fails to use the block-number.
            # However, if there are multiple blocks, it is still possible for a single line to have
            # only 1 block. So we look for a solitary block that is '', and we symlink the .las to pretend
            # that it was named properly in the first place.
            script_lines.append(
                'mv .{db}.tan.data .{db}.1.tan.data\n'.format(db=db))
            script_lines.append(
                'mv .{db}.tan.anno .{db}.1.tan.anno\n'.format(db=db))
        scripts.append(''.join(script_lines))
    db_dir = os.path.dirname(db_fn)
    for i, script in enumerate(scripts):
        # NOTE(review): db_prefix is hard-coded to "raw_reads"; presumably
        # tan-masking only ever runs on the raw-read DB -- confirm before
        # reusing this for another DB.
        bash_script = """
db_dir={db_dir}
ln -sf ${{db_dir}}/.{db_prefix}.bps .
ln -sf ${{db_dir}}/.{db_prefix}.idx .
ln -sf ${{db_dir}}/{db_prefix}.db .
ln -sf ${{db_dir}}/.{db_prefix}.dust.anno .
ln -sf ${{db_dir}}/.{db_prefix}.dust.data .
{script}
""".format(db_dir=db_dir, db_prefix="raw_reads", script=script)
        job_id = 'tan_{:03d}'.format(i)
        script_dir = os.path.join('.', 'tan-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_datander.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write('{}\n'.format(bash_script))