def folderRN(gspath, newpath, cores=1):
    """rename/move a GCS folder by moving every object under gspath to newpath"""
    lis = lsFiles([gspath])
    if lis:
        h.parrun(['gsutil -m mv ' + val + " " + newpath for val in lis], cores=cores)
    else:
        raise ValueError('no such folder')
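# Usage sketch (hypothetical bucket paths; assumes the `lsFiles` and `h.parrun`
# helpers imported at the top of this module):
#   folderRN('gs://my-bucket/old_folder/', 'gs://my-bucket/new_folder/', cores=4)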
def recoverFiles(files, cores=1):
    """
    recover, in parallel, a set of GCS files that were erased

    the files need to have their generation #id appended, as found with `gsutil ls -al <file>`

    Args:
    ----
      files: list[str] gs:// paths with the generation number appended (gs://bucket/file#123456)
      cores: int number of processes over which to parallelize the gsutil calls
    """
    cmd = ['gsutil mv ' + f + ' ' + f.split('#')[0] for f in files]
    h.parrun(cmd, cores=cores)
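# Usage sketch (hypothetical paths and generation numbers, as shown by `gsutil ls -al`):
#   recoverFiles(['gs://my-bucket/sample1.bam#1612345678901234',
#                 'gs://my-bucket/sample1.bam.bai#1612345678905678'], cores=2)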
def patternRN(rename_dict, location, wildcards, types=[], dryrun=True,
              check_dependencies=True, cores=1):
    """
    rename/move a set of GCS objects found under a given location

    Args:
    -----
      rename_dict: dict(prevName: newName) name fragments to rename
      location: str the gs:// path under which to look for the objects
      wildcards: list[str] any of ['**', '.*', '*.', '-.*']:
        '**' any occurrence of the file in any (sub)folder will have its name changed
        '.*' all files regardless of suffix will be renamed (a.bam, [a]da.bai -> b.bam, [b]da.bai)
        '*.' all files with this suffix will have their suffix changed from a to b
        '-.*' all files regardless of suffix will be renamed, replacing the full file name
          rather than just the matching part ([a]dea.bam -> b.bam)
      types: list[str] unused for now
      dryrun: bool if True, just prints the commands without running them
      check_dependencies: bool if True, raises on circular renames (a key that is also a value)
      cores: int on how many processes to parallelize the gsutil calls
    """
    val = []
    for k, v in rename_dict.items():
        val.append(v)
        if k in val and check_dependencies:
            raise ValueError('circular dependency in the rename with key ' + k)
    for k, v in rename_dict.items():
        loc = location
        if '**' in wildcards:
            loc += '**/'
        if '*.' in wildcards or '-.*' in wildcards:
            loc += '*'
        loc += k
        if '.*' in wildcards or '-.*' in wildcards:
            loc += '*'
        res = os.popen('gsutil -m ls ' + loc).read().split('\n')[:-1]
        print('found ' + str(len(res)) + ' files to rename')
        if '-.*' in wildcards:
            # replace the full file name, keeping only the original extension(s)
            cmd = ["gsutil mv " + val + " " + '/'.join(val.split('/')[:-1]) + '/' + v + '.' +
                   '.'.join(val.split('/')[-1].split('.')[1:]) for val in res]
        else:
            cmd = ["gsutil mv " + val + " " + val.replace(k, v) for val in res]
        if dryrun:
            print(cmd)
        else:
            h.parrun(cmd, cores=cores)
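# Usage sketch (hypothetical names): rename every occurrence of "old_sample" to "new_sample"
# anywhere under the bucket prefix, whatever the file suffix, printing the commands first:
#   patternRN({'old_sample': 'new_sample'}, 'gs://my-bucket/', ['**', '.*'], dryrun=True)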
def deleteJob(workspaceid, subid, taskid, deleteCurrent=False, dryrun=True):
    """
    removes files generated by a job on Terra

    Args:
    -----
      workspaceid: str the workspace name
      subid: str the submission id of the job
      taskid: str the name of the task in this job
      deleteCurrent: bool whether or not to delete files even if they appear in one of the
        sample/sample_set/pair data tables
      dryrun: bool just print the commands but don't execute them
    """
    wm = dm.WorkspaceManager(workspaceid)
    bucket = wm.get_bucket_id()
    data = []
    if deleteCurrent:
        if dryrun:
            print('gsutil -m rm gs://' + bucket + '/' + subid + '/*/' + taskid + '/**')
        else:
            res = subprocess.run('gsutil -m rm gs://' + bucket + '/' + subid + '/*/' + taskid + '/**',
                                 shell=True, capture_output=True)
            if res.returncode != 0:
                raise ValueError(str(res.stderr))
    else:
        res = subprocess.run('gsutil -m ls gs://' + bucket + '/' + subid + '/*/' + taskid + '/**',
                             shell=True, capture_output=True)
        if res.returncode != 0 or len(str(res.stdout)) < 4:
            raise ValueError(str(res.stderr))
        data += str(res.stdout)[2:-1].split('\\n')[:-1]
        if "TOTAL:" in data[-1]:
            data = data[:-1]
        # keep any path still referenced in the samples / pairs / sample_sets data tables
        sam = pd.concat([wm.get_samples(), wm.get_pairs(), wm.get_sample_sets()])
        tokeep = set([val for val in sam.values.ravel()
                      if type(val) is str and val[:5] == 'gs://'])
        torm = set(data) - tokeep
        if dryrun:
            print(torm)
        else:
            h.parrun(['gsutil rm ' + i for i in torm], cores=12)
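# Usage sketch (hypothetical workspace, submission id and task name): list what a Terra task
# produced, keep anything still referenced in the data tables, and print what would be removed:
#   deleteJob('my-namespace/my-workspace', '0f0e0d0c-1111-2222-3333-444444444444',
#             'my_task', dryrun=True)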
async def indexBams(bucketpath, cores=4):
    """
    given a bucket path, index all .bam files without an associated index (requires samtools)

    Args:
    -----
      bucketpath: str the gs:// path under which to look for bam files
      cores: int number of indexing processes to run in parallel

    Returns:
    --------
      dict(str, str) mapping each newly indexed bam path to its expected .bam.bai path
    """
    files = gcp.lsFiles([bucketpath])
    bams = [val for val in files if val.endswith('.bam')]
    unindexed = [val for val in bams
                 if val[:-4] + '.bai' not in files and val[:-4] + '.bam.bai' not in files]
    print("found " + str(len(unindexed)) + " files to reindex")
    h.parrun(["export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && "
              "samtools index " + val for val in unindexed], cores)
    return {val: val[:-4] + ".bam.bai" for val in unindexed}
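# Usage sketch (hypothetical bucket; assumes gcloud is authenticated and samtools is on the
# PATH; the coroutine must be awaited or run through asyncio):
#   import asyncio
#   bai_map = asyncio.run(indexBams('gs://my-bucket/bams/', cores=8))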
def changeGSlocation(workspacefrom, newgs, workspaceto=None, prevgslist=[], index_func=None,
                     flag_non_matching=False, onlysamples=[], onlycol=[], entity='samples',
                     droplists=True, keeppath=True, dry_run=True, par=20):
    """
    Function to move data around, from one workspace to a bucket or to another workspace.

    can also work on dataframe columns containing lists of paths

    Args:
    -----
      workspacefrom: the workspace name where the data is
      newgs: the new gs bucket where to copy the data
      workspaceto: if set, the new samples and columns are added to this other workspace instead
        of updating the same one (useful to copy one workspace into another)
      prevgslist: if provided, will only move files that are in this list of google buckets
      index_func: *WIP* unused
      flag_non_matching: if True and prevgslist is set, also returns a list of samples
        that did not match anything in prevgslist
      onlysamples: do this only on a subset of samples
      onlycol: do this only on a subset of columns in the terra workspace
      entity: the entity in the terra workspace on which to do this
      droplists: if True, remove all columns containing lists of paths
        (lists of paths are not uploaded well to terra)
      keeppath: if True, keep the full object path and only change the bucket
      dry_run: if True, do not update anything on Terra and just return the result
      par: on how many processes to run the gs copy commands

    Returns:
    -------
      torename: the pandas.DataFrame containing the new paths
      flaglist: the samples that were non matching (if flag_non_matching is set to True)
    """
    flaglist = []
    wmfrom = dm.WorkspaceManager(workspacefrom)
    a = wmfrom.get_entities(entity)
    if len(onlysamples) > 0:
        a = a[a.index.isin(onlysamples)]
    print("using the data from " + workspacefrom + " " + entity + " list")
    if len(a) == 0:
        raise ValueError('no ' + entity)
    if onlycol:
        a = a[onlycol]
    todrop = set()
    torename = {}
    print('this should only contain gs:// paths, otherwise restrict the columns using "onlycol"')
    for col in a.columns.tolist():
        val = []
        for k, prev in a[col].iteritems():
            if type(prev) is str:
                new = prev
                if newgs not in new:
                    if len(prevgslist) > 0:
                        for prevgs in prevgslist:
                            new = new.replace(prevgs, newgs)
                        if flag_non_matching:
                            if new == prev:
                                flaglist.append(prev)
                    if not keeppath:
                        new = newgs + new.split('/')[-1]
                    else:
                        new = newgs + '/'.join(new.split('/')[3:])
                else:
                    print("sample " + str(k) + " was already in the new gs")
                val.append(new)
            # IN CASE WE HAVE A LIST
            if type(prev) is list:
                if droplists:
                    todrop.add(k)
                    continue
                ind = []
                for prevname in prev:
                    newname = prevname
                    if newgs not in newname:
                        if len(prevgslist) > 0:
                            for prevgs in prevgslist:
                                newname = newname.replace(prevgs, newgs)
                            if flag_non_matching:
                                if newname == prevname:
                                    flaglist.append(prevname)
                        if not keeppath:
                            newname = newgs + newname.split('/')[-1]
                        else:
                            newname = newgs + '/'.join(newname.split('/')[3:])
                    else:
                        print("sample " + str(k) + " was already in the new gs")
                    ind.append(newname)
                val.append(ind)
        torename.update({col: val})
        if not dry_run:
            if keeppath:
                h.parrun(['gsutil mv ' + a.iloc[i][col] + ' ' + v
                          for i, v in enumerate(val)], cores=par)
            else:
                gcp.mvFiles(a[col].tolist(), newgs)
        else:
            if keeppath:
                print(['gsutil mv ' + a.iloc[i][col] + ' ' + v
                       for i, v in enumerate(val)])
            else:
                print("mv " + str(a[col].tolist()) + " " + newgs)
    torename = pd.DataFrame(data=torename,
                            index=[i for i in a.index.tolist() if i != 'nan'])
    if workspaceto is not None:
        wmto = dm.WorkspaceManager(workspaceto)
        if not dry_run:
            wmto.disable_hound().update_entity_attributes(entity, torename)
    return torename, flaglist
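# Usage sketch (hypothetical workspace and bucket names): dry-run a move of every gs:// path
# in the "samples" table of one workspace onto a new bucket, keeping the object paths:
#   torename, flagged = changeGSlocation('my-namespace/my-workspace',
#                                        newgs='gs://my-new-bucket/',
#                                        prevgslist=['gs://my-old-bucket/'],
#                                        flag_non_matching=True, dry_run=True)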
async def getSpikeInControlScales(refgenome, fastq=None, fastQfolder='', mapper='bwa',
                                  pairedEnd=False, cores=1, pathtosam='samtools',
                                  pathtotrim_galore='trim_galore', pathtobwa='bwa',
                                  totrim=True, tomap=True, tofilter=True, results='res/',
                                  toremove=False):
    """
    Extracts the spike-in controls from fastq files (useful for, say, ChIP-seq data with spike-ins).

    Count-based sequencing data is not absolute and will be normalized, as each sample is
    sequenced at a specific depth. To figure out the actual sample concentration, we use
    spike-in controls.

    You should have FastQfolder/[NAME].fastq & BigWigFolder/[NAME].bw, with NAME being the same
    for the same samples.

    Args:
    -----
      refgenome: str the file path to the indexed reference genome
      fastq: str|list[str] fastq file path(s), used when fastQfolder is not given
      fastQfolder: str the folder path where the fastq files are stored (should be named the
        same as files in BigWigFolder)
      mapper: str flag to 'bwa', ...
      pairedEnd: bool set to True for paired end sequences. if True, you should have
        FastQfolder/[NAME]_1|2.fastq
      cores: int number of processes over which to parallelize each step
      pathtosam, pathtotrim_galore, pathtobwa: str paths to the samtools, trim_galore and
        bwa executables
      totrim, tomap, tofilter: bool whether to run the trimming, mapping and filtering steps
      results: str the folder in which to write the outputs
      toremove: bool whether to remove the intermediate files after each step

    Returns:
    --------
      norm: dict(str, float) the scaling factor per sample
      mapped: dict(str, int) the number of mapped reads per sample
    """
    if len(fastQfolder) > 0:
        print('using all files from folder')
        fastqs = os.listdir(fastQfolder)
        fastqs = [i for i in fastqs if '.fq.gz' == i[-6:] or '.fastq.gz' == i[-9:]]
        fastqs.sort()
        if pairedEnd and (tomap or totrim):
            print("need to be name_*1, name_*2")
            fastqs = [i for i in h.grouped(fastqs, 2)]
    elif fastq is None:
        raise ValueError('you need input files')
    else:
        if type(fastq) is list:
            print('your files need to be all in the same folder')
            fastQfolder = '/'.join(fastq[0].split('/')[:-1]) + '/'
            if not totrim and not tomap:
                fastqs = [f.split('/')[-1] for f in fastq]
            else:
                print("need to be name_*1, name_*2")
                fastqs = [[f[0].split('/')[-1], f[1].split('/')[-1]]
                          for f in h.grouped(fastq, 2)]
        else:
            fastQfolder = '/'.join(fastq.split('/')[:-1]) + '/'
            fastqs = [fastq.split('/')[-1]]
    print(fastqs)
    if not totrim:
        print("you need to have your files in the " + results + " folder")
    if totrim and tomap:
        print("\n\ntrimming\n\n")
        if pairedEnd:
            cmds = []
            rm = []
            for file in fastqs:
                cmd = pathtotrim_galore + ' --paired --fastqc --gzip ' + fastQfolder + \
                    file[0] + ' ' + fastQfolder + file[1] + " -o " + results
                if toremove:
                    rm.append('rm ' + fastQfolder + file[0] + ' ' + fastQfolder + file[1])
                cmds.append(cmd)
            print(cmds)
            h.parrun(cmds, cores, add=rm)
            fastqs = [[file[0].split('.')[0] + '_val_1.fq.gz',
                       file[1].split('.')[0] + '_val_2.fq.gz'] for file in fastqs]
    if tomap:
        print("\n\nmapping\n\n")
        if pairedEnd:
            cmds = []
            rm = []
            for file in fastqs:
                cmd = pathtobwa + ' mem ' + refgenome + ' ' + results + file[0] + ' ' + results + \
                    file[1] + ' | ' + pathtosam + ' sort - -o ' + \
                    results + file[0].split('.')[0] + '.sorted.bam'
                if toremove:
                    rm.append('rm ' + results + file[0] + ' ' + results + file[1])
                cmds.append(cmd)
            h.parrun(cmds, cores, add=rm)
            fastqs = [file[0].split('.')[0] + '.sorted.bam' for file in fastqs]
    if tofilter:
        print("\n\nfiltering\n\n")
        cmds = []
        rm = []
        h.parrun([pathtosam + ' index ' + results + file.split('.')[0] + '.sorted.bam'
                  for file in fastqs], cores)
        h.parrun([pathtosam + ' flagstat ' + results + file.split('.')[0] + '.sorted.bam > ' +
                  results + file.split('.')[0] + '.sorted.bam.flagstat' for file in fastqs], cores)
        h.parrun([pathtosam + ' idxstats ' + results + file.split('.')[0] + '.sorted.bam > ' +
                  results + file.split('.')[0] + '.sorted.bam.idxstat' for file in fastqs], cores)
        fastqs = [file.split('.')[0] + '.sorted.bam' for file in fastqs]
    else:
        print("files need to be named: NAME.sorted.bam")
        fastqs = [file for file in fastqs if '.sorted.bam' == file[-11:]]
    mapped = {}
    norm = {}
    unique_mapped = {}
    print("\n\ncounting\n\n")
    for file in fastqs:
        mapped[file.split('.')[0]] = int(os.popen(
            pathtosam + ' view -c -F 0x004 -F 0x0008 -f 0x001 -F 0x0400 -q 1 ' + results + file +
            ' -@ ' + str(cores)).read().split('\n')[0])
        # unique_mapped[file.split('.')[0]] = int(re.findall("Mapped reads: (\d+)", os.popen(
        #     'bamtools stats -in ' + results + file + '.sorted.bam').read())[0])
    # scale every sample to the most shallowly sequenced one
    nbmapped = np.array([i for i in mapped.values()])
    nbmapped = np.sort(nbmapped)[0] / nbmapped.astype(float)
    for i, val in enumerate(mapped.keys()):
        norm[val] = nbmapped[i]
    return norm, mapped  # , unique_mapped
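# Usage sketch (hypothetical files; assumes bwa, samtools and trim_galore are on the PATH
# and that the coroutine is awaited from an async context):
#   norm, mapped = await getSpikeInControlScales('data/dm6.fa',
#                                                fastQfolder='data/fastqs/',
#                                                pairedEnd=True, cores=8, results='res/')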