Example #1
def folderRN(gspath, newpath, cores=1):
    """
  """
    lis = lsFiles([gspath])
    if lis != 0:
        h.parrun(['gsutil -m mv ' + val + " " + newpath for val in lis],
                 cores=cores)
    else:
        raise ValueError('no such folder')
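A minimal usage sketch (the bucket paths are hypothetical; lsFiles and the h helper module come from the surrounding package):

# move everything under one GCS folder prefix to another, 8 moves at a time
folderRN('gs://my-bucket/old_folder/', 'gs://my-bucket/new_folder/', cores=8)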
Example #2
def recoverFiles(files, cores=1):
    """
  recover, in parallel, a set of files that were erased from a versioned GCS bucket

  each path needs its generation #id appended (found with `gsutil ls -al <file>`)

  Args:
  ----
      files: list[str] gs:// paths with their generation #id appended
      cores: int number of processes over which to parallelize the moves
  """
    cmd = ['gsutil mv ' + f + ' ' + f.split('#')[0] for f in files]
    h.parrun(cmd, cores=cores)
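A hedged usage sketch; the paths and generation numbers below are hypothetical and would come from `gsutil ls -al <file>` on a bucket with object versioning enabled:

# restore two deleted objects by moving their archived generations back into place
recoverFiles(['gs://my-bucket/sample1.bam#1612345678901234',
              'gs://my-bucket/sample2.bam#1612345678905678'], cores=2)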
Example #3
def patternRN(rename_dict,
              location,
              wildcards,
              types=[],
              dryrun=True,
              check_dependencies=True,
              cores=1):
    """
  rename/move a set of GCS objects found under a given location

  Args:
  -----
      rename_dict: dict{prevName: newName} mapping of names to replace
      location: str the gs:// prefix under which to look for the objects
      wildcards: list[str] containing any of ['**', '.*', '*.', '-.*']:
                  '**' matches the name in any subfolder, so every occurrence gets renamed
                  '.*' matches any suffix after the name: a.bam, [a]da.bai are renamed to b.bam, [b]da.bai
                  '*.' matches any prefix before the name (the name is used as a suffix), changing that suffix from a to b
                  '-.*' matches any prefix and suffix; the whole file name is replaced with the new name
                        (keeping the extension): [a]dea.bam becomes b.bam
      types: not implemented yet
      dryrun: bool if True, only prints the gsutil commands instead of running them
      check_dependencies: bool if True, raises when a new name is also used as a previous name (circular rename)
      cores: int number of processes over which to parallelize the task
  """
    val = []
    for k, v in rename_dict.items():
        val.append(v)
        if k in val and check_dependencies:
            raise ValueError('circular dependency in the rename with key ' + k)
    for k, v in rename_dict.items():
        loc = location
        if '**' in wildcards:
            loc += '**/'
        if '*.' in wildcards or '-.*' in wildcards:
            loc += '*'
        loc += k
        if '.*' in wildcards or '-.*' in wildcards:
            loc += '*'
        res = os.popen('gsutil -m ls ' + loc).read().split('\n')[:-1]
        print('found ' + str(len(res)) + ' files to rename')
        if '-.*' in wildcards:
            cmd = [
                "gsutil mv " + val + " " + '/'.join(val.split('/')[:-1]) +
                '/' + v + '.' + '.'.join(val.split('/')[-1].split('.')[1:])
                for val in res
            ]
        else:
            cmd = ["gsutil mv " + val + " " + val.replace(k, v) for val in res]
        if dryrun:
            print(cmd)
        else:
            h.parrun(cmd, cores=cores)
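A dry-run sketch with a hypothetical bucket; with dryrun=True the gsutil commands are only printed, nothing is moved:

# preview renaming every occurrence of 'old_sample' to 'new_sample' under the prefix,
# in any subfolder ('**') and whatever the suffix ('.*')
patternRN({'old_sample': 'new_sample'},
          'gs://my-bucket/data/',
          wildcards=['**', '.*'],
          dryrun=True)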
Example #4
def deleteJob(workspaceid, subid, taskid, deleteCurrent=False, dryrun=True):
    """
  removes the files generated by a job on Terra

  Args:
  -----
    workspaceid: str the workspace name
    subid: str the submission id of the job
    taskid: str the name of the task in this job
    deleteCurrent: bool if True, delete everything under the task folder, even files that appear in one of the
      sample/pair/sample_set data tables; if False, files referenced in those tables are kept
    dryrun: bool just print the commands/paths but don't execute anything
  """
    wm = dm.WorkspaceManager(workspaceid)
    bucket = wm.get_bucket_id()
    data = []
    if deleteCurrent:
        if dryrun:
            print('gsutil -m rm gs://' + bucket + '/' + subid + '/*/' +
                  taskid + '/**')
        else:
            res = subprocess.run('gsutil -m rm gs://' + bucket + '/' + subid +
                                 '/*/' + taskid + '/**',
                                 shell=True,
                                 capture_output=True)
            if res.returncode != 0:
                raise ValueError(str(res.stderr))
    else:
        res = subprocess.run('gsutil -m ls gs://' + bucket + '/' + subid +
                             '/*/' + taskid + '/**',
                             shell=True,
                             capture_output=True)
        if res.returncode != 0 or len(str(res.stdout)) < 4:
            raise ValueError(str(res.stderr))
        data += str(res.stdout)[2:-1].split('\\n')[:-1]
        if "TOTAL:" in data[-1]:
            data = data[:-1]
        sam = pd.concat(
            [wm.get_samples(),
             wm.get_pairs(),
             wm.get_sample_sets()])
        tokeep = set([
            val for val in sam.values.ravel()
            if type(val) is str and val[:5] == 'gs://'
        ])
        torm = set(data) - tokeep
        if dryrun:
            print(torm)
        else:
            h.parrun(['gsutil rm ' + i for i in torm], cores=12)
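A hedged sketch with hypothetical workspace, submission and task ids; with dryrun=True the paths that would be removed are only printed:

# preview deleting this task's outputs while keeping files still referenced
# in the workspace's sample/pair/sample_set tables
deleteJob('my-namespace/my-workspace',
          'b1234567-89ab-cdef-0123-456789abcdef',
          'my_task',
          deleteCurrent=False,
          dryrun=True)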
Example #5
async def indexBams(bucketpath, cores=4):
    """
    given a bucket path, will index all .bam files without an associated index and return their paths
    """
    files = gcp.lsFiles([bucketpath])
    bams = [val for val in files if '.bam' in val[-4:]]
    unindexed = [
        val for val in bams
        if val[:-4] + '.bai' not in files and val[:-4] + '.bam.bai' not in files
    ]
    print("found " + str(len(unindexed)) + " files to reindex")
    h.parrun([
        "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && samtools index "
        + val for val in unindexed
    ], cores)
    return {val: val[:-4] + ".bam.bai" for val in unindexed}
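Since indexBams is a coroutine, a minimal sketch would drive it with asyncio (the bucket path is hypothetical, and samtools plus gsutil credentials are assumed to be available):

import asyncio

# index every un-indexed .bam under the prefix and get back a {bam_path: bai_path} dict
bai_paths = asyncio.run(indexBams('gs://my-bucket/bams/', cores=4))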
Example #6
def changeGSlocation(workspacefrom,
                     newgs,
                     workspaceto=None,
                     prevgslist=[],
                     index_func=None,
                     flag_non_matching=False,
                     onlysamples=[],
                     onlycol=[],
                     entity='samples',
                     droplists=True,
                     keeppath=True,
                     dry_run=True,
                     par=20):
    """
  Function to move data around from one workspace to a bucket or to another workspace.

  can also work on dataframes containing lists of paths

  Args:
  -----
    workspacefrom: the workspace name where the data is
    newgs: the new gs:// bucket to copy the data into
    workspaceto: if we should have these new samples and columns added to another workspace instead \
    of just updating the same one (useful to copy one workspace to another)
    prevgslist: if provided, will only move files that are in the set of google buckets listed here
    index_func: *WIP* unused
    flag_non_matching: if set to true and prevgslist is set to some value, will return a list of samples that were not
    matched to anything in the prevgslist
    onlysamples: if provided, only process the entities whose ids are in this list
    onlycol: do this only on a subset of columns in the terra workspace
    entity: the entity in the terra workspace on which to do this
    droplists: if set to true, remove all columns containing lists of paths (lists of paths are not uploaded well to terra)
    keeppath: if set to true, will keep the full object path and just change the bucket
    dry_run: if set to true, will not update anything on Terra but just return the result
    par: int on how many processes to run the gsutil copy commands

  Returns:
  -------
    torename: the pandas.df containing the new paths
    flaglist: the samples that were non matching (if flag_non_matching is set to true)
  """
    flaglist = []
    wmfrom = dm.WorkspaceManager(workspacefrom)
    a = wmfrom.get_entities(entity)
    if len(onlysamples) > 0:
        a = a[a.index.isin(onlysamples)]
    print("using the data from " + workspacefrom + " " + entity + " list")
    if len(a) == 0:
        raise ValueError('no ' + entity)
    if onlycol:
        a = a[onlycol]
    todrop = set()
    torename = {}
    print(
        'this should only contain gs:// paths, otherwise specify the columns to use with "onlycol"'
    )
    for col in a.columns.tolist():
        val = []
        for k, prev in a[col].items():
            if type(prev) is str:
                new = prev
                if newgs not in new:
                    if len(prevgslist) > 0:
                        for prevgs in prevgslist:
                            new = new.replace(prevgs, newgs)
                        if flag_non_matching:
                            if new == prev:
                                flaglist.append(prev)
                    if not keeppath:
                        new = newgs + new.split('/')[-1]
                    else:
                        new = newgs + '/'.join(new.split('/')[3:])
                else:
                    print("sample " + str(k) + " was already in the new gs")
                val.append(new)
        # IN CASE WE HAVE A LIST
            if type(prev) is list:
                if droplists:
                    todrop.add(k)
                    continue
                ind = []
                for prevname in prev:
                    newname = prevname
                    if newgs not in newname:
                        if len(prevgslist) > 0:
                            for prevgs in prevgslist:
                                newname = newname.replace(prevgs, newgs)
                            if flag_non_matching:
                                if newname == prevname:
                                    flaglist.append(prevname)
                        if not keeppath:
                            newname = newgs + newname.split('/')[-1]
                        else:
                            newname = newgs + '/'.join(newname.split('/')[3:])
                    else:
                        print("sample " + str(k) +
                              " was already in the new gs")
                    ind.append(newname)
                val.append(ind)
        torename.update({col: val})
        if not dry_run:
            if keeppath:
                h.parrun([
                    'gsutil mv ' + a.iloc[i][col] + ' ' + v
                    for i, v in enumerate(val)
                ],
                         cores=par)
            else:
                gcp.mvFiles(a[col].tolist(), newgs)
        else:
            if keeppath:
                print([
                    'gsutil mv ' + a.iloc[i][col] + ' ' + v
                    for i, v in enumerate(val)
                ])
            else:
                print("mv " + str(a[col].tolist()) + " " + newgs)
    torename = pd.DataFrame(data=torename,
                            index=[i for i in a.index.tolist() if i != 'nan'])
    if workspaceto is not None:
        wmto = dm.WorkspaceManager(workspaceto)
        if not dry_run:
            wmto.disable_hound().update_entity_attributes(entity, torename)
    return torename, flaglist
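A dry-run sketch with hypothetical workspace and bucket names; nothing is copied and Terra is not updated while dry_run=True:

# preview moving every sample path from the old buckets into gs://new-bucket/
torename, flagged = changeGSlocation('my-namespace/my-workspace',
                                     newgs='gs://new-bucket/',
                                     prevgslist=['gs://old-bucket-1/', 'gs://old-bucket-2/'],
                                     flag_non_matching=True,
                                     entity='samples',
                                     dry_run=True)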
Example #7
async def getSpikeInControlScales(refgenome, fastq=None, fastQfolder='', mapper='bwa', pairedEnd=False, cores=1,
                            pathtosam='samtools', pathtotrim_galore='trim_galore', pathtobwa='bwa',
                            totrim=True, tomap=True, tofilter=True, results='res/', toremove=False):
  """
  Will extract the spike-in controls from fastq files (useful for, say, ChIP-seq data with spike-ins)

  Count-based sequencing data is not absolute: each sample is sequenced at a specific depth, so it has to be normalized.
  To figure out the actual sample concentration, we use the spike-in control.
  You should have fastQfolder/[NAME].fastq & BigWigFolder/[NAME].bw with NAME being the same for the same samples


  Args:
  -----
    refgenome: str the file path to the indexed reference genome
    fastq: list[str]|str fastq file path(s) to use when fastQfolder is not given
    fastQfolder: str the folder path where the fastq files are stored (should be named the same as files in BigWigFolder)
    mapper: str which mapper to use ('bwa' is the only one handled for now)
    pairedEnd: bool set to True for paired-end sequences. if True, you should have fastQfolder/[NAME]_1|2.fastq

  Returns:
  --------
    dict(file,float) the scaling factor dict

  """
  if len(fastQfolder) > 0:
    print('using all files from folder')
    fastqs = os.listdir(fastQfolder)
    fastqs = [i for i in fastqs if '.fq.gz' ==
              i[-6:] or '.fastq.gz' == i[-9:]]
    fastqs.sort()
    if pairedEnd and (tomap or totrim):
      print("need to be name_*1, name_*2")
      fastqs = [i for i in h.grouped(fastqs, 2)]
  elif fastq is None:
    raise ValueError('you need input files')
  else:
    if type(fastq) is list:
      print('your files need to be all in the same folder')
      fastQfolder = '/'.join(fastq[0].split('/')[:-1]) + '/'
      if not totrim and not tomap:
        fastqs = [f.split('/')[-1] for f in fastq]
      else:
        print("need to be name_*1, name_*2")
        fastqs = [[f[0].split('/')[-1], f[1].split('/')[-1]]
                  for f in h.grouped(fastq, 2)]
    else:
      fastQfolder = '/'.join(fastq.split('/')[:-1]) + '/'
      fastqs = [fastq.split('/')[-1]]
  print(fastqs)
  if not totrim:
    print("you need to have your files in the " + results + " folder")
  if totrim and tomap:
    print("\n\ntrimming\n\n")
    if pairedEnd:
      cmds = []
      rm = []
      for file in fastqs:
        cmd = pathtotrim_galore + ' --paired --fastqc --gzip ' + fastQfolder + \
          file[0] + ' ' + fastQfolder + file[1] + " -o " + results
        if toremove:
          rm.append('rm ' + fastQfolder +
                    file[0] + ' ' + fastQfolder + file[1])
        cmds.append(cmd)
      print(cmds)
      h.parrun(cmds, cores, add=rm)
      fastqs = [[file[0].split('.')[0] + '_val_1.fq.gz',
                 file[1].split('.')[0] + '_val_2.fq.gz'] for file in fastqs]
  if tomap:
    print("\n\nmapping\n\n")
    if pairedEnd:
      cmds = []
      rm = []
      for file in fastqs:
        cmd = pathtobwa + ' mem ' + refgenome + ' ' + results + file[0] + ' ' + results +\
          file[1] + ' | ' + pathtosam + ' sort - -o ' + \
          results + file[0].split('.')[0] + '.sorted.bam'
        if toremove:
          rm.append('rm ' + results +
                    file[0] + ' ' + results + file[1])
        cmds.append(cmd)
      h.parrun(cmds, cores, add=rm)
      fastqs = [file[0].split('.')[0] + '.sorted.bam' for file in fastqs]

  if tofilter:
    print("\n\nfiltering\n\n")
    cmds = []
    rm = []
    h.parrun([pathtosam + ' index ' + results + file.split('.')[0] + '.sorted.bam'
              for file in fastqs], cores)
    h.parrun([pathtosam + ' flagstat ' + results + file.split('.')[0] + '.sorted.bam > ' +
              results + file.split('.')[0] + '.sorted.bam.flagstat' for file in fastqs], cores)
    h.parrun([pathtosam + ' idxstats ' + results + file.split('.')[0] + '.sorted.bam > ' +
              results + file.split('.')[0] + '.sorted.bam.idxstat' for file in fastqs], cores)
    fastqs = [file.split('.')[0] + '.sorted.bam' for file in fastqs]
  else:
    print("files need to be named: NAME.sorted.bam")
    fastqs = [file for file in fastqs if '.sorted.bam' == file[-11:]]
  mapped = {}
  norm = {}
  unique_mapped = {}
  print("\n\ncounting\n\n")
  for file in fastqs:
    mapped[file.split('.')[0]] = int(os.popen(pathtosam + ' view -c -F 0x004 -F 0x0008 -f 0x001 -F 0x0400 -q 1 ' + results +
                                              file + ' -@ ' + str(cores)).read().split('\n')[0])
    # unique_mapped[file.split('.')[0]] = int(re.findall("Mapped reads: (\d+)", os.popen('bamtools stats -in '+results +
    #                                             file + '.sorted.bam').read())[0])
  nbmapped = np.array([i for i in mapped.values()])
  nbmapped = np.sort(nbmapped)[0] / nbmapped.astype(float)
  for i, val in enumerate(mapped.keys()):
    norm[val] = nbmapped[i]
  return norm, mapped,  # unique_mapped
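As with indexBams, this coroutine can be driven with asyncio; a hedged sketch assuming hypothetical paths, paired-end fastqs named NAME_*1/NAME_*2, and trim_galore, bwa and samtools available on the PATH:

import asyncio

# compute per-sample scaling factors from reads mapped to the spike-in reference
norm, mapped = asyncio.run(getSpikeInControlScales(
    'data/spikein_genome.fa',    # hypothetical bwa-indexed spike-in reference
    fastQfolder='data/fastqs/',
    pairedEnd=True,
    cores=8,
    results='res/'))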