Example #1
0
def lsFiles(files, add='', group=50):
    """
    List a set of gs paths with `gsutil -m ls`, `group` paths per call.

    Args:
    ----
        files: sequence of str, gs paths (or patterns) to list
        add: str, additional params inserted right after `gsutil -m ls`.
            If it contains the letter 'L' (e.g. '-L') the output is treated
            as a long listing (see Returns).
        group: int, maximum number of paths passed to one gsutil call

    Returns:
    -------
        list of str.
        Without 'L' in `add`: one entry per line of gsutil output, with a
        trailing "TOTAL:" summary line removed.
        With 'L' in `add`: one entry per 'gs://' record, taken from the
        escaped textual form of the raw output (newlines appear as a
        backslash followed by 'n').

    Raises:
    ------
        ValueError: if gsutil exits non-zero for any reason other than
            "One or more URLs matched no objects".
    """
    print('listing files in gs')
    # Nothing to list: avoid asking the grouping helper for groups of size 0.
    if len(files) == 0:
        return []
    by = min(len(files), group)
    long_listing = 'L' in add
    res = []
    for sfiles in h.grouped(files, by):
        a = ''.join(val + ' ' for val in sfiles)
        data = subprocess.run("gsutil -m ls " + add + " " + a,
                              capture_output=True,
                              shell=True)
        if data.returncode != 0:
            # "matched no objects" is tolerated: some paths may simply not exist.
            if "One or more URLs matched no objects" not in str(data.stderr):
                raise ValueError('issue with the command: ' + str(data.stderr))
        if not data.stdout:
            # An empty group must not throw away what previous groups returned,
            # nor prevent the remaining groups from being listed.
            continue
        if long_listing:
            # NOTE(review): this keeps the historical format (repr of the bytes,
            # so the first entry starts with "gs://b'" and the last one ends
            # with a quote) because callers may parse this escaped text.
            # Confirm against callers before normalising it.
            res += ['gs://' + i for i in str(data.stdout).split('\\ngs://')]
        else:
            # Decode instead of slicing repr(bytes) so non-ASCII object names
            # are returned as real characters rather than \x escapes.
            text = data.stdout.decode('utf-8', errors='replace')
            # Drop the last element: the empty string after the final newline.
            res += text.split('\n')[:-1]
        if res and not long_listing and "TOTAL:" in res[-1]:
            res = res[:-1]
    return res
Example #2
0
def cpFiles(files, location, group=50):
    """
    Copy a set of files with `gsutil -m cp`, `group` files per call.

    Stops at the first gsutil call that returns a non-zero status.

    Args:
    ----
        files: gs paths to copy
        location: destination appended at the end of each gsutil command
        group: maximum number of files handed to one gsutil call
    """
    n_files = len(files)
    chunk_size = group if n_files >= group else n_files
    for batch in h.grouped(files, chunk_size):
        # Each source path is followed by a space so the destination can be
        # appended directly.
        sources = ''.join(path + ' ' for path in batch)
        status = os.system("gsutil -m cp " + sources + location)
        if status == 0:
            continue
        print('pressed ctrl+c or command failed')
        break
Example #3
0
def mvFiles(files, location, group=50, listen_to_errors=False):
    """
    Move a set of files with `gsutil -m mv`, `group` files per call.

    Args:
    ----
        files: gs paths to move
        location: destination appended at the end of each gsutil command
        group: maximum number of files handed to one gsutil call
        listen_to_errors: when True, stop at the first gsutil call that
            returns a non-zero status; when False, failures are ignored and
            the remaining groups are still processed
    """
    n_files = len(files)
    chunk_size = group if n_files >= group else n_files
    for batch in h.grouped(files, chunk_size):
        # Each source path is followed by a space so the destination can be
        # appended directly.
        sources = ''.join(path + ' ' for path in batch)
        status = os.system("gsutil -m mv " + sources + location)
        if listen_to_errors and status != 0:
            print('pressed ctrl+c or command failed')
            break
Example #4
0
def catFiles(files, group=50, split=False, cut=False):
    """
    Read the content of a set of gs files with `gsutil -m cat`.

    The files are processed `group` at a time. The content of each group is
    the escaped textual form of the raw bytes returned by gsutil (so a
    newline appears as a backslash followed by 'n').

    Args:
    ----
        files: sequence of str, gs paths to read
        group: int, maximum number of files handed to one gsutil call
        split: str or False, if set (and `cut` is not), split each group's
            text on this separator. It is applied to the escaped text, so to
            split on newlines pass a backslash followed by 'n'.
        cut: int or False, if set, slice each group's text into consecutive
            chunks of `cut` characters (the last chunk may be shorter).
            Takes precedence over `split`.

    Returns:
    -------
        list of str: the chunks of every group, in order. If gsutil fails for
        a reason other than "One or more URLs matched no objects", the error
        is printed and what was gathered so far is returned.
    """
    res = []
    # Nothing to read: avoid asking the grouping helper for groups of size 0.
    if len(files) == 0:
        return res
    by = min(len(files), group)
    for i, sfiles in enumerate(h.grouped(files, by)):
        # Rough progress indicator: fraction of groups already processed.
        print(i / (len(files) / by))
        a = ''.join(val + ' ' for val in sfiles)
        data = subprocess.run("gsutil -m cat " + a,
                              capture_output=True,
                              shell=True)
        if data.returncode != 0:
            if "One or more URLs matched no objects" not in str(data.stderr):
                # Best effort: report the problem and keep the partial result.
                print(ValueError('issue with the command: ' +
                                 str(data.stderr)))
                return res
        if not data.stdout:
            # An empty group must not discard the content already gathered
            # from previous groups, nor stop the remaining ones.
            continue
        # Strip the b'...' wrapper of the bytes repr, keeping escaped text.
        resa = str(data.stdout)[2:-1]
        if cut:
            # Step through the text so the trailing partial chunk is kept
            # instead of being silently dropped.
            res += [resa[start:start + cut]
                    for start in range(0, len(resa), cut)]
        elif split:
            res += resa.split(split)
        else:
            res += [resa]
    return res
Example #5
0
def rmFiles(files, group=50, add='', dryrun=True):
    """
    Remove a set of files with `gsutil -m rm`, `group` files per call.

    Args:
    ----
        files: sequence of str, gs paths to remove
        group: int, maximum number of files handed to one gsutil call
        add: str, additional gsutil rm params, inserted after `gsutil -m rm`
        dryrun: bool, when True (the default) only print the commands that
            would be run; when False actually run them and stop at the first
            command returning a non-zero status
    """
    # Nothing to remove: avoid asking the grouping helper for groups of size 0.
    if len(files) == 0:
        return
    by = min(len(files), group)
    # Build the invariant command prefix once; doing it inside the loop used
    # to prepend one more space to `add` for every group.
    prefix = "gsutil -m rm" + (' ' + add if add else '')
    for sfiles in h.grouped(files, by):
        cmd = prefix + ''.join(' ' + val for val in sfiles)
        if dryrun:
            print(cmd)
        else:
            code = os.system(cmd)
            if code != 0:
                print('pressed ctrl+c or command failed')
                break
Example #6
0
async def getSpikeInControlScales(refgenome, fastq=None, fastQfolder='', mapper='bwa', pairedEnd=False, cores=1,
                            pathtosam='samtools', pathtotrim_galore='trim_galore', pathtobwa='bwa',
                            totrim=True, tomap=True, tofilter=True, results='res/', toremove=False):
  """
  Compute per-sample scaling factors from spike-in control reads.

  Count based sequencing data is not absolute: each sample is sequenced at its
  own depth. Spike-in controls are used to estimate relative sample
  concentration. The function optionally trims the fastq files (trim_galore),
  maps them to `refgenome` (bwa mem piped into samtools sort), indexes the
  resulting bam files and writes flagstat/idxstats reports, then counts the
  reads kept by a `samtools view -c` filter in each bam. The scale of a sample
  is the smallest count across samples divided by that sample's count.

  Args:
  -----
    refgenome: str the file path to the indexed reference genome
    fastq: str or list[str] path(s) to the fastq file(s). Only used when
      fastQfolder is empty. A list is expected to hold files from one folder
      (the folder of the first element is used for all).
    fastQfolder: str folder to take input files from; only files ending in
      '.fq.gz' or '.fastq.gz' are used, in sorted order
    mapper: str not used by the current implementation
    pairedEnd: bool set to True for paired end sequences; consecutive files
      (after sorting) are then treated as the two mates of a pair
    cores: int passed to h.parrun and to samtools view (-@)
    pathtosam: str samtools executable
    pathtotrim_galore: str trim_galore executable
    pathtobwa: str bwa executable
    totrim: bool run the trimming step (only done when tomap and pairedEnd
      are also true)
    tomap: bool run the mapping step
    tofilter: bool run samtools index/flagstat/idxstats on the bam files;
      when False, only inputs named NAME.sorted.bam are kept for counting
    results: str folder (with trailing slash) where outputs are written and
      from where intermediate files are read
    toremove: bool when True, 'rm' commands for the inputs of the trimming
      and mapping steps are handed to h.parrun through its `add` argument

  Returns:
  --------
    tuple (norm, mapped):
      norm: dict(str, float) sample name -> scaling factor
      mapped: dict(str, int) sample name -> read count from samtools view -c

  Raises:
  -------
    ValueError: if neither fastQfolder nor fastq is provided
  """
  # ---- resolve the list of input files ----
  if len(fastQfolder) > 0:
    print('using all files from folder')
    fastqs = os.listdir(fastQfolder)
    fastqs = [i for i in fastqs if '.fq.gz' ==
              i[-6:] or '.fastq.gz' == i[-9:]]
    fastqs.sort()
    if pairedEnd and (tomap or totrim):
      print("need to be name_*1, name_*2")
      # relies on sorting to put the two mates of a pair next to each other
      fastqs = [i for i in h.grouped(fastqs, 2)]
  elif fastq is None:
    raise ValueError('you need input files')
  else:
    if type(fastq) is list:
      print('your files need to be all in the same folder')
      fastQfolder = '/'.join(fastq[0].split('/')[:-1]) + '/'
      if not totrim and not tomap:
        fastqs = [f.split('/')[-1] for f in fastq]
      else:
        # NOTE(review): files are paired here whenever trimming or mapping is
        # requested, regardless of pairedEnd — confirm this is intended.
        print("need to be name_*1, name_*2")
        fastqs = [[f[0].split('/')[-1], f[1].split('/')[-1]]
                  for f in h.grouped(fastq, 2)]
    else:
      fastQfolder = '/'.join(fastq.split('/')[:-1]) + '/'
      fastqs = [fastq.split('/')[-1]]
  print(fastqs)
  if not totrim:
    print("you need to have your files in the " + results + " folder")
  # ---- trimming (paired-end only, and only when mapping follows) ----
  if totrim and tomap:
    print("\n\ntrimming\n\n")
    if pairedEnd:
      cmds = []
      rm = []
      for file in fastqs:
        cmd = pathtotrim_galore + ' --paired --fastqc --gzip ' + fastQfolder + \
          file[0] + ' ' + fastQfolder + file[1] + " -o " + results
        if toremove:
          rm.append('rm ' + fastQfolder +
                    file[0] + ' ' + fastQfolder + file[1])
        cmds.append(cmd)
      print(cmds)
      h.parrun(cmds, cores, add=rm)
      # names of the trimmed outputs, built from the part of each file name
      # before its first '.'
      fastqs = [[file[0].split('.')[
        0] + '_val_1.fq.gz', file[1].split('.')[0] + '_val_2.fq.gz'] for file in fastqs]
  # ---- mapping ----
  if tomap:
    print("\n\nmapping\n\n")
    if pairedEnd:
      cmds = []
      rm = []
      for file in fastqs:
        cmd = pathtobwa + ' mem ' + refgenome + ' ' + results + file[0] + ' ' + results +\
          file[1] + ' | ' + pathtosam + ' sort - -o ' + \
          results + file[0].split('.')[0] + '.sorted.bam'
        if toremove:
          rm.append('rm ' + results +
                    file[0] + ' ' + results + file[1])
        cmds.append(cmd)
    # NOTE(review): cmds and rm are only assigned under `if pairedEnd`, so with
    # tomap=True and pairedEnd=False (the defaults) the next line looks like it
    # fails on an unbound local; the list below also indexes file[0], which
    # assumes each entry is a pair. Single-end mapping appears unimplemented.
    h.parrun(cmds, cores, add=rm)
    fastqs = [file[0].split('.')[0] + '.sorted.bam' for file in fastqs]

  # ---- bam indexing and statistics ----
  if tofilter:
    print("\n\nfiltering\n\n")
    cmds = []
    rm = []
    h.parrun([pathtosam + ' index ' + results + file.split('.')
            [0] + '.sorted.bam' for file in fastqs], cores)
    h.parrun([pathtosam + ' flagstat ' + results + file.split('.')[0] + '.sorted.bam > ' +
            results + file.split('.')[0] + '.sorted.bam.flagstat' for file in fastqs], cores)
    h.parrun([pathtosam + ' idxstats ' + results + file.split('.')[0] + '.sorted.bam > ' +
            results + file.split('.')[0] + '.sorted.bam.idxstat' for file in fastqs], cores)
    fastqs = [file.split('.')[0] + '.sorted.bam' for file in fastqs]
  else:
    print("files need to be named: NAME.sorted.bam")
    fastqs = [file for file in fastqs if '.sorted.bam' == file[-11:]]
  # ---- counting and scaling ----
  mapped = {}
  norm = {}
  unique_mapped = {}
  print("\n\ncounting\n\n")
  for file in fastqs:
    # first line of the samtools output is parsed as the read count
    mapped[file.split('.')[0]] = int(os.popen(pathtosam + ' view -c -F 0x004 -F 0x0008 -f 0x001 -F 0x0400 -q 1 ' + results +
                                              file + ' -@ ' + str(cores)).read().split('\n')[0])
    # unique_mapped[file.split('.')[0]] = int(re.findall("Mapped reads: (\d+)", os.popen('bamtools stats -in '+results +
    #                                             file + '.sorted.bam').read())[0])
  nbmapped = np.array([i for i in mapped.values()])
  # scale = smallest count / sample count, so the smallest sample gets 1.0
  nbmapped = np.sort(nbmapped)[0] / nbmapped.astype(float)
  # dict iteration order matches the order used to build nbmapped above
  for i, val in enumerate(mapped.keys()):
    norm[val] = nbmapped[i]
  return norm, mapped,  # unique_mapped