Example #1
def extractFromWorkspace(samples,
                         stype,
                         recomputeTime=True,
                         recomputesize=True,
                         recomputedate=True,
                         recomputehash=True,
                         extract={}):
    """
  Extract more information from a list of samples found on GP workspaces

  Args:
  -----
    samples: pd dataframe of samples with at least arxspan ids and sizes
    stype: str sequencing type
    recomputeTime: bool whether to recompute the date of upload of the bam file
    recomputesize: bool whether to recompute the size of the bam file
    recomputedate: bool whether to recompute the release date of the bam file
    recomputehash: bool whether to recompute the hash of the bam file
    extract: if you want to specify what values should refer to which column names
      dict{
      'name':
      'bai':
      'bam':
      'source':
      'from_arxspan_id':
      ...} (see extract_defaults)

  Returns:
  --------
    samples: pd dataframe the filtered sample list
  """
    extract.update(extract_defaults)
    if extract['legacy_hash'] not in samples.columns or recomputehash:
        samples[extract['hash']] = [
            gcp.extractHash(val)
            for val in gcp.lsFiles(samples[extract["bam"]].tolist(), "-L", 200)
        ]
    lis = gcp.lsFiles(samples[extract['bam']].tolist(), '-al', 200)
    if extract['legacy_size'] not in samples.columns or recomputesize:
        samples[extract['legacy_size']] = [gcp.extractSize(i)[1] for i in lis]
    if extract['update_time'] not in samples.columns or recomputeTime:
        samples[extract['update_time']] = [gcp.extractTime(i) for i in lis]
    todrop = []
    for k, val in samples.iterrows():
        if val[extract['legacy_size']] < MINSIZES[stype]:
            todrop.append(k)
            print("too small size, removing sample: " +
                  str(val[extract["from_arxspan_id"]]))
    samples = samples.drop(index=todrop)
    # getting the date released
    if len(samples) == 0:
        return None
    if extract['release_date'] not in samples.columns or recomputedate:
        samples[extract["release_date"]] = seq.getBamDate(
            samples[extract["bam"]])
    samples[extract['release_date']] = list(
        h.datetoint(samples[extract['release_date']].values))
    return samples
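
A minimal usage sketch (the dalmatian workspace name and the "wgs" sequencing type below are placeholders; the sample table is assumed to already carry the columns named in extract_defaults):

import dalmatian as dm

# hypothetical source workspace; get_samples() pulls the Terra sample table
samples_df = dm.WorkspaceManager("broad-genomics-delivery/example-ws").get_samples().reset_index()
annotated = extractFromWorkspace(samples_df, stype="wgs", recomputehash=False)
if annotated is None:
    print("every sample was smaller than MINSIZES['wgs'] and got dropped")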
Example #2
def changeToBucket(samples,
                   gsfolderto,
                   name_col=None,
                   values=['bam', 'bai'],
                   filetypes=None,
                   catchdup=False,
                   dryrun=True):
    """
  Moves all bam/bai files in a sample list from Terra to another gs bucket and renames them in the sample list

  Prevents erasing a duplicate sample either by appending a random string to the name or by flagging duplicates and not copying them

  Args:
  ----
    samples: pandas.dataframe with the columns to move
    gsfolderto: the bucket path to move the data to
    name_col: column to use for the new file names ("index" to use the dataframe index, None to reuse the original file name)
    values: list of the cols in the dataframe containing the gs object paths to be moved
    filetypes: list[str] of the same length as values giving a suffix for each column (.txt, .bam, ...); if None the suffix is taken from the original file name
    catchdup: if False, appends a random string to the names before copying them; if True, flags duplicate names and skips copying them
    dryrun: only prints the gsutil commands but does not move the files

  Returns:
  --------
    the updated sample pandas.dataframe
  """
    # to do the download to the new dataspace
    for i, val in samples.iterrows():
        ran = h.randomString(6, 'underscore', withdigits=False)
        for j, ntype in enumerate(values):
            # TODO try:catch
            filetype = '.'.join(val[ntype].split('/')[-1].split('.')
                                [1:]) if filetypes is None else filetypes[j]
            if name_col is None:
                name = val[ntype].split('/')[-1].split('.')[0]
            elif name_col == "index":
                name = val.name
            else:
                name = val[name_col]
            name = name + '.' + filetype if catchdup else name + '_' + ran + '.' + filetype
            if not gcp.exists(gsfolderto + name) or not catchdup:
                cmd = 'gsutil cp ' + val[ntype] + ' ' + gsfolderto + name
                if dryrun:
                    print(cmd)
                else:
                    res = subprocess.run(cmd, shell=True, capture_output=True)
                    if res.returncode != 0:
                        raise ValueError(str(res.stderr))
                    samples.loc[i, ntype] = gsfolderto + name
            else:
                print(name + ' already exists in the folder: ' + gsfolderto)
                print(gcp.lsFiles([gsfolderto + name], '-la'))
    return samples
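
A hedged usage sketch; the target bucket and the samples_df dataframe are placeholders, and dryrun=True keeps gsutil from copying anything:

# dry run: only prints the gsutil cp commands that would be executed
updated = changeToBucket(samples_df, "gs://example-target-bucket/wgs/",
                         values=['bam', 'bai'], catchdup=True, dryrun=True)
# once the printed commands look right, rerun with dryrun=False to copy the
# files and rewrite the bam/bai columns to point at their new gs:// locations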
Example #3
async def indexBams(bucketpath, cores=4):
    """
    given a bucket path, will index all .bam files without an associated index and return a dict mapping each bam path to its new index path
    """
    files = gcp.lsFiles([bucketpath])
    bams = [val for val in files if val.endswith('.bam')]
    unindexed = [
        val for val in bams
        if val[:-4] + '.bai' not in files and val[:-4] + '.bam.bai' not in files
    ]
    print("found " + str(len(unindexed)) + " files to reindex")
    h.parrun([
        "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && samtools index "
        + val for val in unindexed
    ], cores)
    return {val: val[:-4] + ".bam.bai" for val in unindexed}
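
Since indexBams is declared as a coroutine, it has to be awaited or driven through asyncio; the bucket path below is a placeholder:

import asyncio

# maps each previously unindexed bam to the .bam.bai path samtools index writes next to it
new_indices = asyncio.run(indexBams("gs://example-bucket/bams/", cores=8))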
Example #4
def GetNewCellLinesFromWorkspaces(wmfroms,
                                  sources,
                                  stype,
                                  maxage,
                                  refurl="",
                                  addonly=[],
                                  match='ACH',
                                  extract={},
                                  extract_defaults=extract_defaults,
                                  wto=None,
                                  refsamples=None,
                                  participantslicepos=10,
                                  accept_unknowntypes=False,
                                  rename=dict(),
                                  recomputehash=False):
    """
  As GP almost always uploads their data to a data workspace, we have to merge it into our processing workspace

  Will merge samples from a set of data workspaces to a processing workspace on Terra. Will only
  keep a subset of the metadata and rename it.
  Will detect duplicates based on the file size.
  Can also upload the bam files to a google storage bucket

  Args:
  -----
    wto: str the workspace where you want to create the tsvs
    wmfroms: list[str] the workspaces where the samples to add are stored
    sources: list[str] the corresponding source names
    stype: str sequencing type
    maxage: str earliest date of the bam file upload to be considered new
    refurl: str(url) the reference url for the cell line tracker spreadsheet (only if no refsamples)
    match: list[str]|str the possible values that a sample id needs to contain to be considered valid
    refsamples: pd dataframe with columns matching the values in "extract" for the right keys (see "extract_defaults")
    participantslicepos: int the length of the sample id string
    accept_unknowntypes: bool whether or not the sample type column for that sample can be different from "Tumor"
    rename: dict(str:str) mapping a wrong arxspan_id to a good arxspan id for known cases of mislabelling
    recomputehash: bool whether or not to recompute the hash of the bam file when loading it
    addonly: list of sample ids; if non-empty, only these samples will be added
    extract: if you want to specify what values should refer to which column names
      dict{
      'name':
      'bai':
      'bam':
      'source':
      'from_arxspan_id':
      ...} (see extract_defaults)
    extract_defaults: the full default dict to specify what values should refer to which column names

  Returns:
  -------
    samples: a dataframe with the samples that were resolved by the tool (we still need to add some more annotations)
    pairs: the corresponding pair from matching known normals with known tumors
    wrongsamples: a dataframe containing samples that passed most QCs but couldn't be resolved

  Raises:
  -----
    Exception: when no new samples in this matrix
  """
    extract.update(extract_defaults)
    if type(match) is str and match:
        match = [match]
    if refurl:
        print('refsamples is overridden by a refurl')
        refsamples = sheets.get(refurl).sheets[0].to_frame(index_col=0)
    if refsamples is None:
        if wto is None:
            raise ValueError('missing refsamples or refworkspace (wto)')
        wto = dm.WorkspaceManager(wto)
        print(
            'we do not have refsamples data. Using the wto workspace sample data instead'
        )
        refsamples = wto.get_samples()
        # TODO: update directly the df if data is not already in here)
        refsamples[extract['ref_arxspan_id']] = [
            a.split('_')[0] if type(a) is str else a
            for a in refsamples[extract['ref_arxspan_id']]
        ]
        if extract['hash'] not in refsamples.columns:
            refsamples[extract['hash']] = [
                gcp.extractHash(val) for val in gcp.lsFiles([
                    i for i in refsamples[extract["ref_bams"]]
                    if type(i) is str and str(i) != 'NA'
                ], "-L", 200)
            ]
        if extract['size'] not in refsamples.columns:
            refsamples[extract['size']] = [
                gcp.extractSize(i)[1] for i in gcp.lsFiles(
                    refsamples[extract['ref_bams']].tolist(), '-al', 200)
            ]
        if extract['release_date'] not in refsamples.columns:
            refsamples[extract['release_date']] = seq.getBamDate(
                refsamples[extract["ref_bams"]])
    refsamples[extract['release_date']] = list(
        h.datetoint(refsamples[extract["release_date"]].values, '/'))
    if stype not in set(refsamples[extract['ref_type']]):
        h.ask("we have never seen this type: " + stype +
              ", in the reference, continue?")
    # do NOT make refids a set; we use the number of occurrences as a way to determine what number to add to the sample id
    # filter refids to only include those that include the strings in the 'match' argument
    refsamples = refsamples[refsamples.index.str.contains('|'.join(match))]
    for match_substring in match:
        refsamples.index = [
            match_substring +
            i.split(match_substring)[-1] if match_substring in i else i
            for i in refsamples.index
        ]
    refsamples.index = [i[:participantslicepos] for i in refsamples.index]
    print("Getting sample infos...")
    if type(sources) is str:
        sources = [sources]
    if type(wmfroms) is str:
        wmfroms = [wmfroms]
    sampless = pd.DataFrame()
    wrongsampless = pd.DataFrame()
    for source, wmfrom in zip(sources, wmfroms):
        broken_bams = []
        wmfrom = dm.WorkspaceManager(wmfrom)
        samples = wmfrom.get_samples().replace(np.nan, '',
                                               regex=True).reset_index()
        # keep samples that contain the match requirement (e.g. ACH for DepMap IDs)

        print("\nThe shape of the sample tsv from " + str(wmfrom) + ": " +
              str(samples.shape))

        # remove true duplicates from consideration
        print(
            "Identifying any true duplicates by checking file sizes (this runs for each data source)..."
        )
        print(
            "This step can take a while as we need to use gsutil to check the size of each potential duplicate..."
        )
        dups_to_remove = []
        # check for broken bam files; if broken, then remove from consideration
        # need to check for broken filepaths before checking if the sample is in Terra so that we don't
        # add a broken file path for a new participant
        foundfiles = gcp.lsFiles(samples[extract['bam']])
        broken_bams = set(samples[extract['bam']]) - set(foundfiles)
        print('These ' + str(len(broken_bams)) +
              ' bam file paths do not exist: ' + str(broken_bams))

        wrongsamples = samples[(~samples[extract['bam']].isin(broken_bams)) & (
            ~samples[extract['from_arxspan_id']].str.contains('|'.join(match))
        )]
        wrongsamples = extractFromWorkspace(wrongsamples, stype,
                                            recomputehash=recomputehash,
                                            extract=extract)
        if wrongsamples is not None:
            wrongsamples = mapSamples(wrongsamples, source, extract)
            wrongsampless = pd.concat([wrongsampless, wrongsamples],
                                      sort=False)
        samples = samples[(~samples[extract['bam']].isin(broken_bams)) & (
            samples[extract['from_arxspan_id']].str.contains('|'.join(match)))]
        # getting correct arxspan id
        if samples is None:
            continue
        samples = extractFromWorkspace(samples, stype,
                                       recomputehash=recomputehash,
                                       extract=extract)
        if samples is None:
            continue
        samples = mapSamples(samples, source, extract)
        samples = resolveFromWorkspace(
            samples, refsamples[refsamples[extract['ref_type']] == stype],
            match, participantslicepos, accept_unknowntypes, addonly, extract)
        if samples is None:
            continue
        sampless = pd.concat([sampless, samples], sort=False)

    if len(sampless) == 0:
        print("no new data available")
        return sampless, pd.DataFrame(), wrongsampless

    sampless = assessAllSamples(sampless, refsamples, stype, rename, extract)
    # creating pairs
    pairs = myterra.setupPairsFromSamples(
        sampless, refsamples[refsamples[extract['ref_type']] == stype],
        extract)
    # I am trying to remove duplicates from samples without arxspan ids to then look more into them
    # and see if I have to get data for them or if I should just throw them out
    toremov = set()
    for k, val in wrongsampless.iterrows():
        withsamesize = wrongsampless[wrongsampless[extract["legacy_size"]] ==
                                     val[extract["legacy_size"]]]
        if (val[extract["legacy_size"]]
                in sampless[extract["legacy_size"]].tolist()
            ) or (val[extract["legacy_size"]] in refsamples[extract["size"]]):
            toremov.add(k)
        if len(withsamesize) > 1:
            for l, _ in withsamesize.iloc[1:].iterrows():
                toremov.add(l)
        #elif len(refsamples[refsamples[extract['size']] == withsamesize[extract["size"]][0]]):
        #toremov.add(k)
    for i in toremov:
        wrongsampless = wrongsampless.drop(i)
    for i, v in wrongsampless.iterrows():
        if not gcp.exists(v[extract['ref_bam']]):
            print(v.ccle_name)
            wrongsampless = wrongsampless.drop(i)
    a = len(sampless)
    sampless = deleteClosest(sampless, refsamples, extract['legacy_size'],
                             extract['legacy_size'], extract['ref_arxspan_id'])
    print('removed: ' + str(a - len(sampless)) +
          " samples from size alone (too similar to a replicate)")
    wrongsampless = wrongsampless[~wrongsampless[extract['legacy_size']].isin(
        set(refsamples[extract['legacy_size']]))]
    wrongsampless = deleteClosest(wrongsampless, refsamples,
                                  extract['legacy_size'],
                                  extract['legacy_size'],
                                  extract['ref_arxspan_id'])
    #removing duplicate PDOs
    a = len(sampless)
    wrongsampless = wrongsampless[~wrongsampless[extract['PDO_id']].
                                  isin(set(refsamples[extract['PDO_id']]))]
    sampless = sampless[~sampless[extract['PDO_id']].
                        isin(set(refsamples[extract['PDO_id']]))]
    print('removed: ' + str(a - len(sampless)) +
          " samples with duplicate PDO ids")
    # removing anything too old
    a = len(sampless)
    wrongsampless = wrongsampless[
        wrongsampless[extract['update_time']] > maxage]
    sampless = sampless[sampless[extract['update_time']] > maxage]
    print('removed: ' + str(a - len(sampless)) +
          " samples that have not changed since last time (likely duplicates"
          " that have already been removed)")
    return sampless, pairs, wrongsampless
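
A sketch of a typical call; the workspace names, source label and maxage cutoff are placeholders (the exact maxage format has to match whatever gcp.extractTime stores in the update_time column):

new, pairs, unresolved = GetNewCellLinesFromWorkspaces(
    wmfroms=["broad-genomics-delivery/example-delivery-ws"],
    sources=["example_source"],
    stype="wgs",
    maxage="2021-01-01",
    wto="broad-firecloud-ccle/example-processing-ws",
    recomputehash=False)
print(str(len(new)) + " new samples, " + str(len(unresolved)) + " unresolved")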
Example #5
def resolveFromWorkspace(samples,
                         refsamples,
                         match,
                         participantslicepos=10,
                         accept_unknowntypes=True,
                         addonly=[],
                         extract={}):
    """
  Filters our list by trying to find duplicates in our dataset and removing any sample that isn't a tumor

  Args:
  -----
    samples: pd dataframe of samples with at least arxspan ids and sizes
    refsamples: pd dataframe representing the sample tracker
    match: list[str]|str the possible values that a sample id needs to contain to be considered valid
    participantslicepos: int the length of the sample id string
    accept_unknowntypes: bool whether or not the sample type column for that sample can be different from "Tumor"
    addonly: list of sample ids; if non-empty, only these samples will be kept
    extract: if you want to specify what values should refer to which column names
      dict{
      'name':
      'bai':
      'bam':
      'source':
      'from_arxspan_id':
      ...} (see extract_defaults)

  Returns:
  --------
    samples: pd dataframe the filtered sample list
  """
    extract.update(extract_defaults)
    prevlen = len(samples)
    for match_substring in match:
        samples[extract['ref_arxspan_id']] = [
            (match_substring +
             i.split(match_substring)[-1]) if match_substring in i else i
            for i in samples[extract['ref_arxspan_id']]
        ]
    samples[extract['ref_arxspan_id']] = [
        i[:participantslicepos] for i in samples[extract['ref_arxspan_id']]
    ]
    print('we found and removed ' + str(prevlen - len(samples)) +
          ' samples which did not match our id names: ' + str(match))

    tolookfor = [
        val[extract['ref_bam']] for _, val in samples.iterrows()
        if val[extract['ref_arxspan_id']] in set(refsamples[
            extract['ref_arxspan_id']])
    ]
    print("found " + str(len(tolookfor)) + ' likely replicate')
    sample_hash = {
        gcp.extractSize(val)[1]: gcp.extractSize(val)[0]
        for val in gcp.lsFiles(tolookfor, "-la")
    }
    dups_to_remove = [
        sample_hash[a] for a in set(sample_hash.keys())
        & set(refsamples[extract['legacy_size']])
    ]
    # remove the duplicates from consideration
    print("Len of samples before removal: " + str(len(samples)))
    print("Dups from this workspace has len " + str(len(dups_to_remove)) +
          ":\n " + str(dups_to_remove))
    # (samples with broken bam filepaths were already removed by the caller)
    samples = samples[~samples[extract['ref_bam']].isin(dups_to_remove)]

    print("Len of samples after removal: " + str(len(samples)))
    if len(samples) == 0:
        return None

    # if only add some samples
    if len(addonly) > 0:
        samples = samples[samples[extract['ref_arxspan_id']].isin(addonly)]

    # unknown types
    if 'sample_type' in samples.columns:
        if not accept_unknowntypes:
            samples = samples[samples['sample_type'].isin(['Tumor'])]
    return samples
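
A short sketch of calling the resolver directly, assuming samples_df and refsamples_df already use the column names from extract_defaults:

resolved = resolveFromWorkspace(samples_df, refsamples_df, match=['ACH'],
                                accept_unknowntypes=False)
if resolved is None:
    print("every candidate was a known replicate of an existing sample")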