def extractFromWorkspace(samples, stype, recomputeTime=True, recomputesize=True,
                         recomputedate=True, recomputehash=True, extract={}):
    """Extract more information from a list of samples found on GP workspaces

    Args:
    -----
      samples: pd dataframe of samples with at least arxspan ids and sizes
      stype: str sequencing type
      recomputeTime: bool whether to recompute the date of upload of the bam file
      recomputesize: bool whether to recompute the size of the bam file
      recomputedate: bool whether to recompute the release date of the bam file
      recomputehash: bool whether to recompute the hash of the bam file
      extract: dict to specify what values should refer to which column names
        dict{'name':, 'bai':, 'bam':, 'source':, 'from_arxspan_id':, ...}
        (see extract_defaults)

    Returns:
    --------
      samples: pd dataframe the filtered sample list
    """
    extract.update(extract_defaults)
    if extract['legacy_hash'] not in samples.columns or recomputehash:
        samples[extract['legacy_hash']] = [
            gcp.extractHash(val)
            for val in gcp.lsFiles(samples[extract["bam"]].tolist(), "-L", 200)
        ]
    lis = gcp.lsFiles(samples[extract['bam']].tolist(), '-al', 200)
    if extract['legacy_size'] not in samples.columns or recomputesize:
        samples[extract['legacy_size']] = [gcp.extractSize(i)[1] for i in lis]
    if extract['update_time'] not in samples.columns or recomputeTime:
        samples[extract['update_time']] = [gcp.extractTime(i) for i in lis]
    # drop any sample whose bam file is too small to be a real sequencing run
    todrop = []
    for k, val in samples.iterrows():
        if val[extract['legacy_size']] < MINSIZES[stype]:
            todrop.append(k)
            print("too small size, removing sample: " +
                  str(val[extract["from_arxspan_id"]]))
    samples = samples.drop(index=todrop)
    if len(samples) == 0:
        return None
    # getting the date released
    if extract['release_date'] not in samples.columns or recomputedate:
        samples[extract["release_date"]] = seq.getBamDate(
            samples[extract["bam"]])
    samples[extract['release_date']] = list(
        h.datetoint(samples[extract['release_date']].values))
    return samples
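# Usage sketch for extractFromWorkspace (a minimal example; the workspace
# name and the 'wgs' MINSIZES key are hypothetical, not taken from this
# codebase):
#
#   wm = dm.WorkspaceManager('broad-firecloud-ccle/some-data-workspace')
#   samples = wm.get_samples().reset_index()
#   samples = extractFromWorkspace(samples, 'wgs', recomputehash=True)
#   # returns None if every bam is below MINSIZES for that sequencing type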
def changeToBucket(samples, gsfolderto, name_col=None, values=['bam', 'bai'],
                   filetypes=None, catchdup=False, dryrun=True):
    """Moves all bam/bai files in a sample list from Terra to another gs
    bucket and renames them in the sample list.

    Will prevent erasing a duplicate sample either by adding a random string
    to the name, or by flagging duplicate names and not copying them.

    Args:
    ----
      samples: pandas.dataframe with columns to move
      gsfolderto: the bucket path to move the data to
      name_col: the column of the dataframe to use as the new file name
        ("index" to use the dataframe index; None to keep the original name)
      values: list of the cols in the dataframe containing the gs object
        paths to be moved
      filetypes: list[str] of size len(values); for each column, gives a
        suffix (.txt, .bam, ...)
      catchdup: if False, will add a random string to the names before moving
        them; else will flag duplicate names and not copy them
      dryrun: only shows the output but does not move the files

    Returns:
    --------
      the updated sample pandas.dataframe
    """
    # do the copy to the new dataspace
    for i, val in samples.iterrows():
        ran = h.randomString(6, 'underscore', withdigits=False)
        for j, ntype in enumerate(values):
            # TODO try:catch
            filetype = ('.'.join(val[ntype].split('/')[-1].split('.')[1:])
                        if filetypes is None else filetypes[j])
            if name_col is None:
                name = val[ntype].split('/')[-1].split('.')[0]
            elif name_col == "index":
                name = val.name
            else:
                name = val[name_col]
            name = (name + '.' + filetype if catchdup
                    else name + '_' + ran + '.' + filetype)
            if not gcp.exists(gsfolderto + name) or not catchdup:
                cmd = 'gsutil cp ' + val[ntype] + ' ' + gsfolderto + name
                if dryrun:
                    print(cmd)
                else:
                    res = subprocess.run(cmd, shell=True, capture_output=True)
                    if res.returncode != 0:
                        raise ValueError(str(res.stderr))
                    samples.loc[i, ntype] = gsfolderto + name
            else:
                print(name + ' already exists in the folder: ' + gsfolderto)
                print(gcp.lsFiles([gsfolderto + name], '-la'))
    return samples
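# Usage sketch for changeToBucket (the bucket path is hypothetical). With
# dryrun=True the gsutil commands are only printed, so this is safe to run
# before committing to a copy:
#
#   samples = changeToBucket(samples, 'gs://my-archive-bucket/wgs/',
#                            values=['bam', 'bai'], filetypes=['bam', 'bai'],
#                            catchdup=True, dryrun=True)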
async def indexBams(bucketpath, cores=4):
    """Given a bucket path, will index all .bam files without an associated
    index and return their paths."""
    files = gcp.lsFiles([bucketpath])
    bams = [val for val in files if val.endswith('.bam')]
    unindexed = [
        val for val in bams
        if val[:-4] + '.bai' not in files and val[:-4] + '.bam.bai' not in files
    ]
    print("found " + str(len(unindexed)) + " files to reindex")
    # samtools can read gs:// paths directly when GCS_OAUTH_TOKEN is set
    h.parrun([
        "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` "
        "&& samtools index " + val for val in unindexed
    ], cores)
    return {val: val[:-4] + ".bam.bai" for val in unindexed}
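# Usage sketch for indexBams (the bucket path is hypothetical). The function
# is a coroutine, so outside a notebook it has to be driven by an event loop:
#
#   import asyncio
#   created = asyncio.run(indexBams('gs://my-bucket/bams/**', cores=8))
#   # created maps each previously-unindexed bam to its new .bam.bai path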
def GetNewCellLinesFromWorkspaces(wmfroms, sources, stype, maxage, refurl="",
                                  addonly=[], match='ACH', extract={},
                                  extract_defaults=extract_defaults, wto=None,
                                  refsamples=None, participantslicepos=10,
                                  accept_unknowntypes=False, rename=dict(),
                                  recomputehash=False):
    """Merges samples from a set of GP data workspaces into a processing workspace on Terra.

    As GP almost always uploads their data to a data workspace, we have to
    merge it into our processing workspace. Will only get a subset of the
    metadata and rename it. Will find duplicates based on the file size.
    Can also upload the bam files to a google storage bucket.

    Args:
    -----
      wmfroms: list[str] the workspaces where the samples to add are stored
      sources: list[str] the corresponding source names
      stype: str sequencing type
      maxage: str earliest date of the bam file upload to be considered new
      refurl: str(url) the reference url for the cell line tracker spreadsheet
        (only if no refsamples)
      addonly: list of sample ids that you only want to add
      match: list[str]|str the possible values that a sample id needs to
        contain to be considered valid
      extract: dict to specify what values should refer to which column names
        dict{'name':, 'bai':, 'bam':, 'source':, 'from_arxspan_id':, ...}
        (see extract_defaults)
      extract_defaults: the full default dict to specify what values should
        refer to which column names
      wto: str the workspace where you want to create the tsvs
      refsamples: pd dataframe with columns matching values in "extract" for
        the right keys (see "extract_defaults")
      participantslicepos: int the length of the sample id string
      accept_unknowntypes: bool whether or not the sample type column for that
        sample can be different from "Tumor"
      rename: dict(str:str) mapping a wrong arxspan_id to a good arxspan id
        for known cases of mislabelling
      recomputehash: bool whether or not to recompute the hash of the bam file
        when loading it

    Returns:
    -------
      samples: a dataframe with the samples that were resolved by the tool
        (we still need to add some more annotations)
      pairs: the corresponding pairs from matching known normals with known
        tumors
      wrongsamples: a dataframe containing samples that passed most QCs but
        couldn't be resolved

    Raises:
    ------
      ValueError: when neither refsamples, refurl, nor wto is provided
    """
    extract.update(extract_defaults)
    if type(match) is str and match:
        match = [match]
    if refurl:
        print('refsamples is overridden by a refurl')
        refsamples = sheets.get(refurl).sheets[0].to_frame(index_col=0)
    if refsamples is None:
        if wto is None:
            raise ValueError('missing refsamples or refworkspace (wto)')
        wto = dm.WorkspaceManager(wto)
        print('we do not have refsamples data. '
              'Using the wto workspace sample data instead')
        refsamples = wto.get_samples()
        # TODO: update directly the df if data is not already in here
        refsamples[extract['ref_arxspan_id']] = [
            a.split('_')[0] if type(a) is str else a
            for a in refsamples[extract['ref_arxspan_id']]
        ]
        if extract['hash'] not in refsamples.columns:
            refsamples[extract['hash']] = [
                gcp.extractHash(val) for val in gcp.lsFiles([
                    i for i in refsamples[extract["ref_bams"]]
                    if type(i) is str and str(i) != 'NA'
                ], "-L", 200)
            ]
        if extract['size'] not in refsamples.columns:
            refsamples[extract['size']] = [
                gcp.extractSize(i)[1] for i in gcp.lsFiles(
                    refsamples[extract['ref_bams']].tolist(), '-al', 200)
            ]
        if extract['release_date'] not in refsamples.columns:
            refsamples[extract["release_date"]] = seq.getBamDate(
                refsamples[extract["ref_bams"]])
        refsamples[extract['release_date']] = list(
            h.datetoint(refsamples[extract["release_date"]].values, '/'))
    if stype not in set(refsamples[extract['ref_type']]):
        h.ask("we have never seen this type: " + stype +
              ", in the reference, continue?")
    # do NOT make refids a set; we use the number of occurrences as a way to
    # determine what number to add to the sample id
    # filter refids to only include those that include the strings in the
    # 'match' argument
    refsamples = refsamples[refsamples.index.str.contains('|'.join(match))]
    for match_substring in match:
        refsamples.index = [
            match_substring + i.split(match_substring)[-1]
            if match_substring in i else i for i in refsamples.index
        ]
    refsamples.index = [i[:participantslicepos] for i in refsamples.index]
    print("Getting sample infos...")
    if type(sources) is str:
        sources = [sources]
    if type(wmfroms) is str:
        wmfroms = [wmfroms]
    sampless = pd.DataFrame()
    wrongsampless = pd.DataFrame()
    for source, wmfrom in zip(sources, wmfroms):
        wmfrom = dm.WorkspaceManager(wmfrom)
        samples = wmfrom.get_samples().replace(np.nan, '',
                                               regex=True).reset_index()
        # keep samples that contain the match requirement (e.g. ACH for
        # DepMap IDs)
        print("\nThe shape of the sample tsv from " + str(wmfrom) + ": " +
              str(samples.shape))
        # remove true duplicates from consideration
        print("Identifying any true duplicates by checking file hashes "
              "(this runs for each data source)...")
        print("This step can take a while as we need to use gsutil to check "
              "the size of each potential duplicate...")
        # check for broken bam files; if broken, then remove from
        # consideration. we need to check for broken filepaths before checking
        # if the sample is in Terra, so that we don't add a broken file path
        # for a new participant
        foundfiles = gcp.lsFiles(samples[extract['bam']])
        broken_bams = set(samples[extract['bam']]) - set(foundfiles)
        print('These ' + str(len(broken_bams)) +
              ' bam file paths do not exist: ' + str(broken_bams))
        wrongsamples = samples[
            (~samples[extract['bam']].isin(broken_bams)) &
            (~samples[extract['from_arxspan_id']].str.contains(
                '|'.join(match)))]
        wrongsamples = extractFromWorkspace(wrongsamples, stype,
                                            recomputehash=recomputehash,
                                            extract=extract)
        if wrongsamples is not None:
            wrongsamples = mapSamples(wrongsamples, source, extract)
            wrongsampless = pd.concat([wrongsampless, wrongsamples],
                                      sort=False)
        samples = samples[
            (~samples[extract['bam']].isin(broken_bams)) &
            (samples[extract['from_arxspan_id']].str.contains(
                '|'.join(match)))]
        # getting correct arxspan id
        if samples is None:
            continue
        samples = extractFromWorkspace(samples, stype,
                                       recomputehash=recomputehash,
                                       extract=extract)
        if samples is None:
            continue
        samples = mapSamples(samples, source, extract)
        samples = resolveFromWorkspace(
            samples, refsamples[refsamples[extract['ref_type']] == stype],
            match, participantslicepos, accept_unknowntypes, addonly, extract)
        if samples is None:
            continue
        sampless = pd.concat([sampless, samples], sort=False)
    if len(sampless) == 0:
        print("no new data available")
        # return empty pairs so callers can always unpack three values
        return sampless, pd.DataFrame(), wrongsampless
    sampless = assessAllSamples(sampless, refsamples, stype, rename, extract)
    # creating pairs
    pairs = myterra.setupPairsFromSamples(
        sampless, refsamples[refsamples[extract['ref_type']] == stype],
        extract)
    # trying to remove duplicates from samples without arxspan ids, to then
    # look more into them and see if I have to get data for them or if I
    # should just throw them out
    toremov = set()
    for k, val in wrongsampless.iterrows():
        withsamesize = wrongsampless[wrongsampless[extract["legacy_size"]] ==
                                     val[extract["legacy_size"]]]
        if (val[extract["legacy_size"]]
                in sampless[extract["legacy_size"]].tolist()) or (
                    val[extract["legacy_size"]]
                    in refsamples[extract["size"]].tolist()):
            toremov.add(k)
        if len(withsamesize) > 1:
            for l, _ in withsamesize.iloc[1:].iterrows():
                toremov.add(l)
        #elif len(refsamples[refsamples[extract['size']] == withsamesize[extract["size"]][0]]):
        #  toremov.add(k)
    for i in toremov:
        wrongsampless = wrongsampless.drop(i)
    for i, v in wrongsampless.iterrows():
        if not gcp.exists(v[extract['ref_bam']]):
            print(v.ccle_name)
            wrongsampless = wrongsampless.drop(i)
    a = len(sampless)
    # drop anything whose bam size is too close to a known replicate's,
    # checking both the reference's legacy size and current size columns
    sampless = deleteClosest(sampless, refsamples, extract['legacy_size'],
                             extract['legacy_size'],
                             extract['ref_arxspan_id'])
    sampless = deleteClosest(sampless, refsamples, extract['legacy_size'],
                             extract['size'], extract['ref_arxspan_id'])
    print('removed: ' + str(a - len(sampless)) +
          " samples from size alone (too similar to a replicate)")
    wrongsampless = wrongsampless[~wrongsampless[extract['legacy_size']].isin(
        set(refsamples[extract['legacy_size']]))]
    wrongsampless = wrongsampless[~wrongsampless[extract['legacy_size']].isin(
        set(refsamples[extract['size']]))]
    wrongsampless = deleteClosest(wrongsampless, refsamples,
                                  extract['legacy_size'],
                                  extract['legacy_size'],
                                  extract['ref_arxspan_id'])
    wrongsampless = deleteClosest(wrongsampless, refsamples,
                                  extract['legacy_size'], extract['size'],
                                  extract['ref_arxspan_id'])
    # removing duplicate PDOs
    a = len(sampless)
    wrongsampless = wrongsampless[~wrongsampless[extract['PDO_id']].isin(
        set(refsamples[extract['PDO_id']]))]
    sampless = sampless[~sampless[extract['PDO_id']].isin(
        set(refsamples[extract['PDO_id']]))]
    print('removed: ' + str(a - len(sampless)) +
          " samples with duplicate PDO ids")
    # removing anything too old
    a = len(sampless)
    wrongsampless = wrongsampless[
        wrongsampless[extract['update_time']] > maxage]
    sampless = sampless[sampless[extract['update_time']] > maxage]
    print('removed: ' + str(a - len(sampless)) +
          " samples that have not changed since last time (likely a "
          "duplicate having been removed)")
    return sampless, pairs, wrongsampless
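# Usage sketch for GetNewCellLinesFromWorkspaces (the workspace names,
# tracker url, and maxage value below are all hypothetical):
#
#   sampless, pairs, wrongsampless = GetNewCellLinesFromWorkspaces(
#       wmfroms=['broad-genomics-delivery/ws1', 'broad-genomics-delivery/ws2'],
#       sources=['ibm', 'ccle'],
#       stype='wgs',
#       maxage='2020-01-01',
#       refurl='https://docs.google.com/spreadsheets/d/...',
#       recomputehash=False)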
def resolveFromWorkspace(samples, refsamples, match, participantslicepos=10,
                         accept_unknowntypes=True, addonly=[], extract={}):
    """Filters the sample list by finding duplicates in our dataset and
    removing any sample that isn't a tumor.

    Args:
    -----
      samples: pd dataframe of samples with at least arxspan ids and sizes
      refsamples: pd dataframe representing a sample tracker
      match: list[str]|str the possible values that a sample id needs to
        contain to be considered valid
      participantslicepos: int the length of the sample id string
      accept_unknowntypes: bool whether or not the sample type column for that
        sample can be different from "Tumor"
      addonly: list of sample ids that you only want to add
      extract: dict to specify what values should refer to which column names
        dict{'name':, 'bai':, 'bam':, 'source':, 'from_arxspan_id':, ...}
        (see extract_defaults)

    Returns:
    --------
      samples: pd dataframe the filtered sample list
    """
    extract.update(extract_defaults)
    prevlen = len(samples)
    for match_substring in match:
        samples[extract['ref_arxspan_id']] = [
            (match_substring + i.split(match_substring)[-1])
            if match_substring in i else i
            for i in samples[extract['ref_arxspan_id']]
        ]
    samples[extract['ref_arxspan_id']] = [
        i[:participantslicepos] for i in samples[extract['ref_arxspan_id']]
    ]
    print('we found and removed ' + str(prevlen - len(samples)) +
          ' samples which did not match our id names: ' + str(match))
    tolookfor = [
        val[extract['ref_bam']] for _, val in samples.iterrows()
        if val[extract['ref_arxspan_id']] in set(refsamples[
            extract['ref_arxspan_id']])
    ]
    print("found " + str(len(tolookfor)) + ' likely replicates')
    # map each file's size to its gs path so we can match on size
    sample_hash = {
        gcp.extractSize(val)[1]: gcp.extractSize(val)[0]
        for val in gcp.lsFiles(tolookfor, "-la")
    }
    # flag as duplicates the files whose size matches a reference sample's
    # legacy size or current size
    dups_to_remove = [
        sample_hash[a] for a in
        set(sample_hash.keys()) & set(refsamples[extract['legacy_size']])
    ]
    dups_to_remove.extend([
        sample_hash[a] for a in
        set(sample_hash.keys()) & set(refsamples[extract['size']])
    ])
    # remove the duplicates from consideration
    print("Len of samples before removal: " + str(len(samples)))
    print("Dups from this workspace have len " + str(len(dups_to_remove)) +
          ":\n " + str(dups_to_remove))
    samples = samples[~samples[extract['ref_bam']].isin(dups_to_remove)]
    print("Len of samples after removal: " + str(len(samples)))
    if len(samples) == 0:
        return None
    # if we should only add some samples
    if len(addonly) > 0:
        samples = samples[samples[extract['ref_arxspan_id']].isin(addonly)]
    # unknown types
    if 'sample_type' in samples.columns:
        if not accept_unknowntypes:
            samples = samples[samples['sample_type'].isin(['Tumor'])]
    return samples
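# Usage sketch for resolveFromWorkspace (the inputs mirror how
# GetNewCellLinesFromWorkspaces calls it; the 'wgs' value and the reference
# type column lookup are illustrative):
#
#   kept = resolveFromWorkspace(
#       samples, refsamples[refsamples[extract['ref_type']] == 'wgs'],
#       match=['ACH'], participantslicepos=10, accept_unknowntypes=False,
#       addonly=[], extract={})
#   # returns None when every candidate is a known size-duplicate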