def test_flat(self):
    """Exercise common.flat with non-iterable, empty, string, flat and nested inputs."""
    # if input is not iterable, raise TypeError
    self.assertRaises(TypeError, common.flat, 1)
    # empties stay empty
    self.assertEqual(common.flat(''), [])
    self.assertEqual(common.flat([]), [])
    # strings get split
    self.assertEqual(common.flat('test'), ['t', 'e', 's', 't'])
    # flat lists stay flat
    self.assertEqual(common.flat(['a', 'ab']), ['a', 'ab'])
    # nested iterables become flat lists
    self.assertEqual(common.flat(['a', ('b', 'c')]), ['a', 'b', 'c'])
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project,
                        ta_folders, used_control_ids):
    """Choose a control TA for one replicate from the experiment's possible_controls.

    Every URI listed under ``experiment['possible_controls']`` is fetched with
    ``common.encoded_get(server + uri, keypair)``.  A fetched control is kept
    only when it has no ``target`` or when the third '/'-separated field of its
    target URI starts with "Control".

    Selection order among the kept controls:
      1. the first truthy result of ``get_rep_ta`` (called with ``repn``) whose
         ``'id'`` is not in ``used_control_ids``;
      2. otherwise the first truthy entry of the flattened ``get_all_tas``
         results whose ``'id'`` is not in ``used_control_ids``.

    Args:
        experiment: mapping describing the experiment; ``possible_controls``
            is read from it.
        repn: replicate number forwarded to ``get_rep_ta``.
        server: URL prefix prepended to each possible-control URI.
        keypair: credentials forwarded to ``common.encoded_get``.
        default_project: forwarded to ``get_rep_ta`` / ``get_all_tas``.
        ta_folders: forwarded to ``get_rep_ta`` / ``get_all_tas``.
        used_control_ids: container of TA ids that must not be selected.

    Returns:
        The selected TA object as produced by ``get_rep_ta`` / ``get_all_tas``
        (it must support ``ta['id']``), or None when nothing suitable exists.
    """
    # A missing or null possible_controls means "no candidates"; iterating
    # the raw .get() result would raise TypeError in that case.
    control_uris = experiment.get('possible_controls') or []

    # Build a list of the possible_control experiments
    possible_control_experiments = []
    for uri in control_uris:
        possible_control_experiment = common.encoded_get(server + uri, keypair)
        target_uri = possible_control_experiment.get('target')
        # For now only use controls with no target or target "Control" (i.e. not IgG)
        if not target_uri or target_uri.split('/')[2].startswith('Control'):
            possible_control_experiments.append(possible_control_experiment)
    logging.debug(pprint.pformat(possible_control_experiments))

    if not possible_control_experiments:
        # Nothing survived the filter, so neither lookup below can succeed.
        logging.error('Failed to find any possible control')
        return None

    # First preference: a control TA from the same replicate number.
    rep_tas = [
        get_rep_ta(e, repn, default_project, ta_folders)
        for e in possible_control_experiments
    ]
    # None is a safe sentinel because the filter only yields truthy TAs.
    matching_ta = next(
        (ta for ta in rep_tas if ta and ta['id'] not in used_control_ids),
        None)
    if matching_ta is not None:
        return matching_ta
    logging.warning('Failed to find control rep with matching repn')

    # Fallback: any unused TA from any of the candidate controls.
    all_tas = common.flat([
        get_all_tas(e, default_project, ta_folders)
        for e in possible_control_experiments
    ])
    any_ta = next(
        (ta for ta in all_tas if ta and ta['id'] not in used_control_ids),
        None)
    if any_ta is None:
        logging.error('Failed to find any possible control')
    return any_ta
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project,
                        ta_folders, used_control_ids):
    """Choose a control TA for one replicate from the experiment's possible_controls.

    Every URI listed under ``experiment['possible_controls']`` is fetched with
    ``common.encoded_get(server + uri, keypair)``.  A fetched control is kept
    only when it has no ``target`` or when the third '/'-separated field of its
    target URI starts with "Control".

    Selection order among the kept controls:
      1. the first truthy result of ``get_rep_ta`` (called with ``repn``) whose
         ``'id'`` is not in ``used_control_ids``;
      2. otherwise the first truthy entry of the flattened ``get_all_tas``
         results whose ``'id'`` is not in ``used_control_ids``.

    Args:
        experiment: mapping describing the experiment; ``possible_controls``
            is read from it.
        repn: replicate number forwarded to ``get_rep_ta``.
        server: URL prefix prepended to each possible-control URI.
        keypair: credentials forwarded to ``common.encoded_get``.
        default_project: forwarded to ``get_rep_ta`` / ``get_all_tas``.
        ta_folders: forwarded to ``get_rep_ta`` / ``get_all_tas``.
        used_control_ids: container of TA ids that must not be selected.

    Returns:
        The selected TA object as produced by ``get_rep_ta`` / ``get_all_tas``
        (it must support ``ta['id']``), or None when nothing suitable exists.
    """
    # A missing or null possible_controls means "no candidates"; iterating
    # the raw .get() result would raise TypeError in that case.
    control_uris = experiment.get('possible_controls') or []

    # Build a list of the possible_control experiments
    possible_control_experiments = []
    for uri in control_uris:
        possible_control_experiment = common.encoded_get(server + uri, keypair)
        target_uri = possible_control_experiment.get('target')
        # For now only use controls with no target or target "Control" (i.e. not IgG)
        if not target_uri or target_uri.split('/')[2].startswith('Control'):
            possible_control_experiments.append(possible_control_experiment)
    logging.debug(pprint.pformat(possible_control_experiments))

    if not possible_control_experiments:
        # Nothing survived the filter, so neither lookup below can succeed.
        logging.error('Failed to find any possible control')
        return None

    # First preference: a control TA from the same replicate number.
    rep_tas = [
        get_rep_ta(e, repn, default_project, ta_folders)
        for e in possible_control_experiments
    ]
    # None is a safe sentinel because the filter only yields truthy TAs.
    matching_ta = next(
        (ta for ta in rep_tas if ta and ta['id'] not in used_control_ids),
        None)
    if matching_ta is not None:
        return matching_ta
    logging.warning('Failed to find control rep with matching repn')

    # Fallback: any unused TA from any of the candidate controls.
    all_tas = common.flat([
        get_all_tas(e, default_project, ta_folders)
        for e in possible_control_experiments
    ])
    any_ta = next(
        (ta for ta in all_tas if ta and ta['id'] not in used_control_ids),
        None)
    if any_ta is None:
        logging.error('Failed to find any possible control')
    return any_ta
def get_tas(experiment, server, keypair, default_project, ta_folders):
    """Find the two replicate tagAlign (TA) files of an experiment and their controls.

    Steps performed:
      * For each entry of ``ta_folders`` (either ``"path"`` or
        ``"project:path"``), list closed files under
        ``<path>/bams/<accession>/`` with ``dxpy.find_data_objects`` and keep
        those whose name ends in ``tagAlign`` or ``tagAlign.gz``.
      * Parse the replicate number from a trailing ``/rep<N>`` in each file's
        folder; only the first file found for a replicate number is used.
      * Require exactly two distinct replicate numbers.
      * For each replicate, look up the files named by ``get_encffs`` on the
        TA file name and collect their ``controlled_by`` values.  If any are
        present the control TA comes from ``get_ta_from_accessions``;
        otherwise it is inferred via ``get_possible_ctl_ta``.

    Args:
        experiment: mapping with at least ``'accession'``; ``possible_controls``
            is consulted when no ``controlled_by`` is available.
        server: URL prefix used for ``/files/<id>/`` lookups.
        keypair: credentials forwarded to ``common.encoded_get``.
        default_project: project id used for folders given without a
            ``project:`` prefix.
        ta_folders: iterable of folder specifications to search.

    Returns:
        A dict with keys ``'rep1_ta'`` and ``'rep2_ta'`` (the smaller replicate
        number maps to ``rep1_ta``), each a dict with the keys ``file_id``,
        ``project_id``, ``folder``, ``file_name``, ``enc_fqs``,
        ``controlled_by``, ``controlled_by_name``, ``control_id``,
        ``enc_repn`` and ``paired_end``.  Returns None when a folder has no
        parsable replicate number or when the number of distinct replicates
        is not exactly two.

    Raises:
        KeyError: if ``experiment`` has no ``'accession'``.
    """
    exp_id = experiment['accession']

    # Gather every candidate tagAlign file description across all folders.
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            # Split on the first colon only so a ':' inside the path does not
            # break the two-way unpacking.
            project_name, path = base_folder.split(':', 1)
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug("Looking for TA's in %s %s %s" %
                      (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
                classname='file',
                state='closed',
                folder=path + 'bams/%s/' % (exp_id),
                project=project_id,
                describe=True,
                recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files' % (len(possible_files)))
    logging.debug('%s' % ([(f.get('folder'), f.get('name'))
                           for f in possible_files]))

    # Keep the first file seen for each replicate number; repns[i] and
    # rep_files[i] always describe the same replicate.
    repns = []
    rep_files = []
    for f in possible_files:
        m = re.search(r'/rep(\d+)$', f['folder'])
        if not m:
            logging.error("Cannot parse rep number from %s" % (f['folder']))
            return None
        repn = int(m.group(1))
        logging.debug("Matched rep%d" % (repn))
        if repn in repns:
            logging.warning(
                "Ignoring additional rep%d bam, using first found" % (repn))
        else:
            logging.debug("First time finding rep%d" % (repn))
            repns.append(repn)
            rep_files.append(f)
    logging.debug('Discovered repns %s' % (repns))
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s" %
                      (exp_id, len(repns), repns))
        return None

    tas = {}
    used_controls = []
    smallest_repn = min(repns)
    for repn, ta_file in zip(repns, rep_files):
        encffs = get_encffs(ta_file.get('name'))
        encode_files = [
            common.encoded_get(server + '/files/%s/' % (f), keypair)
            for f in encffs
        ]
        controlled_by = common.flat(
            [f.get('controlled_by') for f in encode_files])
        controlled_by_ta_name = None
        controlled_by_ta_id = None
        if any(controlled_by):
            controlled_by_accessions = list(
                set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(
                controlled_by_accessions, default_project, ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error(
                    "%s: Could not find controlled_by_ta for accessions %s" %
                    (exp_id, controlled_by_accessions))
        else:
            # evaluate possible controls
            controlled_by_accessions = None
            possible_controls = experiment.get('possible_controls')
            logging.warning(
                '%s: No controlled_by for rep%d, attempting to infer from possible_controls %s'
                % (exp_id, repn, possible_controls))
            control_ta = None
            if possible_controls and any(possible_controls):
                control_ta = get_possible_ctl_ta(
                    experiment, repn, server, keypair, default_project,
                    ta_folders, used_controls)
            if control_ta:
                controlled_by_ta_name = control_ta.get('name')
                controlled_by_ta_id = control_ta.get('id')
            else:
                # Covers both "no possible_controls listed" and
                # get_possible_ctl_ta returning None, which would otherwise
                # fail on control_ta.get(...).
                logging.error(
                    '%s: Could not find controlled_by or resolve possible_controls for rep%d'
                    % (exp_id, repn))
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps' %
                            (controlled_by_ta_id, controlled_by_ta_name))
        used_controls.append(controlled_by_ta_id)
        # if encode repns are 1,2 then let the pipeline input rep numbers (1 or 2) be the same.
        # Otherwise the mapping is arbitrary, but at least do it with smaller rep number first.
        ta_index = 1 if repn == smallest_repn else 2
        tas.update({
            'rep%d_ta' % (ta_index): {
                'file_id': ta_file.get('id'),
                'project_id': ta_file.get('project'),
                'folder': ta_file.get('folder'),
                'file_name': ta_file.get('name'),
                'enc_fqs': encffs,
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(ta_file)
            }
        })
    return tas
def get_tas(experiment, server, keypair, default_project, ta_folders):
    """Find the two replicate tagAlign (TA) files of an experiment and their controls.

    Steps performed:
      * For each entry of ``ta_folders`` (either ``"path"`` or
        ``"project:path"``), list closed files under
        ``<path>/bams/<accession>/`` with ``dxpy.find_data_objects`` and keep
        those whose name ends in ``tagAlign`` or ``tagAlign.gz``.
      * Parse the replicate number from a trailing ``/rep<N>`` in each file's
        folder; only the first file found for a replicate number is used.
      * Require exactly two distinct replicate numbers.
      * For each replicate, look up the files named by ``get_encffs`` on the
        TA file name and collect their ``controlled_by`` values.  If any are
        present the control TA comes from ``get_ta_from_accessions``;
        otherwise it is inferred via ``get_possible_ctl_ta``.

    Args:
        experiment: mapping with at least ``'accession'``; ``possible_controls``
            is consulted when no ``controlled_by`` is available.
        server: URL prefix used for ``/files/<id>/`` lookups.
        keypair: credentials forwarded to ``common.encoded_get``.
        default_project: project id used for folders given without a
            ``project:`` prefix.
        ta_folders: iterable of folder specifications to search.

    Returns:
        A dict with keys ``'rep1_ta'`` and ``'rep2_ta'`` (the smaller replicate
        number maps to ``rep1_ta``), each a dict with the keys ``file_id``,
        ``project_id``, ``folder``, ``file_name``, ``enc_fqs``,
        ``controlled_by``, ``controlled_by_name``, ``control_id``,
        ``enc_repn`` and ``paired_end``.  Returns None when a folder has no
        parsable replicate number or when the number of distinct replicates
        is not exactly two.

    Raises:
        KeyError: if ``experiment`` has no ``'accession'``.
    """
    exp_id = experiment['accession']

    # Gather every candidate tagAlign file description across all folders.
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            # Split on the first colon only so a ':' inside the path does not
            # break the two-way unpacking.
            project_name, path = base_folder.split(':', 1)
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug("Looking for TA's in %s %s %s" %
                      (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
                classname='file',
                state='closed',
                folder=path + 'bams/%s/' % (exp_id),
                project=project_id,
                describe=True,
                recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files' % (len(possible_files)))
    logging.debug('%s' % ([(f.get('folder'), f.get('name'))
                           for f in possible_files]))

    # Keep the first file seen for each replicate number; repns[i] and
    # rep_files[i] always describe the same replicate.
    repns = []
    rep_files = []
    for f in possible_files:
        m = re.search(r'/rep(\d+)$', f['folder'])
        if not m:
            logging.error("Cannot parse rep number from %s" % (f['folder']))
            return None
        repn = int(m.group(1))
        logging.debug("Matched rep%d" % (repn))
        if repn in repns:
            logging.warning(
                "Ignoring additional rep%d bam, using first found" % (repn))
        else:
            logging.debug("First time finding rep%d" % (repn))
            repns.append(repn)
            rep_files.append(f)
    logging.debug('Discovered repns %s' % (repns))
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s" %
                      (exp_id, len(repns), repns))
        return None

    tas = {}
    used_controls = []
    smallest_repn = min(repns)
    for repn, ta_file in zip(repns, rep_files):
        encffs = get_encffs(ta_file.get('name'))
        encode_files = [
            common.encoded_get(server + '/files/%s/' % (f), keypair)
            for f in encffs
        ]
        controlled_by = common.flat(
            [f.get('controlled_by') for f in encode_files])
        controlled_by_ta_name = None
        controlled_by_ta_id = None
        if any(controlled_by):
            controlled_by_accessions = list(
                set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(
                controlled_by_accessions, default_project, ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error(
                    "%s: Could not find controlled_by_ta for accessions %s" %
                    (exp_id, controlled_by_accessions))
        else:
            # evaluate possible controls
            controlled_by_accessions = None
            possible_controls = experiment.get('possible_controls')
            logging.warning(
                '%s: No controlled_by for rep%d, attempting to infer from possible_controls %s'
                % (exp_id, repn, possible_controls))
            control_ta = None
            if possible_controls and any(possible_controls):
                control_ta = get_possible_ctl_ta(
                    experiment, repn, server, keypair, default_project,
                    ta_folders, used_controls)
            if control_ta:
                controlled_by_ta_name = control_ta.get('name')
                controlled_by_ta_id = control_ta.get('id')
            else:
                # Covers both "no possible_controls listed" and
                # get_possible_ctl_ta returning None, which would otherwise
                # fail on control_ta.get(...).
                logging.error(
                    '%s: Could not find controlled_by or resolve possible_controls for rep%d'
                    % (exp_id, repn))
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps' %
                            (controlled_by_ta_id, controlled_by_ta_name))
        used_controls.append(controlled_by_ta_id)
        # if encode repns are 1,2 then let the pipeline input rep numbers (1 or 2) be the same.
        # Otherwise the mapping is arbitrary, but at least do it with smaller rep number first.
        ta_index = 1 if repn == smallest_repn else 2
        tas.update({
            'rep%d_ta' % (ta_index): {
                'file_id': ta_file.get('id'),
                'project_id': ta_file.get('project'),
                'folder': ta_file.get('folder'),
                'file_name': ta_file.get('name'),
                'enc_fqs': encffs,
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(ta_file)
            }
        })
    return tas