Ejemplo n.º 1
0
 def test_flat(self):
     """Exercise common.flat on non-iterable, empty, string, flat and nested input."""
     # if input is not iterable, raise TypeError
     self.assertRaises(TypeError, common.flat, 1)
     # empties stay empty
     # (assertEqual, not the deprecated assertEquals alias removed in 3.12)
     self.assertEqual(common.flat(''), [])
     self.assertEqual(common.flat([]), [])
     # strings get split
     self.assertEqual(common.flat('test'), ['t', 'e', 's', 't'])
     # flat lists stay flat
     self.assertEqual(common.flat(['a', 'ab']), ['a', 'ab'])
     # nested iterables become flat lists
     self.assertEqual(common.flat(['a', ('b', 'c')]), ['a', 'b', 'c'])
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project,
                        ta_folders, used_control_ids):
    """Choose a control tagAlign (TA) from the experiment's possible_controls.

    Every URI in ``experiment['possible_controls']`` is fetched with
    ``common.encoded_get(server + uri, keypair)``.  Only controls that have
    no target, or whose target name starts with "Control", are considered.

    A TA for the same replicate number (``get_rep_ta``) is preferred.  If
    none is found, the first TA from ``get_all_tas`` over all candidate
    controls is used instead.  TAs whose ``'id'`` is already in
    ``used_control_ids`` are skipped in both passes.

    Returns:
        The selected TA, or None when the experiment lists no possible
        controls or no unused TA could be found.
    """
    control_uris = experiment.get('possible_controls')
    if not control_uris:
        # Key missing, None, or empty: nothing to search (and nothing to
        # iterate over), so report it instead of raising TypeError.
        logging.error('Failed to find any possible control')
        return None

    # Build a list of the possible_control experiments
    candidates = []
    for uri in control_uris:
        control = common.encoded_get(server + uri, keypair)
        target_uri = control.get('target')
        # For now only use controls with no target or target "Control" (i.e. not IgG)
        if not target_uri or target_uri.split('/')[2].startswith('Control'):
            candidates.append(control)
    logging.debug(pprint.pformat(candidates))

    def is_usable(ta):
        return ta and ta['id'] not in used_control_ids

    # First choice: a control TA from the same replicate number.  The inner
    # generator is lazy, so lookups stop at the first usable TA.
    rep_tas = (get_rep_ta(e, repn, default_project, ta_folders)
               for e in candidates)
    matching_ta = next((ta for ta in rep_tas if is_usable(ta)), None)
    if matching_ta is not None:
        return matching_ta
    logging.warning('Failed to find control rep with matching repn')

    # Fallback: any unused TA from any candidate control.
    all_tas = common.flat(
        [get_all_tas(e, default_project, ta_folders) for e in candidates])
    any_ta = next((ta for ta in all_tas if is_usable(ta)), None)
    if any_ta is None:
        logging.error('Failed to find any possible control')
    return any_ta
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project, ta_folders, used_control_ids):
    """Select a control tagAlign (TA) for one replicate from possible_controls.

    Each possible-control URI of ``experiment`` is fetched through
    ``common.encoded_get``.  Controls that have a target whose name does not
    start with "Control" are discarded.  The remaining controls are searched
    first for a TA of replicate ``repn`` (``get_rep_ta``) and then for any TA
    at all (``get_all_tas``).  A TA is only accepted if its ``'id'`` is not
    in ``used_control_ids``.

    Returns:
        The accepted TA, or None if the experiment has no possible controls
        or none of them yields an unused TA.
    """
    uris = experiment.get('possible_controls')
    if not uris:
        # A missing or empty possible_controls used to raise TypeError when
        # iterated; treat it as "no control available" instead.
        logging.error('Failed to find any possible control')
        return None

    # Build a list of the possible_control experiments
    possible_control_experiments = []
    for uri in uris:
        possible_control_experiment = common.encoded_get(server + uri, keypair)
        target_uri = possible_control_experiment.get('target')
        # For now only use controls with no target or target "Control" (i.e. not IgG)
        if not target_uri or target_uri.split('/')[2].startswith('Control'):
            possible_control_experiments.append(possible_control_experiment)
    logging.debug(pprint.pformat(possible_control_experiments))

    # Preferred: same replicate number.  Return on the first hit so that no
    # further lookups are made once a usable TA has been found.
    for e in possible_control_experiments:
        ta = get_rep_ta(e, repn, default_project, ta_folders)
        if ta and ta['id'] not in used_control_ids:
            return ta
    logging.warning('Failed to find control rep with matching repn')

    # Fallback: any TA from any of the candidate controls.
    candidate_tas = common.flat(
        [get_all_tas(e, default_project, ta_folders)
         for e in possible_control_experiments])
    for ta in candidate_tas:
        if ta and ta['id'] not in used_control_ids:
            return ta
    logging.error('Failed to find any possible control')
    return None
def get_tas(experiment, server, keypair, default_project, ta_folders):
    """Find the tagAlign (TA) file and its control TA for both replicates.

    For every entry of ``ta_folders`` the folder ``<path>/bams/<accession>/``
    is searched recursively for closed files whose name ends in ``tagAlign``
    or ``tagAlign.gz``.  An entry written as ``project:path`` is resolved
    with ``resolve_project``; otherwise ``default_project`` is searched.

    The replicate number is parsed from a file folder ending in ``/rep<N>``;
    when several files map to the same replicate only the first one found is
    kept.  Exactly two replicates are required.

    The control for each replicate is resolved as follows:

    * the ENCODE files named in the TA filename (``get_encffs``) are fetched
      and their ``controlled_by`` collected; if any is set, a TA built from
      those accessions is looked up with ``get_ta_from_accessions``;
    * otherwise a control is inferred from the experiment's
      ``possible_controls`` with ``get_possible_ctl_ta``, which tries to
      match the replicate number and avoids controls already used.

    Failure to resolve a control is logged and recorded as ``None``; it does
    not abort the run.

    Returns:
        A dict with keys ``'rep1_ta'`` and ``'rep2_ta'`` (the smaller
        replicate number becomes rep1).  Each value is a dict with keys
        ``file_id``, ``project_id``, ``folder``, ``file_name``, ``enc_fqs``,
        ``controlled_by``, ``controlled_by_name``, ``control_id``,
        ``enc_repn`` and ``paired_end``.  Returns None if a replicate number
        cannot be parsed from a folder or the replicate count is not two.
    """
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        # Normalise to an absolute folder path with a trailing slash.
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug("Looking for TA's in %s %s %s",
                      project_id, project_name, path)
        for dxfile in dxpy.find_data_objects(
                classname='file',
                state='closed',
                folder=path + 'bams/%s/' % (exp_id),
                project=project_id,
                describe=True,
                recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files', len(possible_files))
    logging.debug('%s', [(f.get('folder'), f.get('name'))
                         for f in possible_files])

    repns = []
    files_to_ignore = []
    for f in possible_files:
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        m = re.search(r'/rep(\d+)$', f['folder'])
        if m:
            repn = int(m.group(1))
            logging.debug("Matched rep%d", repn)
            if repn in repns:
                logging.warning(
                    "Ignoring additional rep%d bam, using first found", repn)
                files_to_ignore.append(f)
            else:
                logging.debug("First time finding rep%d", repn)
                repns.append(repn)
        else:
            logging.error("Cannot parse rep number from %s", f['folder'])
            return None
    # After this, possible_files[i] is the file kept for repns[i].
    for f in files_to_ignore:
        possible_files.remove(f)
    logging.debug('Discovered repns %s', repns)
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s",
                      exp_id, len(repns), repns)
        return None

    tas = {}
    used_controls = []
    for ta_file, repn in zip(possible_files, repns):
        encode_files = [
            common.encoded_get(server + '/files/%s/' % (f), keypair)
            for f in get_encffs(ta_file.get('name'))
        ]
        controlled_by = common.flat(
            [f.get('controlled_by') for f in encode_files])
        controlled_by_accessions = None
        controlled_by_ta_name = None
        controlled_by_ta_id = None
        if any(controlled_by):
            controlled_by_accessions = list(
                set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(controlled_by_accessions,
                                                      default_project,
                                                      ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error(
                    "%s: Could not find controlled_by_ta for accessions %s",
                    experiment.get('accession'), controlled_by_accessions)
        else:
            # evaluate possible controls
            possible_controls = experiment.get('possible_controls')
            logging.warning(
                '%s: No controlled_by for rep%d, attempting to infer from possible_controls %s',
                experiment.get('accession'), repn, possible_controls)
            if not possible_controls or not any(possible_controls):
                logging.error(
                    '%s: Could not find controlled_by or resolve possible_controls for rep%d',
                    experiment.get('accession'), repn)
            else:
                control_ta = get_possible_ctl_ta(experiment, repn, server,
                                                 keypair, default_project,
                                                 ta_folders, used_controls)
                if control_ta:
                    controlled_by_ta_name = control_ta.get('name')
                    controlled_by_ta_id = control_ta.get('id')
                else:
                    # get_possible_ctl_ta returns None when nothing usable
                    # is found; record "no control" instead of crashing.
                    logging.error(
                        '%s: Could not find a control TA among possible_controls for rep%d',
                        experiment.get('accession'), repn)
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps',
                            controlled_by_ta_id, controlled_by_ta_name)
        used_controls.append(controlled_by_ta_id)
        # If encode repns are 1,2 then let the pipeline input rep numbers (1 or 2) be the same.
        # Otherwise the mapping is arbitrary, but at least do it with smaller rep number first.
        ta_index = 1 if repn == min(repns) else 2
        tas['rep%d_ta' % (ta_index)] = {
            'file_id': ta_file.get('id'),
            'project_id': ta_file.get('project'),
            'folder': ta_file.get('folder'),
            'file_name': ta_file.get('name'),
            'enc_fqs': get_encffs(ta_file.get('name')),
            'controlled_by': controlled_by_accessions,
            'controlled_by_name': controlled_by_ta_name,
            'control_id': controlled_by_ta_id,
            'enc_repn': repn,
            'paired_end': is_paired_end(ta_file),
        }

    return tas
def get_tas(experiment, server, keypair, default_project, ta_folders):
    """Collect the tagAlign (TA) file and control TA for each of two replicates.

    Steps:

    1. For each ``ta_folders`` entry (``project:path`` resolved through
       ``resolve_project``, or a bare path in ``default_project``), list the
       closed ``*tagAlign`` / ``*tagAlign.gz`` files found recursively under
       ``<path>/bams/<accession>/``.
    2. Infer the replicate number from a folder ending in ``/rep<N>``, keeping
       only the first file found per replicate.  Exactly two replicates are
       required.
    3. For each replicate, fetch the ENCODE files named in the TA filename
       (``get_encffs``) and gather their ``controlled_by``.  If any is set,
       look up the matching control TA with ``get_ta_from_accessions``;
       otherwise infer one from ``possible_controls`` with
       ``get_possible_ctl_ta`` (which remembers already-used controls).

    A control that cannot be resolved is logged and stored as ``None``.

    Returns:
        ``{'rep1_ta': {...}, 'rep2_ta': {...}}`` where the smaller replicate
        number is rep1 and each value holds ``file_id``, ``project_id``,
        ``folder``, ``file_name``, ``enc_fqs``, ``controlled_by``,
        ``controlled_by_name``, ``control_id``, ``enc_repn`` and
        ``paired_end``.  None is returned when a replicate number cannot be
        parsed or the number of replicates differs from two.
    """
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        # Make the folder absolute and slash-terminated before appending.
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug(
            "Looking for TA's in %s %s %s", project_id, project_name, path)
        for dxfile in dxpy.find_data_objects(
            classname='file',
            state='closed',
            folder=path + 'bams/%s/' % (exp_id),
            project=project_id,
            describe=True,
            recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files', len(possible_files))
    logging.debug('%s', [(f.get('folder'), f.get('name')) for f in possible_files])

    repns = []
    files_to_ignore = []
    for f in possible_files:
        # Raw string avoids the invalid '\d' escape of a plain literal.
        m = re.search(r'/rep(\d+)$', f['folder'])
        if not m:
            logging.error("Cannot parse rep number from %s", f['folder'])
            return None
        repn = int(m.group(1))
        logging.debug("Matched rep%d", repn)
        if repn in repns:
            logging.warning("Ignoring additional rep%d bam, using first found", repn)
            files_to_ignore.append(f)
        else:
            logging.debug("First time finding rep%d", repn)
            repns.append(repn)
    # Drop the duplicates so possible_files lines up with repns by index.
    for f in files_to_ignore:
        possible_files.remove(f)
    logging.debug('Discovered repns %s', repns)
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s", exp_id, len(repns), repns)
        return None

    tas = {}
    used_controls = []
    for i, repn in enumerate(repns):
        ta_desc = possible_files[i]
        encode_files = [common.encoded_get(server + '/files/%s/' % (f), keypair) for f in get_encffs(ta_desc.get('name'))]
        controlled_by = common.flat([f.get('controlled_by') for f in encode_files])
        # Defaults describe "no control resolved"; branches below overwrite.
        controlled_by_accessions = None
        controlled_by_ta_name = None
        controlled_by_ta_id = None
        if any(controlled_by):
            controlled_by_accessions = list(set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(controlled_by_accessions, default_project, ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error("%s: Could not find controlled_by_ta for accessions %s", experiment.get('accession'), controlled_by_accessions)
        else:
            # evaluate possible controls
            possible_controls = experiment.get('possible_controls')
            logging.warning('%s: No controlled_by for rep%d, attempting to infer from possible_controls %s', experiment.get('accession'), repn, possible_controls)
            if not possible_controls or not any(possible_controls):
                logging.error('%s: Could not find controlled_by or resolve possible_controls for rep%d', experiment.get('accession'), repn)
            else:
                control_ta = get_possible_ctl_ta(experiment, repn, server, keypair, default_project, ta_folders, used_controls)
                if control_ta is None:
                    # No usable control was found; previously this crashed
                    # with AttributeError on control_ta.get(...).
                    logging.error('%s: Could not find a control TA among possible_controls for rep%d', experiment.get('accession'), repn)
                else:
                    controlled_by_ta_name = control_ta.get('name')
                    controlled_by_ta_id = control_ta.get('id')
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps', controlled_by_ta_id, controlled_by_ta_name)
        used_controls.append(controlled_by_ta_id)
        # If encode repns are 1,2 then let the pipeline input rep numbers (1 or 2) be the same.
        # Otherwise the mapping is arbitrary, but at least do it with smaller rep number first.
        if repn == min(repns):
            ta_index = 1
        else:
            ta_index = 2
        tas.update(
            {'rep%d_ta' % (ta_index): {
                'file_id': ta_desc.get('id'),
                'project_id': ta_desc.get('project'),
                'folder': ta_desc.get('folder'),
                'file_name': ta_desc.get('name'),
                'enc_fqs': get_encffs(ta_desc.get('name')),
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(ta_desc)
                }
            }
        )

    return tas