def main():
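	"""Download narrowPeak/gappedPeak files produced by each DNAnexus analysis
	and write one TSV row of file, experiment, and biosample metadata per file."""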

	args = get_args()
	if args.debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.analysis_ids:
		ids = args.analysis_ids
	else:
		ids = args.infile

	formats = ['bed_narrowPeak', 'bed_gappedPeak']
	fieldnames = [
		'file', 'analysis', 'experiment', 'replicates', 'output_name',
		'file_format', 'output_type', 'target', 'biosample_term_name',
		'biosample_term_id', 'biosample_type', 'biosample_life_stage',
		'biosample_age', 'biosample_organism']
	writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
	writer.writeheader()
	for (i, analysis_id) in enumerate(ids):
		analysis_id = analysis_id.rstrip()
		logger.info('%s' %(analysis_id))
		try:
			files = analysis_files(analysis_id, keypair, server, args.assembly)
		except Exception:
			logger.error('%s error finding analysis_files.  Check experiment metadata.' %(analysis_id))
			continue
		for f in [f_obj for f_obj in files if f_obj.get('file_format') in formats]:
			fid = f['dx'].get_id()
			local_path = os.path.join(args.outdir,fid)
			if not os.path.isfile(local_path):
				if not os.path.exists(args.outdir):
					os.makedirs(args.outdir)
				dxpy.download_dxfile(fid, local_path)
			replicates = []
			for derived_from in f['derived_from']:
				rep_ns = common.biorep_ns(derived_from, server, keypair)
				for r in rep_ns:
					replicates.append(r)
			experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(f['dataset'])), keypair)
			rep = common.encoded_get(urlparse.urljoin(server, experiment['replicates'][0]), keypair)
			lib = common.encoded_get(urlparse.urljoin(server, rep['library']), keypair)
			biosample = common.encoded_get(urlparse.urljoin(server, lib['biosample']), keypair)
			writer.writerow({
				'file': fid,
				'analysis': analysis_id,
				'experiment': experiment.get('accession'),
				'replicates': replicates,
				'output_name': f.get('name'),
				'file_format': f.get('file_format'),
				'output_type': f.get('output_type'),
				'target': experiment.get('target'),
				'biosample_term_name': experiment.get('biosample_term_name'),
				'biosample_term_id': experiment.get('biosample_term_id'),
				'biosample_type': experiment.get('biosample_type'),
				'biosample_life_stage': biosample.get('life_stage'),
				'biosample_age': biosample.get('age'),
				'biosample_organism': biosample.get('organism')})
def main():
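    """For each experiment, group fastqs by biological replicate, pair up
    paired-end mates, and launch mapping jobs via map_only()."""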
    global args
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = csv.reader(
            StringIO.StringIO('\n'.join([s.rstrip()
                                         for s in args.experiments])))
    else:
        exp_ids = csv.reader(args.infile)

    for instring in exp_ids:
        if not instring:  # skip blank lines in the input
            continue
        exp_id = instring[0].strip()
        if len(instring) > 1:
            repns = []
            for s in instring[1:]:
                repns.extend(s.split(','))
            biorep_ns = list(set([int(s) for s in repns]))
        else:
            biorep_ns = []
        outstrings = []
        encode_url = urlparse.urljoin(server, exp_id)
        experiment = common.encoded_get(encode_url, keypair)
        outstrings.append(exp_id)
        files = files_to_map(experiment, server, keypair, args.sfn_dupes)
        outstrings.append(str(len(files)))
        outstrings.append(str([f.get('accession') for f in files]))
        replicates = replicates_to_map(files, server, keypair, biorep_ns)

        if files:
            for biorep_n in set(
                [rep.get('biological_replicate_number')
                 for rep in replicates]):
                outstrings.append('rep%s' % (biorep_n))
                biorep_files = [
                    f for f in files
                    if biorep_n in common.biorep_ns(f, server, keypair)
                ]
                paired_files = []
                unpaired_files = []
                while biorep_files:
                    file_object = biorep_files.pop()
                    if file_object.get('paired_end') is None:
                        # group all the unpaired reads for this biorep together
                        unpaired_files.append(file_object)
                    elif file_object.get('paired_end') in ['1', '2']:
                        if file_object.get('paired_with'):
                            mate = next(
                                (f for f in biorep_files
                                 if f.get('@id') == file_object.get('paired_with')),
                                None)
                        else:  # have to find the file that is paired with this one
                            mate = next(
                                (f for f in biorep_files
                                 if f.get('paired_with') == file_object.get('@id')),
                                None)
                        if mate:
                            biorep_files.remove(mate)
                        else:
                            logging.warning('%s:%s could not find mate' %
                                            (experiment.get('accession'),
                                             file_object.get('accession')))
                            mate = {}
                        paired_files.append((file_object, mate))
                if biorep_files:
                    logging.warning(
                        '%s: leftover file(s) %s' %
                        (experiment.get('accession'), biorep_files))
                if paired_files:
                    pe_jobs = map_only(experiment, biorep_n, paired_files,
                                       args.key, server, keypair)
                if unpaired_files:
                    se_jobs = map_only(experiment, biorep_n, unpaired_files,
                                       args.key, server, keypair)
                if paired_files and pe_jobs:
                    outstrings.append(
                        'paired:%s' %
                        ([(a.get('accession'), b.get('accession'))
                          for (a, b) in paired_files]))
                    outstrings.append('paired jobs:%s' %
                                      ([j.get_id() for j in pe_jobs]))
                else:
                    outstrings.append('paired:%s' % (None))
                if unpaired_files and se_jobs:
                    outstrings.append(
                        'unpaired:%s' %
                        ([f.get('accession') for f in unpaired_files]))
                    outstrings.append('unpaired jobs:%s' %
                                      ([j.get_id() for j in se_jobs]))
                else:
                    outstrings.append('unpaired:%s' % (None))

        print('\t'.join(outstrings))
        else:  # no files
            if not replicates:
                logging.warning('%s: No files and no replicates' %
                                experiment.get('accession'))
            else:
                logging.warning('%s: No files to map' %
                                experiment.get('accession'))
        if files and not replicates:
            logging.warning('%s: Files but no replicates' %
                            experiment.get('accession'))
def main():
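    """Pair up fastqs per biological replicate and launch mapping jobs;
    optionally maps paired data as single-end (args.force_se) and patches the
    experiment's internal_status to 'processing' once jobs are launched."""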
    global args
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid,authpw)

    if args.experiments:
        exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments])))
    else:
        exp_ids = csv.reader(args.infile)

    for instring in exp_ids:
        if not instring:  # skip blank lines in the input
            continue
        exp_id = instring[0].strip()
        if len(instring) > 1:
            repns = []
            for s in instring[1:]:
                repns.extend(s.split(','))
            biorep_ns = list(set([int(s) for s in repns]))
        else:
            biorep_ns = []
        outstrings = []
        encode_url = urlparse.urljoin(server,exp_id)
        experiment = common.encoded_get(encode_url, keypair)
        outstrings.append(exp_id)
        files = files_to_map(experiment, server, keypair, args.no_sfn_dupes)
        outstrings.append(str(len(files)))
        outstrings.append(str([f.get('accession') for f in files]))
        replicates = replicates_to_map(files, server, keypair, biorep_ns)
        in_process = False
        if files:
            for biorep_n in set([rep.get('biological_replicate_number') for rep in replicates]):
                outstrings.append('rep%s' %(biorep_n))
                biorep_files = [f for f in files if biorep_n in common.biorep_ns(f,server,keypair)]
                paired_files = []
                unpaired_files = []
                while biorep_files:
                    file_object = biorep_files.pop()
                    if file_object.get('paired_end') is None:  # group all the unpaired reads for this biorep together
                        unpaired_files.append(file_object)
                    elif file_object.get('paired_end') in ['1','2']:
                        if file_object.get('paired_with'):
                            mate = next((f for f in biorep_files if f.get('@id') == file_object.get('paired_with')), None)
                        else: #have to find the file that is paired with this one
                            mate = next((f for f in biorep_files if f.get('paired_with') == file_object.get('@id')), None)
                        if mate:
                            biorep_files.remove(mate)
                        else:
                            logging.warning('%s:%s could not find mate' %(experiment.get('accession'), file_object.get('accession')))
                            mate = {}

                        # if mapping as SE, ignore the mate and just map the
                        # rep1 as SE with all the other SE for this rep, if any
                        if args.force_se:
                            unpaired_files.append(next(
                                f for f in [file_object, mate]
                                if f.get('paired_end') == '1'))
                        else:
                            paired_files.append((file_object, mate))

                if biorep_files:
                    logging.warning('%s: leftover file(s) %s' %(experiment.get('accession'), biorep_files))
                if paired_files:
                    pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair, args.sex_specific)
                    in_process = True
                if unpaired_files:
                    se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair, args.sex_specific)
                    in_process = True
                if paired_files and pe_jobs:
                    outstrings.append('paired:%s' %([(a.get('accession'), b.get('accession')) for (a,b) in paired_files]))
                    outstrings.append('paired jobs:%s' %([j.get_id() for j in pe_jobs]))
                else:
                    outstrings.append('paired:%s' %(None))
                if unpaired_files and se_jobs:
                    outstrings.append('unpaired:%s' %([f.get('accession') for f in unpaired_files]))
                    outstrings.append('unpaired jobs:%s' %([j.get_id() for j in se_jobs]))
                else:
                    outstrings.append('unpaired:%s' %(None))
            if in_process:
                r = common.encoded_patch(encode_url, keypair, {"internal_status": "processing"}, return_response=True)
                try:
                    r.raise_for_status()
                except Exception:
                    logging.error("Tried and failed to set internal_status")
                    logging.error(r.text)
            print('\t'.join(outstrings))
        else: # no files
            if not replicates:
                logging.warning('%s: No files and no replicates' %experiment.get('accession'))
            else:
                logging.warning('%s: No files to map' %experiment.get('accession'))
        if files and not replicates:
            logging.warning('%s: Files but no replicates' %experiment.get('accession'))
def main():
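    """Flag bams that are older than the fastqs they were derived from, i.e.
    mappings that are out-of-date relative to their input reads."""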

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(
            args.query, auth=keypair, headers={"content-type": "application/json", "accept": "application/json"}
        )
        experiments = r.json()["@graph"]
        exp_ids = [e["accession"] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        logger.info("%s" % (exp_id))

        url = urlparse.urljoin(server, "/experiments/%s" % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [
            common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair)
            for uri in experiment_object.get("original_files")
        ]
        bams = [
            f
            for f in original_files
            if f.get("file_format") == "bam" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
        fastqs = [
            f
            for f in original_files
            if f.get("file_format") == "fastq" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
        for f in fastqs:
            f["replicate"] = common.encoded_get(urlparse.urljoin(server, "%s" % (f.get("replicate"))), keypair)
        for bam in bams:
            bioreps = common.biorep_ns(bam.get("accession"), server, keypair)
            if len(bioreps) != 1:
                logger.error(
                    "Expected to find 1 biorep for bam %s, found %d.  Skipping." % (bam.get("accession"), len(bioreps))
                )
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [
                    common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair)
                    for uri in bam.get("derived_from")
                ]
            except Exception:
                derived_from = None
            if not derived_from:
                logger.error("bam %s is derived from nothing. Skipping" % (bam.get("accession")))
                continue
            for f in derived_from:
                if f.get("file_format") != "fastq":
                    logger.error(
                        "bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files."
                        % (bam.get("accession"), f.get("accession"))
                    )
                    continue
                try:
                    if common.after(f.get("date_created"), bam.get("date_created")):
                        logger.error(
                            "Date conflict. Bam %s is derived from newer Fastq %s"
                            % (bam.get("accession"), f.get("accession"))
                        )
                except Exception:
                    logger.error(
                        "Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files."
                        % (bam.get("date_created"), f.get("date_created"))
                    )
                    continue
            for f in fastqs:
                if f.get("replicate").get("biological_replicate_number") == bam_biorep:
                    if common.after(f.get("date_created"), bam.get("date_created")):
                        logger.info(
                            "bam %s is out-of-date.  fastq %s is newer" % (bam.get("accession"), f.get("accession"))
                        )
                        if re.search("control", experiment_object.get("target").lower()):
                            logger.info(
                                "WARNING, %s is a control experiment so many other experiments may be out-of-date."
                                % (experiment_object.get("accession"))
                            )
def main():
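	"""Group fastqs by biological replicate, pair up paired-end mates, and
	launch mapping jobs via map_only()."""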
	global args
	args = get_args()

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.experiments:
		exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments])))
	else:
		exp_ids = csv.reader(args.infile)

	for instring in exp_ids:
		if not instring:  # skip blank lines in the input
			continue
		exp_id = instring[0].strip()
		if len(instring) > 1:
			repns = []
			for s in instring[1:]:
				repns.extend(s.split(','))
			biorep_ns = list(set([int(s) for s in repns]))
		else:
			biorep_ns = []
		outstrings = []
		encode_url = urlparse.urljoin(server,exp_id)
		experiment = common.encoded_get(encode_url, keypair)
		outstrings.append(exp_id)
		files = files_to_map(experiment, server, keypair, args.sfn_dupes)
		outstrings.append(str(len(files)))
		outstrings.append(str([f.get('accession') for f in files]))
		replicates = replicates_to_map(files, server, keypair, biorep_ns)

		if files:
			for biorep_n in set([rep.get('biological_replicate_number') for rep in replicates]):
				outstrings.append('rep%s' %(biorep_n))
				biorep_files = [f for f in files if biorep_n in common.biorep_ns(f,server,keypair)]
				paired_files = []
				unpaired_files = []
				while biorep_files:
					file_object = biorep_files.pop()
					if file_object.get('paired_end') is None:  # group all the unpaired reads for this biorep together
						unpaired_files.append(file_object)
					elif file_object.get('paired_end') in ['1','2']:
						if file_object.get('paired_with'):
							mate = next((f for f in biorep_files if f.get('@id') == file_object.get('paired_with')), None)
						else: #have to find the file that is paired with this one
							mate = next((f for f in biorep_files if f.get('paired_with') == file_object.get('@id')), None)
						if mate:
							biorep_files.remove(mate)
						else:
							logging.warning('%s:%s could not find mate' %(experiment.get('accession'), file_object.get('accession')))
							mate = {}
						paired_files.append((file_object,mate))
				if biorep_files:
					logging.warning('%s: leftover file(s) %s' %(experiment.get('accession'), biorep_files))
				if paired_files:
					pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair)
				if unpaired_files:
					se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair)
				if paired_files and pe_jobs:
					outstrings.append('paired:%s' %([(a.get('accession'), b.get('accession')) for (a,b) in paired_files]))
					outstrings.append('paired jobs:%s' %([j.get_id() for j in pe_jobs]))
				else:
					outstrings.append('paired:%s' %(None))
				if unpaired_files and se_jobs:
					outstrings.append('unpaired:%s' %([f.get('accession') for f in unpaired_files]))
					outstrings.append('unpaired jobs:%s' %([j.get_id() for j in se_jobs]))
				else:
					outstrings.append('unpaired:%s' %(None))

			print('\t'.join(outstrings))
		else: # no files
			if not replicates:
				logging.warning('%s: No files and no replicates' %experiment.get('accession'))
			else:
				logging.warning('%s: No files to map' %experiment.get('accession'))
		if files and not replicates:
			logging.warning('%s: Files but no replicates' %experiment.get('accession'))
def main():
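    """Check each experiment for bams that predate the fastqs they were
    derived from, ignoring reference files, and note control experiments."""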

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid,authpw)

    if args.query:
        r = requests.get(args.query, auth=keypair, headers={'content-type': 'application/json', 'accept': 'application/json'})
        experiments = r.json()['@graph']
        exp_ids = [e['accession'] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    logger.info('Checking %d experiments' % (len(exp_ids)))
    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        #logger.info('%s' %(exp_id))

        url = urlparse.urljoin(server, '/experiments/%s' %(exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [common.encoded_get(urlparse.urljoin(server,'%s' %(uri)), keypair) for uri in experiment_object.get('original_files')]
        bams = [f for f in original_files if f.get('file_format') == 'bam' and f.get('status') not in ['revoked','deleted','replaced']]
        fastqs = [f for f in original_files if f.get('file_format') == 'fastq' and f.get('status') not in ['revoked','deleted','replaced']]
        for f in fastqs:
            f['replicate'] = common.encoded_get(urlparse.urljoin(server,'%s' %(f.get('replicate'))), keypair)
        for bam in bams:
            bioreps = common.biorep_ns(bam.get('accession'),server,keypair)
            if len(bioreps) != 1:
                logger.error("Expected to find 1 biorep for bam %s, found %s.  Skipping." %(bam.get('accession'), bioreps))
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [common.encoded_get(urlparse.urljoin(server,'%s' %(uri)), keypair) for uri in bam.get('derived_from')]
            except Exception:
                derived_from = None
            if not derived_from:
                logger.error('bam %s is derived from nothing. Skipping' %(bam.get('accession')))
                continue
            for f in derived_from:
                if f.get('output_category') == 'reference':
                    continue
                if f.get('file_format') != 'fastq':
                    logger.error("bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files." %(bam.get('accession'), f.get('accession')))
                    continue
                try:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.error("Date conflict. Bam %s is derived from newer Fastq %s" %(bam.get('accession'), f.get('accession')))
                except Exception:
                    logger.error("Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files." %(bam.get('date_created'), f.get('date_created')))
                    continue
            for f in fastqs:
                if f.get('replicate').get('biological_replicate_number') == bam_biorep:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.info("bam %s is out-of-date.  fastq %s is newer" %(bam.get('accession'), f.get('accession')))
                        if re.search('control',experiment_object.get('target').lower()):
                            logger.info("WARNING, %s is a control experiment so many other experiments may be out-of-date." %(experiment_object.get('accession')))
def main():
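    """Pair up fastqs per biological replicate and launch mapping jobs with
    additional options (crop length, accessioning, fastq checks); skips
    comment rows and patches internal_status when jobs are in process."""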
    global args
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = csv.reader(
            StringIO.StringIO('\n'.join([s.rstrip()
                                         for s in args.experiments])))
    else:
        exp_ids = csv.reader(args.infile)

    for row in exp_ids:
        if not row or row[0].startswith('#'):  # skip blank lines and comments
            continue
        exp_id = row[0].strip()
        if len(row) > 1:
            repns = []
            for s in row[1:]:
                repns.extend(s.split(','))
            map_only_reps = list(set([int(s) for s in repns]))
        else:
            map_only_reps = []
        outstrings = []
        encode_url = urlparse.urljoin(server, exp_id)
        experiment = common.encoded_get(encode_url, keypair)
        outstrings.append(exp_id)
        files = files_to_map(experiment, server, keypair, args.no_sfn_dupes)
        outstrings.append(str(len(files)))
        outstrings.append(str([f.get('accession') for f in files]))
        replicates = replicates_to_map(files, server, keypair, map_only_reps)
        biorep_numbers = \
            set([rep.get('biological_replicate_number') for rep in replicates])
        in_process = False
        if files:
            for biorep_n in biorep_numbers:
                outstrings.append('rep%s' % (biorep_n))
                biorep_files = [
                    f for f in files
                    if biorep_n in common.biorep_ns(f, server, keypair)
                ]
                paired_files = []
                unpaired_files = []
                while biorep_files:
                    file_object = biorep_files.pop()
                    if file_object.get('paired_end') is None:
                        # group all the unpaired reads for this biorep together
                        unpaired_files.append(file_object)
                    elif file_object.get('paired_end') in ['1', '2']:
                        if file_object.get('paired_with'):
                            mate = next(
                                (f for f in biorep_files
                                 if f.get('@id') == file_object.get('paired_with')),
                                None)
                        else:  # have to find the file that is paired with this one
                            mate = next(
                                (f for f in biorep_files
                                 if f.get('paired_with') == file_object.get('@id')),
                                None)
                        if mate:
                            biorep_files.remove(mate)
                        else:
                            logging.warning('%s:%s could not find mate' %
                                            (experiment.get('accession'),
                                             file_object.get('accession')))
                            mate = {}

                        # if mapping as SE, ignore the mate and just map the
                        # rep1 as SE with all the other SE for this rep, if any
                        if args.force_se:
                            unpaired_files.append(
                                next(f for f in [file_object, mate]
                                     if f.get('paired_end') == '1'))
                        else:
                            paired_files.append((file_object, mate))

                if biorep_files:
                    logging.warning(
                        '%s: leftover file(s) %s' %
                        (experiment.get('accession'), biorep_files))
                if paired_files:
                    pe_jobs = \
                        map_only(experiment, biorep_n, paired_files,
                                 server, keypair, args.sex_specific,
                                 args.crop_length, args.accession,
                                 args.fqcheck, args.force_patch,
                                 args.use_existing_folders, args.encoded_check)
                    in_process = True
                if unpaired_files:
                    se_jobs = \
                        map_only(experiment, biorep_n, unpaired_files,
                                 server, keypair, args.sex_specific,
                                 args.crop_length, args.accession,
                                 args.fqcheck, args.force_patch,
                                 args.use_existing_folders, args.encoded_check)
                    in_process = True
                if paired_files and pe_jobs:
                    outstrings.append(
                        'paired:%s' %
                        ([(a.get('accession'), b.get('accession'))
                          for (a, b) in paired_files]))
                    outstrings.append('paired jobs:%s' %
                                      ([j.get_id() for j in pe_jobs]))
                else:
                    outstrings.append('paired:%s' % (None))
                if unpaired_files and se_jobs:
                    outstrings.append(
                        'unpaired:%s' %
                        ([f.get('accession') for f in unpaired_files]))
                    outstrings.append('unpaired jobs:%s' %
                                      ([j.get_id() for j in se_jobs]))
                else:
                    outstrings.append('unpaired:%s' % (None))
            if in_process:
                r = common.encoded_patch(encode_url,
                                         keypair,
                                         {"internal_status": "processing"},
                                         return_response=True)
                try:
                    r.raise_for_status()
                except Exception:
                    logging.error("Tried and failed to set internal_status")
                    logging.error(r.text)
            print('\t'.join(outstrings))
        else:  # no files
            if not replicates:
                logging.warning('%s: No files and no replicates' %
                                experiment.get('accession'))
            else:
                logging.warning('%s: No files to map' %
                                experiment.get('accession'))
        if files and not replicates:
            logging.warning('%s: Files but no replicates' %
                            experiment.get('accession'))
def main():
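    """Download narrowPeak/gappedPeak files produced by each DNAnexus analysis
    and write one TSV row of file, experiment, and biosample metadata per file."""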

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    else:
        ids = args.infile

    formats = ['bed_narrowPeak', 'bed_gappedPeak']
    fieldnames = [
        'file', 'analysis', 'experiment', 'replicates', 'output_name',
        'file_format', 'output_type', 'target', 'biosample_term_name',
        'biosample_term_id', 'biosample_type', 'biosample_life_stage',
        'biosample_age', 'biosample_organism'
    ]
    writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
    writer.writeheader()
    for (i, analysis_id) in enumerate(ids):
        analysis_id = analysis_id.rstrip()
        logger.info('%s' % (analysis_id))
        try:
            files = analysis_files(analysis_id, keypair, server, args.assembly)
        except Exception:
            logger.error(
                '%s error finding analysis_files.  Check experiment metadata.'
                % (analysis_id))
            continue
        for f in [f_obj for f_obj in files if f_obj.get('file_format') in formats]:
            fid = f['dx'].get_id()
            local_path = os.path.join(args.outdir, fid)
            if not os.path.isfile(local_path):
                if not os.path.exists(args.outdir):
                    os.makedirs(args.outdir)
                dxpy.download_dxfile(fid, local_path)
            replicates = []
            for derived_from in f['derived_from']:
                rep_ns = common.biorep_ns(derived_from, server, keypair)
                for r in rep_ns:
                    replicates.append(r)
            experiment = common.encoded_get(
                urlparse.urljoin(server, '/experiments/%s' % (f['dataset'])),
                keypair)
            rep = common.encoded_get(
                urlparse.urljoin(server, experiment['replicates'][0]), keypair)
            lib = common.encoded_get(urlparse.urljoin(server, rep['library']),
                                     keypair)
            biosample = common.encoded_get(
                urlparse.urljoin(server, lib['biosample']), keypair)
            writer.writerow({
                'file': fid,
                'analysis': analysis_id,
                'experiment': experiment.get('accession'),
                'replicates': replicates,
                'output_name': f.get('name'),
                'file_format': f.get('file_format'),
                'output_type': f.get('output_type'),
                'target': experiment.get('target'),
                'biosample_term_name': experiment.get('biosample_term_name'),
                'biosample_term_id': experiment.get('biosample_term_id'),
                'biosample_type': experiment.get('biosample_type'),
                'biosample_life_stage': biosample.get('life_stage'),
                'biosample_age': biosample.get('age'),
                'biosample_organism': biosample.get('organism')})
def main():
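    """Check each experiment's bams against the fastqs they were derived from
    and report bams that are older than their input reads."""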

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(args.query,
                         auth=keypair,
                         headers={
                             'content-type': 'application/json',
                             'accept': 'application/json'
                         })
        experiments = r.json()['@graph']
        exp_ids = [e['accession'] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    logger.info('Checking %d experiments' % (len(exp_ids)))
    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        #logger.info('%s' %(exp_id))

        url = urlparse.urljoin(server, '/experiments/%s' % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [
            common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
            for uri in experiment_object.get('original_files')
        ]
        bams = [
            f for f in original_files if f.get('file_format') == 'bam'
            and f.get('status') not in ['revoked', 'deleted', 'replaced']
        ]
        fastqs = [
            f for f in original_files if f.get('file_format') == 'fastq'
            and f.get('status') not in ['revoked', 'deleted', 'replaced']
        ]
        for f in fastqs:
            f['replicate'] = common.encoded_get(
                urlparse.urljoin(server, '%s' % (f.get('replicate'))), keypair)
        for bam in bams:
            bioreps = common.biorep_ns(bam.get('accession'), server, keypair)
            if len(bioreps) != 1:
                logger.error(
                    "Expected to find 1 biorep for bam %s, found %s.  Skipping."
                    % (bam.get('accession'), bioreps))
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [
                    common.encoded_get(urlparse.urljoin(server, '%s' % (uri)),
                                       keypair)
                    for uri in bam.get('derived_from')
                ]
            except Exception:
                derived_from = None
            if not derived_from:
                logger.error('bam %s is derived from nothing. Skipping' %
                             (bam.get('accession')))
                continue
            for f in derived_from:
                if f.get('output_category') == 'reference':
                    continue
                if f.get('file_format') != 'fastq':
                    logger.error(
                        "bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files."
                        % (bam.get('accession'), f.get('accession')))
                    continue
                try:
                    if common.after(f.get('date_created'),
                                    bam.get('date_created')):
                        logger.error(
                            "Date conflict. Bam %s is derived from newer Fastq %s"
                            % (bam.get('accession'), f.get('accession')))
                except Exception:
                    logger.error(
                        "Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files."
                        % (bam.get('date_created'), f.get('date_created')))
                    continue
            for f in fastqs:
                if f.get('replicate').get('biological_replicate_number') == bam_biorep:
                    if common.after(f.get('date_created'),
                                    bam.get('date_created')):
                        logger.info(
                            "bam %s is out-of-date.  fastq %s is newer" %
                            (bam.get('accession'), f.get('accession')))
                        if re.search('control',
                                     experiment_object.get('target').lower()):
                            logger.info(
                                "WARNING, %s is a control experiment so many other experiments may be out-of-date."
                                % (experiment_object.get('accession')))