def ProcessExperimentSeparate(experiment_id, json, batch_dir, sample_dir_id, preserve, failed_accession, skip_files=False):
    # Process one ENA experiment, creating ONE sample directory PER RUN.
    # Counterpart to ProcessExperimentCombined, which merges all runs of the
    # experiment into a single sample.
    #
    # :param experiment_id: ENA experiment (or run) accession
    # :param json: metadata JSON passed through to ExtractExperimentMetadata
    # :param batch_dir: batch output directory (cwd is changed into it)
    # :param sample_dir_id: next numeric sample-directory name to use
    # :param preserve: keep existing files in an existing sample dir
    # :param failed_accession: list mutated in place with failed run/experiment IDs
    # :param skip_files: skip downloading/moving fastq files (metadata only)
    # :return: updated sample_dir_id (incremented once per successfully created sample)
    m = ExtractExperimentMetadata(experiment_id, json)
    if m.valid_metadata():
        # Check if a run ID was submitted, and if so only process that
        if experiment_id in m.runIDs: m.runIDs = [experiment_id]
        # Process the runIDs as samples
        _logger.info("Found Following Runs: %s", ', '.join(m.runIDs))
        for runid in m.runIDs:
            with TemporaryDirectory() as tmpdir:
                os.chdir(batch_dir)
                # Sample dir is named by the running counter, not the accession.
                sample_dir = "%s/%s/"%(batch_dir, sample_dir_id)
                if os.path.exists(sample_dir):
                    # Reuse any fastq files already present in the sample dir.
                    sfiles = [x for x in os.listdir(sample_dir) if any([y in x for y in ['fq','fastq']])]
                else:
                    sfiles = []
                # NOTE(review): with the default skip_files=False this condition is
                # always True, so existing files are re-downloaded even when
                # preserve=True — ProcessExperimentCombined only skips the download
                # when preserved files exist. Possibly intended to be
                # `if not (preserve and sfiles) and not skip_files:` — confirm.
                if not preserve or not skip_files or len(sfiles) == 0:
                    sfiles = DownloadRunFiles(runid, tmpdir)
                if sfiles is not None:
                    success = CreateSampleDir(sfiles, m, sample_dir, preserve, skip_files)
                    if success:
                        sample_dir_id += 1
                    else:
                        failed_accession.append(runid)
                else:
                    _logger.error("Files could not be retrieved! (%s)", runid)
                    failed_accession.append(runid)
    else:
        _logger.error("Metadata Invalid! (%s) - %s", experiment_id, m.metadata.items())
        failed_accession.append(experiment_id)
    return sample_dir_id
def CreateSampleDir(sfiles, m, sample_dir, preserve=False, skip_files=False):
    """Create/refresh a sample directory, move run files into it and write metadata.

    :param sfiles: paths of downloaded fastq files (in a temporary directory)
    :param m: metadata object; ``m.metadata`` dict is updated and saved
    :param sample_dir: target sample directory (created if missing)
    :param preserve: if True, do not wipe an already-existing sample dir
    :param skip_files: if True, no files are required or moved (metadata only)
    :return: True on success, False on failure
    """
    sample_dir = str(sample_dir)
    if not skip_files and len(sfiles) == 0:
        _logger.error("Error: No files were found! (%s)", sample_dir)
        return False
    if not os.path.exists(sample_dir):
        _logger.info("Create sample dir: %s", sample_dir)
        # Create 'sample' dir
        os.mkdir(sample_dir)
        # Move files from tmpdir to sample dir
        for sf in sfiles:
            move(sf, sample_dir)
    elif not preserve and not skip_files:
        # Empty sample directory
        for fn in os.listdir(sample_dir):
            os.unlink("%s/%s"%(sample_dir, fn))
        # Move files from tmpdir to sample dir
        for sf in sfiles:
            move(sf, sample_dir)
    # Update and create metadata file
    try:
        m.metadata["file_names"] = ' '.join(
            [os.path.basename(sf).replace(' ','_') for sf in sfiles
             if not os.path.basename(sf) == 'meta.json']
        )
        m.save_metadata(sample_dir)
    except ValueError as e:  # BUGFIX: Py2-only 'except ValueError, e' syntax
        _logger.error(e)
        return False
    # BUGFIX: the function previously fell off the end and returned None,
    # which is falsy — callers testing `if success:` treated every successful
    # call as a failure. Return True explicitly on the success path.
    return True
def download_fastq_from_list(accession_list, output, json, preserve=False, all_runs_as_samples=False, skip_files=False):
    """ Get Fastq from list of IDs

    :param accession_list: path to a file with one ENA accession per line
    :param output: output folder name, created under the current directory
    :param json: metadata JSON passed through to the per-experiment processors
    :param preserve: keep files already present in existing sample dirs
    :param all_runs_as_samples: forwarded to ProcessExperiment (one sample per run)
    :param skip_files: skip fastq download/move (metadata only)
    """
    cwd = os.getcwd()
    with open(accession_list, 'r') as f:
        # Setup batch dir
        batch_dir = "%s/%s/"%(cwd, output)
        if not os.path.exists(batch_dir):
            os.mkdir(batch_dir)
        os.chdir(batch_dir)
        # Set logging
        # NOTE(review): 'acceession' is a typo, but the filename is runtime
        # behavior external tooling may rely on — kept as-is deliberately.
        _logger.Set(filename="%s/download-acceession-list.log"%batch_dir)
        # Count samples in accession_list, then rewind for the real pass.
        n_samples = sum(1 for l in f)
        f.seek(0)
        _logger.info("Number of samples to download: %s", n_samples)
        # Start progress bar
        pbar = ProgressBar(
            widgets = [ETA(), ' - ', Percentage(), ' : ', Bar()],
            maxval = n_samples
        ).start()
        pbar.update(0)
        failed_accession = []
        sample_dir_id = 0
        for i, l in enumerate(f):
            accession = l.strip()
            if accession == '':
                continue
            # Determine accession type from the 3-letter prefix.
            if accession[:3] in acctypes:
                accession_type = acctypes[accession[:3]]
            else:
                _logger.error("unknown accession type for '%s'!", accession)
                failed_accession.append(accession)
                continue
            _logger.info("Acc Found: %s (%s)", accession, accession_type)
            if accession_type in ['study', 'sample']:
                # Studies/samples expand to one or more experiments.
                for experiment_id in ExtractExperimentIDs_acc(accession):
                    sample_dir_id = ProcessExperiment(
                        experiment_id, json, batch_dir, sample_dir_id,
                        preserve, failed_accession, all_runs_as_samples,
                        skip_files)
            elif accession_type in ['experiment', 'run']:
                # CLEANUP: the 'experiment' and 'run' branches were identical
                # duplicates — merged into one.
                sample_dir_id = ProcessExperiment(
                    accession, json, batch_dir, sample_dir_id,
                    preserve, failed_accession, all_runs_as_samples,
                    skip_files)
            pbar.update(i)
        pbar.finish()
    if failed_accession:
        _logger.info("The following accessions were not downloaded!")
        _logger.info('\n'.join(failed_accession))
    else:
        _logger.info("All accessions downloaded succesfully!")
def DownloadRunFiles(runid, tmpdir):
    """Download the fastq files for a single run accession.

    :param runid: ENA run accession
    :param tmpdir: temporary directory the files are downloaded into
    :return: list of downloaded file paths, or None on failure
    """
    # Download run files
    try:
        s = Sequence(runid, tmpdir)
        s.download_fastq()
        if not s.error:
            _logger.info("Downloaded files: %s", ','.join(s.files))
            return s.files
        else:
            return None
    except ValueError as e:  # BUGFIX: Py2-only 'except ValueError, e' syntax
        _logger.error(e)
        return None
def download_fastq(self):
    ''' Download Fastq associated with Accession from ENA

    Runs the pre-built ``self.download`` command, records failures in the
    class-level ``__errors`` map and sets ``self.error``; on success stores
    the absolute paths of the downloaded files in ``self.files``.

    :return: None (status is reported via self.error / self.files)
    '''
    try:
        Path(self.dir).makedirs_p()
        # NOTE(review): call() with stdout=PIPE can deadlock if the child
        # produces a lot of output, since nothing drains the pipe — consider
        # redirecting to a file or devnull. Left unchanged here.
        retcode = call(self.download, stdout=PIPE)
    except OSError as e:
        _logger.error('FastQ Failed: %s [%s]', self.accession, e)
        _logger.error('CMD: %s', self.download)
        Sequence.__errors[self.accession] = 'FastQ Failed'
        self.error = True
    else:
        if retcode < 0:
            _logger.error('Child was terminated by signal')
            self.error = True
            # BUGFIX: the original implicit string concatenation
            # 'Child was''terminated''(signal)' produced the garbled message
            # "Child wasterminated(signal)" — spaces restored.
            Sequence.__errors[self.accession] = ('Child was '
                                                'terminated '
                                                '(signal)')
        else:
            _logger.info('Success: %s', self.accession)
            self.files = [f.abspath() for f in Path(self.dir).files()]
            Sequence.__sequence_id += 1
def ProcessExperimentCombined(experiment_id, json, batch_dir, sample_dir_id, preserve, failed_accession, skip_files=False):
    # Process one ENA experiment, combining ALL of its runs into ONE sample.
    # Counterpart to ProcessExperimentSeparate, which makes one sample per run.
    #
    # :param experiment_id: ENA experiment (or run) accession
    # :param json: metadata JSON passed through to ExtractExperimentMetadata
    # :param batch_dir: batch output directory (cwd is changed into it)
    # :param sample_dir_id: next numeric sample-directory name to use
    # :param preserve: reuse fastq files already present in the sample dir
    # :param failed_accession: list mutated in place with failed accessions
    # :param skip_files: skip downloading/combining fastq files (metadata only)
    # :return: updated sample_dir_id (incremented on success)
    m = ExtractExperimentMetadata(experiment_id, json)
    if m.valid_metadata():
        # Check if a run ID was submitted, and if so only process that
        if experiment_id in m.runIDs:
            m.runIDs = [experiment_id]
        # Process the runs as one sample
        _logger.info("Found Following Runs: %s", ', '.join(m.runIDs))
        with TemporaryDirectory() as tmpdir:
            os.chdir(batch_dir)
            sample_dir = "%s/%s/"%(batch_dir, sample_dir_id)
            csfiles = []
            if preserve and os.path.exists(sample_dir):
                # Reuse previously downloaded/combined fastq files.
                csfiles = [x for x in os.listdir(sample_dir)
                           if any([y in x for y in ['fq','fastq']])]
            if csfiles == [] and not skip_files:
                # Download each run's file set; sfiles is a list of lists,
                # one inner list per run (e.g. paired-end R1/R2).
                sfiles = []
                for runid in m.runIDs:
                    sf = DownloadRunFiles(runid, tmpdir)
                    if sf is not None:
                        sfiles.append(sf)
                    else:
                        _logger.error("Run files could not be retrieved! (%s)", runid)
                _logger.info("Found Following files sets:\n%s\n",
                             '\n'.join([', '.join(sf) for sf in sfiles]))
                # Combine sfiles into one entry
                if len(sfiles) > 1:
                    # zip(*sfiles) pairs up the n-th file of every run.
                    for file_no, file_set in enumerate(zip(*sfiles)):
                        ext = '.'.join(file_set[0].split('/')[-1].split('.')[1:])
                        if len(sfiles[0]) > 1:
                            new_file = "%s_%s.combined.%s"%(experiment_id, file_no+1, ext)
                        else:
                            new_file = "%s.combined.%s"%(experiment_id, ext)
                        # BUGFIX: open the target in binary mode ('wb'). The
                        # sources are read as bytes ('rb'); writing bytes to a
                        # text-mode ('w') handle raises TypeError on Python 3
                        # and corrupts gzipped fastq on Windows under Python 2.
                        with open(new_file, 'wb') as nf:
                            for fn in file_set:
                                with open(fn, 'rb') as f:
                                    nf.write(f.read())
                        if os.path.exists(new_file):
                            csfiles.append(new_file)
                        else:
                            _logger.error("Combined file creation failed! (%s: %s)",
                                          experiment_id, file_no)
                            break
                elif sfiles and isinstance(sfiles[0], list):
                    # BUGFIX: guard against sfiles == [] (every download
                    # failed) — the original unconditionally indexed
                    # sfiles[0] here and raised IndexError.
                    csfiles = sfiles[0]
                if csfiles == []:
                    _logger.error("Files could not be combined! (%s)", experiment_id)
                    failed_accession.append(experiment_id)
            if csfiles != [] or skip_files:
                success = CreateSampleDir(csfiles, m, sample_dir, preserve, skip_files)
                if success:
                    sample_dir_id += 1
                else:
                    failed_accession.append(experiment_id)
            else:
                # Reconstructed log line (the literal was split across source lines).
                _logger.error("Files could not be retrieved! (%s)", experiment_id)
                failed_accession.append(experiment_id)
    else:
        _logger.error("Metadata Invalid! (%s) - %s",
                      experiment_id, m.metadata.items())
        failed_accession.append(experiment_id)
    return sample_dir_id