def dispatcher(success_fp, fail_fp, partitions):
    """Dispatch execution over a pool of processors

    Parameters
    ----------
    success_fp : file-like object
        A file-like object to write a list of successful sample IDs to
    fail_fp : file-like object
        A file-like object to write a list of unsuccessful sample IDs to,
        and any associated error information
    partitions : Iterable of (function, Iterable of str)
        Yields a function and an iterable of IDs. It is expected that the
        functions yielded will have the following signature:

        {str: list} <- function(list of str)

    Notes
    -----
    Each ID maps to a list of error details. An empty (falsy) list marks the
    ID as successful; otherwise the details are tab-joined and written to
    `fail_fp`.

    The worker pool is always shut down before returning, including when a
    task raises.
    """
    if ag.is_test_env():
        logger = mp.log_to_stderr()
        logger.setLevel(logging.INFO)

    pool = mp.Pool(processes=agenv.get_cpu_count())
    try:
        success_fp.write('%s\n' % '#SampleID')
        fail_fp.write('%s\t%s\n' % ('#SampleID', 'Error(s)'))

        for func, ids in partitions:
            functor = partial(run_functor, func)
            chunks = list(agru.chunk_list(ids))
            for success_details in pool.map(functor, chunks):
                for id_, detail in success_details.items():
                    if detail:
                        fail_fp.write("%s\t%s\n" % (id_, '\t'.join(detail)))
                    else:
                        success_fp.write("%s\n" % id_)
    finally:
        # pool.map is synchronous, so no work is outstanding here; release
        # the worker processes rather than leaking them.
        pool.terminate()
        pool.join()
def fetch_study(study_accession, base_dir):
    """Fetch and dump a study

    Every sample yielded by ``fetch_study_details`` for the study is
    considered. A sample whose directory already exists under the study
    directory is skipped; otherwise its sequences are written as FASTA and
    its metadata XML is written alongside.

    Parameters
    ----------
    study_accession : str
        Accession ID for the study
    base_dir : str
        Path of base directory to save the fetched results

    Returns
    -------
    int
        The number of samples newly fetched. Always 0 in the test
        environment, where nothing is fetched.

    Notes
    -----
    If ``ag.staged_raw_data()`` is not None, the study directory is created
    as a symlink to that location instead of as a new directory.
    """
    if ag.is_test_env():
        return 0

    study_dir = os.path.join(base_dir, study_accession)

    staged = ag.staged_raw_data()
    if staged is not None:
        # os.symlink raises if the link name already exists, so skip it on
        # re-runs. lexists also catches an existing dangling link.
        if not os.path.lexists(study_dir):
            os.symlink(staged, study_dir)
    elif not os.path.exists(study_dir):
        os.mkdir(study_dir)

    # loop-invariant template for the per-sample metadata request
    url_fmt = ("http://www.ebi.ac.uk/ena/data/view/"
               "%(accession)s&display=xml")

    new_samples = 0
    for sample, fastq_url in fetch_study_details(study_accession):
        sample_dir = os.path.join(study_dir, sample)
        if os.path.exists(sample_dir):
            continue

        # fetch files if it isn't already present
        os.mkdir(sample_dir)
        metadata_path = os.path.join(sample_dir, '%s.txt' % sample)
        fasta_path = os.path.join(sample_dir, '%s.fna' % sample)

        # write out fasta (quality scores are discarded)
        with open(fasta_path, 'w') as fasta_out:
            for id_, seq, _ in parse_fastq(fetch_seqs_fastq(fastq_url)):
                fasta_out.write(">%s\n%s\n" % (id_, seq))

        # write mapping xml
        res = fetch_url(url_fmt % {'accession': sample})
        with open(metadata_path, 'w') as md_f:
            md_f.write(res.read())

        new_samples += 1

    return new_samples
def get_reference_set():
    """Get the reference set to use for OTU picking

    Returns
    -------
    str
        The file path to the reference sequences.
    str
        The file path to the reference taxonomy.

    Notes
    -----
    In the test environment the paths point at ``tests/data/otus.fna`` and
    ``tests/data/otus.txt`` inside the repository; otherwise they come from
    ``qdr``.
    """
    if not ag.is_test_env():
        return qdr.get_reference_sequences(), qdr.get_reference_taxonomy()

    repo_root = get_repository_dir()
    seqs_path = os.path.join(repo_root, 'tests/data/otus.fna')
    tax_path = os.path.join(repo_root, 'tests/data/otus.txt')
    return seqs_path, tax_path
def get_study_accessions():
    """Get the accessions to use, or redirect to test data

    Returns
    -------
    list of str
        The accessions, which are expected to be basenames for the actual
        data. For instance, the accession "foo" would have sequences as
        "foo.fna" and metadata as "foo.txt".

    Notes
    -----
    When ``ag.is_test_env()`` is true, the test accessions are staged first
    and a copy of the test accession list is returned. Otherwise a copy of
    the EBI accession list is returned.
    """
    if not ag.is_test_env():
        return _EBI_ACCESSIONS[:]

    _stage_test_accessions()
    return _TEST_ACCESSIONS[:]
def _get_data(data_dir, tag):
    """Get a non-AG table and mapping file

    Parameters
    ----------
    data_dir : str
        The base data path
    tag : str
        The filetag (e.g., HMPv35_100nt)

    Returns
    -------
    (str, str)
        The filepath to the table, and the filepath to the mapping file.

    Notes
    -----
    When ``ag.is_test_env()`` is true, paths are resolved under
    ``tests/data`` in the repository; otherwise under ``data``.

    Raises
    ------
    IOError
        If the filepaths are not accessible
    """
    repo = get_repository_dir()
    data = 'tests/data' if ag.is_test_env() else 'data'
    base = os.path.join(repo, data)

    table = os.path.join(base, data_dir, '%s.biom' % tag)
    mapping = os.path.join(base, data_dir, '%s.txt' % tag)

    # report the path that is actually missing
    for path in (table, mapping):
        if not os.path.exists(path):
            raise IOError("Unable to access: %s" % path)

    return table, mapping
def get_rarefaction_depth():
    """Return the rarefaction depth to use, as a string

    The depth is "100" in the test environment and "1000" otherwise.
    """
    return "100" if ag.is_test_env() else "1000"