def test_massive_offline(block_internet, tmp_path):
    """Verify that MassIVE projects resolve correctly with no internet access."""
    # Resolving a MassIVE accession should not need a network connection.
    ppx.find_project(MSVID)

    # The repo argument is matched case-insensitively.
    project = ppx.find_project(MSVID, repo="mAsSiVe")
    assert isinstance(project, ppx.MassiveProject)
    assert project.local == tmp_path / MSVID
    assert not project.fetch

    # An explicit local directory overrides the default location.
    custom = ppx.find_project(MSVID, local=(tmp_path / "test"), repo="massive")
    assert custom.local == tmp_path / "test"

    # The fetch flag is passed through to the project object.
    fetching = ppx.find_project(MSVID, fetch=True, repo="massive")
    assert fetching.fetch
def test_pride_offline(block_internet, tmp_path):
    """Verify that PRIDE projects resolve correctly with no internet access."""
    # Without a repo hint, resolving a PXD accession needs the network,
    # so it must fail while the internet is blocked.
    with pytest.raises(Exception):
        ppx.find_project(PXID)

    # The repo argument is matched case-insensitively.
    project = ppx.find_project(PXID, repo="PrIdE")
    assert isinstance(project, ppx.PrideProject)
    assert project.local == tmp_path / PXID
    assert not project.fetch

    # An explicit local directory overrides the default location.
    custom = ppx.find_project(PXID, local=(tmp_path / "test"), repo="pride")
    assert custom.local == tmp_path / "test"

    # The fetch flag is passed through to the project object.
    fetching = ppx.find_project(PXID, fetch=True, repo="pride")
    assert fetching.fetch
def prepare_data(self):
    """Verify that the spectrum indices contain the expected spectra.

    If a mass spectrometry data file is missing from a spectrum index,
    this function will download the missing mzML file and add it.
    """
    indices = {
        "train": self.train_index,
        "validation": self.valid_index,
        "test": self.test_index,
    }

    for split_name, split_projects in self.splits.items():
        # Rejected projects are tracked but never indexed.
        if split_name == "rejected":
            continue

        target_index = indices[split_name]
        for accession, filenames in split_projects.items():
            for filename in filenames:
                expected = str(Path(accession, filename))
                # Substring match: the index may store longer paths.
                if any(expected in indexed for indexed in target_index.ms_files):
                    continue

                # Missing from the index — fetch it and register it.
                project = ppx.find_project(accession)
                local_file = project.download(filename)[0]
                target_index.add_file(local_file)
                if not self.keep_files:
                    local_file.unlink()  # Delete the file when we're done
def _add_remote_dataset(self, accession, filter_str):
    """Add or update a remote ppx dataset and its ``.raw`` files in the database.

    Parameters
    ----------
    accession : str
        The ProteomeXchange/ppx accession to validate and register.
    filter_str : str
        A regex; only remote file names matching it are added as samples.
    """
    # Initiate ppx call for accesion validation.
    # BUG FIX: os.makedirs() returns None, so the original code passed
    # local=None to ppx.find_project(). Build the path first, then ensure
    # the directory exists.
    cache_path = os.path.join(os.getcwd(), ".ppx_cache")
    os.makedirs(cache_path, exist_ok=True)
    project = ppx.find_project(accession, local=cache_path)

    # Check if dataset is already in database
    dataset_query = self.database\
        .session\
        .query(Dataset)\
        .filter(Dataset.accession == accession)
    if self.database.safe_run(dataset_query.count) == 0:
        print("==> {} not in database, adding now".format(accession))
        dataset = Dataset(accession=accession, title=project.title)
    else:
        print("==> {} already present in database, updating files".format(
            accession))
        dataset = self.database.safe_run(dataset_query.one)

    # Check for files in remote, keeping only those that match filter_str.
    file_list = project.remote_files("*.raw")
    file_list = [
        f for f in file_list if re.search(filter_str, f) is not None
    ]
    file_list.sort()

    # Add files if not present
    for file_name in file_list:
        sample = Sample(
            parentDataset=dataset.accession,
            sampleName=os.path.splitext(file_name)[0],
            fileType=os.path.splitext(file_name)[1].lstrip(".").lower(),
            fileName=file_name,
            fileLocation="remote")
        sample_query = self.database.session\
            .query(Sample)\
            .filter(Sample.parentDataset == sample.parentDataset)\
            .filter(Sample.sampleName == sample.sampleName)
        # Only append samples that are not already registered.
        if self.database.safe_run(sample_query.count) == 0:
            dataset.samples.append(sample)

    # Update database
    self.database.safe_add(dataset)
def add_projects(self, split, num):
    """Add random projects to a split.

    Parameters
    ----------
    split : str, {"train", "validation", "test"}
        The split to add projects to.
    num : int
        The number of random projects to add.

    Returns
    -------
    int
        The number of projects actually added (may be less than ``num``
        if the candidate pool is exhausted).
    """
    added = 0
    # Track how many candidates we consumed; -1 handles the empty-pool case.
    # BUG FIX: the original `del self._projects[:idx + 1]` raised NameError
    # when self._projects was empty, because `idx` was never bound.
    last_idx = -1
    pattern = re.compile(r"ccms_peak/.*\.mzml$", flags=re.IGNORECASE)
    for idx, msvid in enumerate(self._projects):
        last_idx = idx
        proj = ppx.find_project(msvid)
        keep = []
        file_info = [line.split(",") for line in proj.file_info().splitlines()]
        ms2_idx = file_info[0].index("spectra_ms2")
        for info in file_info[1:]:
            # Strip the leading dataset component from the remote path.
            fname = info[0].split("/", 1)[1]
            # Keep only mzML files under ccms_peak/ that contain MS2 spectra.
            if pattern.search(fname) and int(info[ms2_idx]):
                keep.append(fname)

        if keep and validate(proj, keep[0]):
            self.splits[split][msvid] = keep
            added += 1
            LOGGER.info("Found %i/%i...", added, num)
        else:
            self.splits["rejected"].append(msvid)

        if added == num:
            break

    self.save()
    # Remove the projects we've sampled from consideration:
    del self._projects[:last_idx + 1]
    if added < num:
        # BUG FIX: Logger.warn is a deprecated alias of Logger.warning.
        LOGGER.warning("Not enough projects for the request. Added %i", added)

    return added
def test_remote_update(self):
    """Test remote connection to ProteomeXchange.

    Adds a known accession via the DatasetManager and checks that every
    remote .raw file ends up as a Sample row in the database.
    """
    prev_wd = os.getcwd()
    with tempfile.TemporaryDirectory() as temp_path:
        os.chdir(temp_path)
        # BUG FIX: the original only restored the working directory on
        # success; a failing assertion left the process chdir'd into a
        # directory that TemporaryDirectory then deletes. Restore it in a
        # finally block (before the temp dir is cleaned up).
        try:
            test_db_path = "sqlite:///" + temp_path + "/phosphopedia.db"
            print()
            accession = "PXD001492"
            manager = managers.DatasetManager(test_db_path)
            manager.add_datasets([accession])

            # Make sure all files added
            project = ppx.find_project(accession, local=temp_path)
            nfiles = len(project.remote_files("*.raw"))
            sample_query = manager.database\
                .session\
                .query(schema.Sample)\
                .order_by(schema.Sample.sampleName)
            sample_entries = manager.database.safe_run(sample_query.all)
            self.assertEqual(len(sample_entries), nfiles)
        finally:
            os.chdir(prev_wd)
def test_timeout():
    """Try a value that is too small."""
    # An absurdly small timeout must raise a connect or read timeout.
    with pytest.raises((ConnectTimeout, ReadTimeout)):
        ppx.find_project(PXID, timeout=0.0000000000001)
def test_massive_project_with_pxd():
    """A PXD accession that maps to MassIVE should resolve to a MassiveProject."""
    project = ppx.find_project(MSVPXD, timeout=10)
    assert isinstance(project, ppx.MassiveProject)
    # The project should report its canonical MassIVE identifier.
    assert project.id == MSVID
def test_massive_project():
    """Test massive project resolution"""
    project = ppx.find_project(MSVID, timeout=10)
    assert isinstance(project, ppx.MassiveProject)
def test_pride_online():
    """Test pride project resolution"""
    project = ppx.find_project(PXID, timeout=10)
    assert isinstance(project, ppx.PrideProject)