def find_dataset_metadata(dataset_accession, useftp=False, massive_host=None):
    """Locate the newest ``gnps_metadata.tsv`` file for a MassIVE dataset.

    Searches the dataset's "other" and "updates" collections — over FTP when
    *useftp* is True (reconnecting a stale *massive_host* if needed), otherwise
    through the ProteoSAfe web API using stored credentials — and returns the
    most recently modified matching file object, or None when none exists.
    """
    print("Finding Files %s " % dataset_accession)

    if useftp:
        # Probe the existing FTP connection; rebuild it when it is broken.
        try:
            massive_host.listdir("/")
        except Exception as e:
            print("MassIVE connection broken, reconnecting", e)
            massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
        # NOTE(review): the FTP path never scans "other" — only "updates".
        all_other_files = []
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
            dataset_accession, "updates",
            includefilemetadata=True, massive_host=massive_host)
    else:
        import credentials
        all_other_files = ming_proteosafe_library.get_all_files_in_dataset_folder(
            dataset_accession, "other",
            credentials.USERNAME, credentials.PASSWORD, includefilemetadata=True)
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder(
            dataset_accession, "updates",
            credentials.USERNAME, credentials.PASSWORD, includefilemetadata=True)

    print(dataset_accession, len(all_other_files), len(all_update_files))

    # Keep only files literally named gnps_metadata.tsv, newest first.
    candidates = [
        entry
        for entry in all_other_files + all_update_files
        if os.path.basename(entry["path"]) == "gnps_metadata.tsv"
    ]
    candidates.sort(key=lambda entry: entry["timestamp"], reverse=True)

    return candidates[0] if candidates else None
def get_dataset_files(dataset_accession, collection_name, massive_host=None):
    """List all files in one collection of a MassIVE dataset over FTP.

    Parameters
    ----------
    dataset_accession : dataset identifier (e.g. "MSV000078547")
    collection_name : collection within the dataset (e.g. "ccms_peak")
    massive_host : optional existing ftputil.FTPHost connection; a fresh
        anonymous connection is created when it is missing or no longer alive.

    Returns whatever
    ming_proteosafe_library.get_all_files_in_dataset_folder_ftp returns.
    """
    # BUG FIX: the previous version assigned massive_host inside the except
    # block, which made the name function-local, so massive_host.keep_alive()
    # always raised UnboundLocalError; the bare `except:` masked that and
    # forced a reconnect on every single call.  The host is now an explicit
    # keyword parameter (consistent with the other get_dataset_files variant
    # in this file) and we only reconnect when it is actually unusable.
    try:
        massive_host.keep_alive()
    except Exception:
        print("MassIVE connection broken, reconnecting")
        massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
    return ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
        dataset_accession, collection_name, massive_host=massive_host)
def _get_massive_files_ftp(dataset_accession, dataset_password=""):
    """Fetch spectrum-file paths for a MassIVE dataset over FTP.

    Logs in with the dataset accession/password when a password is supplied,
    otherwise anonymously; scans the ccms_peak, peak and raw collections and
    returns a single-column pandas DataFrame ("filepath") restricted to
    mzML / mzXML / CDF / RAW files (case-insensitive extension match).
    """
    import ftputil
    import ming_proteosafe_library

    if len(dataset_password) > 0:
        massive_host = ftputil.FTPHost("massive.ucsd.edu",
                                       dataset_accession, dataset_password)
    else:
        massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")

    # Gather candidates from every spectrum-bearing collection.
    all_files = []
    for collection in ("ccms_peak", "peak", "raw"):
        all_files += ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
            dataset_accession, collection,
            massive_host=massive_host, dataset_password=dataset_password)

    acceptable_extensions = {".mzml", ".mzxml", ".cdf", ".raw"}
    spectrum_files = [
        path for path in all_files
        if os.path.splitext(path)[1].lower() in acceptable_extensions
    ]

    all_files_df = pd.DataFrame()
    all_files_df["filepath"] = spectrum_files
    return all_files_df
def find_dataset_metadata(dataset_accession, useftp=False, massive_host=None):
    """Return the newest ``gnps_metadata.tsv`` file object for a dataset.

    Searches the "other" and "updates" collections (FTP when *useftp* is True,
    otherwise the ProteoSAfe web API with stored credentials) and returns the
    most recently modified matching file object, or None when none exists.
    """
    print("Finding Files %s " % dataset_accession)
    if useftp:
        # BUG FIX: massive_host used to be a free variable with no parameter
        # and no visible definition in this file, so this branch raised
        # NameError unless a module-level global happened to exist.  It is now
        # an explicit keyword parameter (default None) matching the other
        # find_dataset_metadata variant's signature; a fresh anonymous
        # connection is opened when the caller supplies none.
        if massive_host is None:
            massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
        all_other_files = []
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
            dataset_accession, "updates",
            includefilemetadata=True, massive_host=massive_host)
    else:
        import credentials
        all_other_files = ming_proteosafe_library.get_all_files_in_dataset_folder(
            dataset_accession, "other",
            credentials.USERNAME, credentials.PASSWORD, includefilemetadata=True)
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder(
            dataset_accession, "updates",
            credentials.USERNAME, credentials.PASSWORD, includefilemetadata=True)

    print(dataset_accession, len(all_other_files), len(all_update_files))

    # Finding gnps_metadata.tsv files, newest first.
    metadata_files = [
        fileobject for fileobject in all_other_files
        if os.path.basename(fileobject["path"]) == "gnps_metadata.tsv"
    ]
    metadata_files += [
        fileobject for fileobject in all_update_files
        if os.path.basename(fileobject["path"]) == "gnps_metadata.tsv"
    ]
    metadata_files = sorted(metadata_files,
                            key=lambda myfile: myfile["timestamp"],
                            reverse=True)

    if len(metadata_files) > 0:
        return metadata_files[0]
    return None
def get_dataset_files(dataset_accession, collection_name, massive_host=None):
    """List a dataset collection's files, with a one-hour Redis cache.

    Tries the Redis cache first (keyed by *dataset_accession*); on a miss or
    any cache error it falls back to listing the files over FTP and then
    best-effort re-populates the cache with a 3600 s expiry.  Cache failures
    are never fatal — the FTP listing is always produced and returned.

    Parameters
    ----------
    dataset_accession : dataset identifier used as the cache key
    collection_name : collection within the dataset to list
    massive_host : optional ftputil.FTPHost passed through to the FTP lister
    """
    dataset_files = None
    try:
        # FIX: previously the miss case relied on json.loads(None) raising
        # TypeError into a bare `except:`.  Check for the miss explicitly and
        # only catch Exception so e.g. KeyboardInterrupt is not swallowed.
        cached = redis_client.get(dataset_accession)
        if cached is not None:
            dataset_files = json.loads(cached)
            print("Read from Redis", len(dataset_files))
    except Exception:
        # Any cache problem (connection error, corrupt JSON) is a cache miss.
        dataset_files = None

    if dataset_files is None:
        dataset_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
            dataset_accession, collection_name, massive_host=massive_host)
        try:
            redis_client.set(dataset_accession, json.dumps(dataset_files), ex=3600)
        except Exception:
            # Best-effort cache write; ignore failures (was an `x = 1` filler).
            pass

    print(dataset_accession, len(dataset_files))
    return dataset_files