import os

import ftputil

import ming_proteosafe_library


def find_dataset_metadata(dataset_accession, useftp=False, massive_host=None):
    print("Finding files for %s" % dataset_accession)
    if useftp:
        # Verify the FTP session is still alive; reconnect anonymously if it is not.
        try:
            massive_host.listdir("/")
        except Exception as e:
            print("MassIVE connection broken, reconnecting", e)
            massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
            
        # Only the updates folder is listed over FTP; the other folder is skipped.
        all_other_files = []
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(dataset_accession, "updates", includefilemetadata=True, massive_host=massive_host)
    else:
        import credentials
        all_other_files = ming_proteosafe_library.get_all_files_in_dataset_folder(dataset_accession, "other", credentials.USERNAME, credentials.PASSWORD, includefilemetadata=True)
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder(dataset_accession, "updates", credentials.USERNAME, credentials.PASSWORD, includefilemetadata=True)

    print(dataset_accession, len(all_other_files), len(all_update_files))

    # Find gnps_metadata.tsv files in both the other and updates collections
    metadata_files = [fileobject for fileobject in all_other_files if os.path.basename(fileobject["path"]) == "gnps_metadata.tsv"]
    metadata_files += [fileobject for fileobject in all_update_files if os.path.basename(fileobject["path"]) == "gnps_metadata.tsv"]

    metadata_files = sorted(metadata_files, key=lambda myfile: myfile["timestamp"], reverse=True)

    if len(metadata_files) > 0:
        return metadata_files[0]

    return None
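
# Usage sketch (illustrative, not from the original source; the accession is a
# placeholder): open an anonymous FTP session, fetch the newest metadata
# descriptor, and read its "path" and "timestamp" fields.
host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
metadata = find_dataset_metadata("MSV000000000", useftp=True, massive_host=host)
if metadata is not None:
    print(metadata["path"], metadata["timestamp"])
host.close()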
def get_dataset_files(dataset_accession, collection_name, massive_host=None):
    # Verify the FTP session is still alive; reconnect anonymously if it is not.
    try:
        massive_host.keep_alive()
    except Exception as e:
        print("MassIVE connection broken, reconnecting", e)
        massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")

    dataset_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
        dataset_accession, collection_name, massive_host=massive_host)
    return dataset_files
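
# Usage sketch (illustrative; placeholder accession): reuse one FTP session
# across calls instead of reconnecting for every dataset.
host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
peak_files = get_dataset_files("MSV000000000", "ccms_peak", massive_host=host)
print(len(peak_files))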
Example #3
def _get_massive_files_ftp(dataset_accession, dataset_password=""):
    import os

    import ftputil
    import pandas as pd

    import ming_proteosafe_library

    # Log in with the dataset password if one is provided; otherwise connect anonymously.
    if len(dataset_password) > 0:
        massive_host = ftputil.FTPHost("massive.ucsd.edu", dataset_accession,
                                       dataset_password)
    else:
        massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")

    # Gather candidate spectrum files from the ccms_peak, peak, and raw collections.
    all_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
        dataset_accession,
        "ccms_peak",
        massive_host=massive_host,
        dataset_password=dataset_password)
    all_files += ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
        dataset_accession,
        "peak",
        massive_host=massive_host,
        dataset_password=dataset_password)
    all_files += ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
        dataset_accession,
        "raw",
        massive_host=massive_host,
        dataset_password=dataset_password)

    # Keep only files with recognized mass spectrometry extensions.
    acceptable_extensions = [".mzml", ".mzxml", ".cdf", ".raw"]

    all_files = [
        filename for filename in all_files
        if os.path.splitext(filename)[1].lower() in acceptable_extensions
    ]

    all_files_df = pd.DataFrame()
    all_files_df["filepath"] = all_files

    return all_files_df
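
# Usage sketch (illustrative; placeholder accession): the result is a
# single-column pandas DataFrame of candidate spectrum file paths.
files_df = _get_massive_files_ftp("MSV000000000")
print(files_df.head())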
def find_dataset_metadata(dataset_accession, useftp=False, massive_host=None):
    print("Finding files for %s" % dataset_accession)
    if useftp:
        # Only the updates folder is listed over FTP; the other folder is skipped.
        all_other_files = []
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
            dataset_accession,
            "updates",
            includefilemetadata=True,
            massive_host=massive_host)
    else:
        import credentials
        all_other_files = ming_proteosafe_library.get_all_files_in_dataset_folder(
            dataset_accession,
            "other",
            credentials.USERNAME,
            credentials.PASSWORD,
            includefilemetadata=True)
        all_update_files = ming_proteosafe_library.get_all_files_in_dataset_folder(
            dataset_accession,
            "updates",
            credentials.USERNAME,
            credentials.PASSWORD,
            includefilemetadata=True)

    print(dataset_accession, len(all_other_files), len(all_update_files))

    # Find gnps_metadata.tsv files in both the other and updates collections
    metadata_files = [
        fileobject for fileobject in all_other_files
        if os.path.basename(fileobject["path"]) == "gnps_metadata.tsv"
    ]
    metadata_files += [
        fileobject for fileobject in all_update_files
        if os.path.basename(fileobject["path"]) == "gnps_metadata.tsv"
    ]

    metadata_files = sorted(metadata_files,
                            key=lambda myfile: myfile["timestamp"],
                            reverse=True)

    if len(metadata_files) > 0:
        return metadata_files[0]

    return None
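
# Setup sketch (illustrative): the non-FTP branch above expects a local
# credentials module exposing USERNAME and PASSWORD, e.g. a credentials.py
# containing:
#
#     USERNAME = "your_proteosafe_username"  # placeholder value
#     PASSWORD = "your_proteosafe_password"  # placeholder value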
Example #5
def get_dataset_files(dataset_accession, collection_name, massive_host=None):
    # Assumes a module-level json import and redis_client (see the setup sketch below).
    dataset_files = None

    # Try the Redis cache first to avoid a slow FTP listing.
    try:
        dataset_files = json.loads(redis_client.get(dataset_accession))
        print("Read from Redis", len(dataset_files))
    except Exception:
        dataset_files = None

    # Cache miss: list the files over FTP and cache the result for an hour.
    if dataset_files is None:
        dataset_files = ming_proteosafe_library.get_all_files_in_dataset_folder_ftp(
            dataset_accession, collection_name, massive_host=massive_host)
        try:
            redis_client.set(dataset_accession, json.dumps(dataset_files), ex=3600)
        except Exception:
            # Caching is best effort; ignore Redis failures.
            pass

    print(dataset_accession, len(dataset_files))

    return dataset_files
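
# Setup sketch (illustrative): the function above assumes a module-level json
# import and a configured redis_client; the host and port below are placeholder
# values for a local Redis instance.
import json

import redis

redis_client = redis.Redis(host="localhost", port=6379, db=0)
files = get_dataset_files("MSV000000000", "ccms_peak")
print(len(files))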