Example #1
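# Imports assumed by this excerpt: os and pandas are standard dependencies of the
# code below, while metadata_validator and populate_metadata are project-specific
# modules referenced by the function.
import os

import pandas as pd

import metadata_validator
import populate_metadata
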
def process_metadata_import(dataset_accession, dryrun=False, massive_host=None):
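    """Fetch a dataset's metadata TSV from MassIVE over FTP, validate it,
    keep only the rows that match the given accession, and import them.

    Returns (valid_row_count, files_added); (-2, -2) means no metadata file
    was found and (-1, -1) means required columns were missing.
    """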
    dataset_metadatum = find_dataset_metadata(dataset_accession, useftp=True, massive_host=massive_host)

    if dataset_metadatum is None:
        print("Not Importing %s, no metadata" % dataset_accession)
        return -2, -2
    else:
        print("Importing %s" % dataset_accession, dataset_metadatum)

    #Save files Locally
    local_metadata_path = os.path.join("tempuploads", dataset_accession + ".tsv")

    try:
        massive_host.download(dataset_metadatum["path"], local_metadata_path)
    except Exception:
        print("CANT DOWNLOAD", dataset_metadatum["path"])
        raise

    metadata_validator.rewrite_metadata(local_metadata_path)
    
    #Validate
    pass_validation, failures, errors_list, valid_rows, total_rows_count = metadata_validator.perform_validation(local_metadata_path)

    #Filtering out lines that are not valid
    local_filtered_metadata_path = os.path.join("tempuploads", "filtered_" + dataset_accession + ".tsv")
    if any("Missing column" in error["error_string"] for error in errors_list):
        print("Missing Columns, Rejected")
        return -1, -1

    #Filtering out lines that do not match the dataset accession
    metadata_df = pd.DataFrame(valid_rows)
    try:
        metadata_df = metadata_df[metadata_df['MassiveID'] == dataset_accession]
    except KeyError:
        # "MassiveID" column is absent; keep all valid rows
        metadata_df = pd.DataFrame(valid_rows)

    metadata_df.to_csv(local_filtered_metadata_path, sep="\t", index=False)

    try:
        pass_validation, failures, errors_list, valid_rows, total_rows_count = metadata_validator.perform_validation(local_filtered_metadata_path)
    except Exception:
        pass_validation = False

    added_files_count = 0
    if pass_validation:
        print("Importing Data")
        if not dryrun:
            added_files_count = populate_metadata.populate_dataset_metadata(local_filtered_metadata_path, massive_host=massive_host)
    else:
        print("Filtered File is not valid")

    return len(metadata_df), added_files_count
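
A minimal sketch of how this function might be driven, assuming the same anonymous MassIVE FTP connection used in the next example; the accession is a placeholder, and a local tempuploads/ directory must already exist because the function writes its temporary files there:

import ftputil

# Placeholder accession for illustration only; dryrun=True validates without importing
massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
valid_rows_count, files_added = process_metadata_import(
    "MSV000000000", dryrun=True, massive_host=massive_host)
print(valid_rows_count, files_added)
massive_host.close()
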
Example #2
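# Imports assumed by this excerpt: ming_proteosafe_library and populate_metadata are
# project-specific modules, and process_metadata_import (shown in Example #1) and
# import_identification are defined elsewhere in the same project.
import argparse

import ftputil
import pandas as pd

import ming_proteosafe_library
import populate_metadata
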
def main():
    parser = argparse.ArgumentParser(description='Importing Database')
    parser.add_argument('--importmetadata', default=None, help='Imports metadata, options are all, dataset, file')
    parser.add_argument('--metadatafile', help='Imports metadata filename')
    parser.add_argument('--metadataaccession', help='Imports metadata accession')

    parser.add_argument('--importidentifications', default=None, help='Imports identifications, from task file')
    parser.add_argument('--identifications_output', help='identifications_output')

    massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
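    # Anonymous FTP connection to the MassIVE repository; the loop below
    # re-opens it whenever a directory listing fails.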

    args = parser.parse_args()

    # Importing Metadata First
    if args.importmetadata == "all":
        summary_list = []

        all_datasets = ming_proteosafe_library.get_all_datasets()
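        # Each dataset record is expected to carry a human-readable "title" and
        # the MassIVE accession under "dataset".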
        for dataset in all_datasets:
            if not "GNPS" in dataset["title"].upper():
                continue

            # Checking the FTP host
            try:
                list_names = massive_host.listdir("/")
            except Exception as e:
                print("MassIVE connection broken, reconnecting", e)
                massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")
                
            try:
                print("Importing, ", dataset["dataset"])
                total_valid_metadata_entries, files_added = process_metadata_import(dataset["dataset"], dryrun=False, massive_host=massive_host)
            except KeyboardInterrupt:
                raise
            except Exception:
                # Record the failure and move on to the next dataset
                total_valid_metadata_entries = -1
                files_added = -1

            summary_dict = {}
            summary_dict["total_valid_metadata_entries"] = total_valid_metadata_entries
            summary_dict["files_added"] = files_added
            summary_dict["accession"] = dataset["dataset"]

            summary_list.append(summary_dict)

            try:
                pd.DataFrame(summary_list).to_csv("/app/database/add_metadata_summary.tsv", sep="\t", index=False)
            except Exception:
                # Writing the running summary is best-effort; keep importing
                continue
        
    elif args.importmetadata == "dataset":
        total_valid_metadata_entries, files_added = process_metadata_import(args.metadataaccession, massive_host=massive_host)
        print(total_valid_metadata_entries, files_added)

    elif args.importmetadata == "file":
        populate_metadata.populate_dataset_metadata(args.metadatafile, massive_host=massive_host)


    # Import Library Identifications
    if args.importidentifications is not None:
        import_identification(args.importidentifications, args.identifications_output, force=True)
Example #3
def main():
    parser = argparse.ArgumentParser(description='Importing Database')
    parser.add_argument(
        '--importmetadata',
        default=None,
        help='Imports metadata, options are all, dataset, file')
    parser.add_argument('--metadatafile', help='Imports metadata filename')
    parser.add_argument('--metadataaccession',
                        help='Imports metadata accession')

    parser.add_argument('--importidentifications',
                        default=None,
                        help='Imports identifications, from task file')
    parser.add_argument('--identifications_output',
                        help='identifications_output')

    args = parser.parse_args()

    # Importing Metadata First
    if args.importmetadata == "all":
        summary_list = []

        all_datasets = ming_proteosafe_library.get_all_datasets()
        for dataset in all_datasets:
            if not "GNPS" in dataset["title"].upper():
                continue

            try:
                total_valid_metadata_entries, files_added = process_metadata_import(
                    dataset["dataset"], dryrun=False)
            except KeyboardInterrupt:
                raise
            except Exception:
                total_valid_metadata_entries = -1
                files_added = -1

            summary_dict = {}
            summary_dict[
                "total_valid_metadata_entries"] = total_valid_metadata_entries
            summary_dict["files_added"] = files_added
            summary_dict["accession"] = dataset["dataset"]

            summary_list.append(summary_dict)

            try:
                pd.DataFrame(summary_list).to_csv(
                    "/app/database/add_metadata_summary.tsv",
                    sep="\t",
                    index=False)
            except Exception:
                continue

    elif args.importmetadata == "dataset":
        total_valid_metadata_entries, files_added = process_metadata_import(
            args.metadataaccession)
        print(total_valid_metadata_entries, files_added)

    elif args.importmetadata == "file":
        populate_metadata.populate_dataset_metadata(args.metadatafile)

    # Import Library Identifications
    if args.importidentifications is not None:
        import_identification(args.importidentifications,
                              args.identifications_output)
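
A sketch of how this script might be invoked once one of the main() variants above is wired up as the entry point; the script filename and accession below are placeholders, not taken from the source:

# Hypothetical command lines (script name and accession are illustrative only):
#   python import_metadata.py --importmetadata all
#   python import_metadata.py --importmetadata dataset --metadataaccession MSV000000000
#   python import_metadata.py --importmetadata file --metadatafile metadata.tsv
if __name__ == "__main__":
    main()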