Example #1
def main():
    argparse = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    argparse.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    argparse.add_argument('--force_config', action='store_true', default=False,
                          help='Overwrite the configuration file after backing it up.')
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    preparation = EloadBacklog(args.eload)
    preparation.fill_in_config(args.force_config)
    preparation.report()

    validation = EloadValidation(args.eload)
    validation_tasks = ['assembly_check', 'vcf_check']
    validation.validate(validation_tasks)

    logger.info('Preparation complete; if files are valid please run ingestion as normal.')
def main():
    argparse = ArgumentParser(description='Accession and ingest submission data into EVA')
    argparse.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission.')
    argparse.add_argument('--instance', required=False, type=int, choices=range(1, 13), default=1,
                          help='The instance id to use for accessioning. Only needed if running accessioning.')
    argparse.add_argument('--tasks', required=False, type=str, nargs='+',
                          default=EloadIngestion.all_tasks, choices=EloadIngestion.all_tasks,
                          help='Task or set of tasks to perform during ingestion.')
    argparse.add_argument('--vep_cache_assembly_name', required=False, type=str,
                          help='The assembly name used in the VEP cache to help the script to find the correct cache '
                               'to use. This should be only used rarely when the script cannot find the VEP cache but '
                               'we know it exists.')
    argparse.add_argument('--resume', action='store_true', default=False,
                          help='Whether to resume an existing Nextflow process within ingestion.')
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level.')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadIngestion(args.eload) as ingestion:
        ingestion.upgrade_config_if_needed()
        ingestion.ingest(
            instance_id=args.instance,
            tasks=args.tasks,
            vep_cache_assembly_name=args.vep_cache_assembly_name,
            resume=args.resume
        )
Example #3
def main():
    argparse = ArgumentParser(
        description='Retrieve file information from ENA and add it to EVAPRO. '
        'Remove extra VCF and index files from EVAPRO if they are not in ENA')
    argparse.add_argument(
        '--project_accession',
        required=False,
        type=str,
        help=
        'Specify the project accession for which the retrieval should be done. This will apply to the whole project.'
    )
    argparse.add_argument(
        '--analysis_accession',
        required=False,
        type=str,
        help=
        'Specify the analysis accession for which the retrieval should be done.'
    )
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level')

    log_cfg.add_stdout_handler()
    args = argparse.parse_args()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()
    if args.analysis_accession:
        populate_files_info_from_ena(args.analysis_accession)
    elif args.project_accession:
        populate_files_info_from_ena(args.project_accession)
    else:
        logger.warning(
            'You need to provide a project or analysis accession to use.')
def main():
    argparse = ArgumentParser(
        description='Update metadata after study has been ingested')
    argparse.add_argument('--eload',
                          required=True,
                          type=int,
                          help='The ELOAD number for this submission.')
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level.')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadIngestion(args.eload) as ingestion:
        ingestion.upgrade_config_if_needed()
        ingestion.update_assembly_set_in_analysis()
        ingestion.insert_browsable_files()
        ingestion.update_browsable_files_with_date()
        ingestion.update_files_with_ftp_path()
        ingestion.refresh_study_browser()
        ingestion.update_loaded_assembly_in_browsable_files()
        ingestion.check_assembly_set_id_coherence()
def main():
    argparse = ArgumentParser(
        description=
        'Upgrade ELOAD config to a format compatible with current automation')
    argparse.add_argument('--eload',
                          required=True,
                          type=int,
                          help='The ELOAD number for this submission')
    argparse.add_argument('--analysis_alias',
                          required=False,
                          type=str,
                          help='Analysis alias to use')
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with Eload(args.eload) as eload:
        eload.upgrade_config_if_needed(args.analysis_alias)
def main():
    argparse = ArgumentParser(description='Copy data from the FTP (if specified), search for VCF and metadata files, '
                                          'then create a config file storing information about the ELOAD.')
    argparse.add_argument('--ftp_box', required=False, type=int, choices=range(1, 21),
                          help='box number where the data should have been uploaded. Required to copy the data from the FTP')
    argparse.add_argument('--submitter', required=False, type=str,
                          help='the name of the directory for that submitter. Required to copy the data from the FTP')
    argparse.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    argparse.add_argument('--taxid', required=False, type=str,
                          help='Override and replace the taxonomy id provided in the metadata spreadsheet.')
    argparse.add_argument('--reference', required=False, type=str,
                          help='Override and replace the reference sequence accession provided in the metadata '
                               'spreadsheet.')
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level')
    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    eload = EloadPreparation(args.eload)
    if args.ftp_box and args.submitter:
        eload.copy_from_ftp(args.ftp_box, args.submitter)
    eload.detect_all(args.taxid, args.reference)
Example #7
def main():
    argparse = ArgumentParser(
        description='Migrate an in-progress submission to the current cluster')
    argparse.add_argument('--eload',
                          required=True,
                          type=int,
                          help='The ELOAD number of the submission to migrate')
    argparse.add_argument('--project',
                          required=False,
                          type=str,
                          help='Optional associated project accession')
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadMigration(args.eload) as eload:
        eload.migrate(args.project)
def main():
    parser = argparse.ArgumentParser(
        description='Create and load the clustering and release tracking table',
        add_help=False)
    parser.add_argument("--private-config-xml-file",
                        help="ex: /path/to/eva-maven-settings.xml",
                        required=True)
    parser.add_argument("--release-version",
                        help="version of the release",
                        type=int,
                        required=True)
    parser.add_argument(
        "--reference-directory",
        help=
        "Directory where the reference genomes exists or should be downloaded",
        required=True)
    parser.add_argument(
        "--taxonomy",
        help="taxonomy id for which rs count needs to be updated",
        type=int,
        required=False)
    parser.add_argument('--tasks',
                        required=False,
                        type=str,
                        nargs='+',
                        default=all_tasks,
                        choices=all_tasks,
                        help='Task or set of tasks to perform.')
    parser.add_argument('--help',
                        action='help',
                        help='Show this help message and exit')
    args = parser.parse_args()

    logging_config.add_stdout_handler()

    if not args.tasks:
        args.tasks = all_tasks

    if 'create_and_fill_table' in args.tasks:
        create_table(args.private_config_xml_file)
        fill_in_from_previous_inventory(args.private_config_xml_file,
                                        args.release_version)
        fill_in_table_from_remapping(args.private_config_xml_file,
                                     args.release_version,
                                     args.reference_directory)

    if 'fill_rs_count' in args.tasks:
        if not args.taxonomy:
            raise Exception(
                "For running task 'fill_rs_count', it is mandatory to provide taxonomy arguments"
            )
        mongo_source_uri = get_mongo_uri_for_eva_profile(
            'production', args.private_config_xml_file)
        mongo_source = MongoDatabase(uri=mongo_source_uri,
                                     db_name="eva_accession_sharded")
        fill_num_rs_id_for_taxonomy_and_assembly(mongo_source,
                                                 args.private_config_xml_file,
                                                 args.release_version,
                                                 args.taxonomy,
                                                 args.reference_directory)
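# all_tasks is not defined in this excerpt; based on the task names checked above, it is
# presumably something like the following (an assumption, not confirmed by the source):
# all_tasks = ['create_and_fill_table', 'fill_rs_count']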
def main():
    arg_parser = argparse.ArgumentParser(
        description=
        'Compare the sample name in the VCF file and the one specified in the metadata.'
    )
    arg_parser.add_argument('--metadata-file',
                            required=True,
                            dest='metadata_file',
                            help='EVA Submission Metadata Excel sheet')
    arg_parser.add_argument(
        '--vcf-dir',
        required=True,
        dest='vcf_dir',
        help='Path to the directory in which submitted files can be found')
    arg_parser.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level',
    )
    args = arg_parser.parse_args()

    log_cfg.add_stdout_handler()

    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    compare_spreadsheet_and_vcf(args.metadata_file, args.vcf_dir)
def main():
    argparse = ArgumentParser(description='Validate an ELOAD by checking the data and metadata format and semantics.')
    argparse.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    argparse.add_argument('--validation_tasks', required=False, type=str, nargs='+',
                          default=EloadValidation.all_validation_tasks, choices=EloadValidation.all_validation_tasks,
                          help='task or set of tasks to perform during validation')
    argparse.add_argument('--set_as_valid', action='store_true', default=False,
                          help='Set the script to consider all validation tasks performed as valid in the final '
                               'evaluation. This does not affect the actual report but only changes the final '
                               'evaluation.')
    argparse.add_argument('--merge_per_analysis', action='store_true', default=False,
                          help='Whether to merge vcf files per analysis if possible.')
    argparse.add_argument('--report', action='store_true', default=False,
                          help='Set the script to only report the results based on previously run validation.')
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadValidation(args.eload) as eload:
        eload.upgrade_config_if_needed()
        if not args.report:
            eload.validate(args.validation_tasks, args.set_as_valid, args.merge_per_analysis)
        eload.report()
Example #11
def main():
    argparse = ArgumentParser(description='Broker validated ELOAD to BioSamples and ENA')
    argparse.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level')
    argparse.add_argument('--vcf_files', required=False, type=str, help='VCF files to use in the brokering', nargs='+')
    argparse.add_argument('--metadata_file', required=False, type=str, help='Metadata file to use in the brokering')
    argparse.add_argument('--force', required=False, type=str, nargs='+', default=[],
                          choices=EloadBrokering.all_brokering_tasks,
                          help='When not set, the script only performs the tasks that were not successful. Can be '
                               'set to specify one or several tasks to force during the brokering regardless of '
                               'previous status')
    argparse.add_argument('--report', action='store_true', default=False,
                          help='Set the script to only report the results based on previously run brokering.')
    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    # Optionally set the valid VCF and metadata files
    brokering = EloadBrokering(args.eload, args.vcf_files, args.metadata_file)
    brokering.upgrade_config_if_needed()
    if not args.report:
        brokering.broker(brokering_tasks_to_force=args.force)
    brokering.report()
def main():
    argparse = ArgumentParser(
        description=
        'Run entire variant remapping pipeline for a given assembly and taxonomy.'
    )
    argparse.add_argument('--assembly', help='Assembly to be processed')
    argparse.add_argument('--taxonomy_id', help='Taxonomy id to be processed')
    argparse.add_argument('--list_jobs',
                          help='Display the list of jobs to be run.',
                          action='store_true',
                          default=False)
    argparse.add_argument(
        '--resume',
        help='If a process has already been run, this will resume it.',
        action='store_true',
        default=False)

    args = argparse.parse_args()

    load_config()

    if args.list_jobs:
        RemappingJob().list_assemblies_to_process()
    elif args.assembly and args.taxonomy_id:
        logging_config.add_stdout_handler()
        RemappingJob().process_one_assembly(args.assembly, args.taxonomy_id,
                                            args.resume)
    else:
        raise ArgumentError(
            None,
            'One of (--assembly and --taxonomy_id) or --list_jobs options is required'
        )
def main():
    parser = argparse.ArgumentParser(
        description='Generate custom assembly report for a given assembly',
        add_help=False)
    parser.add_argument(
        "-a",
        "--assembly-accession",
        help=
        "Assembly for which the process has to be run, e.g. GCA_000002315.3",
        required=True)
    parser.add_argument("-f",
                        "--fasta-file",
                        help="Path to the fasta file containing the assembly",
                        required=True)
    parser.add_argument(
        "-r",
        "--report-file",
        help="Path to the assembly report file containing the assembly",
        required=True)
    parser.add_argument('--help',
                        action='help',
                        help='Show this help message and exit')

    args = parser.parse_args()

    load_config()
    logging_config.add_stdout_handler()

    assembly = CustomAssemblyFromDatabase(args.assembly_accession,
                                          args.fasta_file, args.report_file)
    assembly.generate_assembly_report()
    assembly.generate_fasta()
Example #14
def main():
    argparse = ArgumentParser(
        description='Inspect FTP boxes to detect new submissions. '
        'Provide a report that specifies the project title')
    argparse.add_argument(
        '--ftp_box',
        required=True,
        type=int,
        choices=range(1, 21),
        help='box number where the data should have been uploaded')
    argparse.add_argument('--submitter',
                          required=False,
                          type=str,
                          help='the name of the directory for that submitter.')
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level',
    )
    args = argparse.parse_args()
    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    if args.submitter:
        inspect_one_user(args.ftp_box, args.submitter)
    else:
        inspect_all_users(args.ftp_box)
def main():
    parser = argparse.ArgumentParser(description='Download and store a reference sequence or assembly.')
    parser.add_argument("-a", "--assembly-accession",
                        help="Assembly for which the process has to be run, e.g. GCA_000002285.2", required=True)
    parser.add_argument("-s", "--species",
                        help="Species scientific name under which this accession should be stored. "
                             "This is only used to create the directory", required=True)
    parser.add_argument("-o", "--output-directory",
                        help="Base directory under which all species assemblies are stored. "
                             "Will use the one defined in config file if omitted")
    parser.add_argument("-c", "--clear", help="Flag to clear existing data in FASTA file and starting from scratch",
                        action='store_true')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    try:
        assembly_fasta_path, assembly_report_path = get_reference_fasta_and_report(
            args.species, args.assembly_accession, args.output_directory, args.clear
        )
        logger.info('FASTA: ' + assembly_fasta_path)
        logger.info('REPORT: ' + assembly_report_path)
    except Exception as ex:
        logger.exception(ex)
        return 1

    return 0
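# A minimal sketch of how this script's exit status would typically be propagated to the shell
# (the __main__ guard below is an assumption; it is not part of the original excerpt):
import sys

if __name__ == '__main__':
    sys.exit(main())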
def main():
    argparse = ArgumentParser()
    argparse.add_argument('--settings_xml_file', help='File containing the connection to the database', required=False)
    argparse.add_argument('--study', help='The study in the assembly to correct', required=True)
    argparse.add_argument('--assembly', help='The assembly accession of the entities that needs to be changed',
                          required=True)
    argparse.add_argument('--contigs', help='The contigs to modify. They should be provided as they appear in the record', nargs='+')
    args = argparse.parse_args()
    log_cfg.add_stdout_handler()

    deprecate(args.settings_xml_file, args.study, args.assembly, args.contigs)

    logger.info("Finished successfully.")
Example #17
def main():
    argparse = ArgumentParser()
    argparse.add_argument(
        '--settings_xml_file',
        help='File containing the connection to the database',
        required=False)
    argparse.add_argument(
        '--database_name',
        help='The name of the database from the variant warehouse',
        required=True)
    argparse.add_argument(
        '--contigs',
        help=
        'The contigs to modify. They should be provided as they appear in the record',
        nargs='+')
    args = argparse.parse_args()
    log_cfg.add_stdout_handler()

    deprecate(args.settings_xml_file, args.database_name, args.contigs)

    logger.info("Finished successfully.")
def main():
    parser = argparse.ArgumentParser(
        description=
        'Download analyses for processing from the Covid-19 DP project',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False)
    parser.add_argument(
        "--project",
        default='PRJEB45554',
        required=False,
        help="project from which analyses needs to be downloaded")
    parser.add_argument("--batch-size",
                        default=100000,
                        required=False,
                        help="batch size of ENA analyses download")
    parser.add_argument(
        "--processed-file-directory",
        required=True,
        help=
        "full path to the directory where all the processed files are present")
    parser.add_argument(
        "--target-file",
        required=True,
        help="full path to the target file that will be created")
    parser.add_argument(
        "--field",
        choices=['run_ref', 'analysis_accession'],
        required=True,
        help=
        "field whose names has been used as file name and should be used for lookup"
    )

    args = parser.parse_args()
    logging_config.add_stdout_handler()

    prepare_processed_analyses_file(args.project, args.batch_size,
                                    args.processed_file_directory,
                                    args.target_file, args.field)
def main():
    argparser = ArgumentParser()
    argparser.add_argument("--private-config-xml-file",
                           help="ex: /path/to/eva-maven-settings.xml",
                           required=True)
    argparser.add_argument("--assembly_accession",
                           help="GCA_000003205.1",
                           required=True)
    argparser.add_argument("--assembly_report_path",
                           help="path to the report to check contigs against",
                           required=True)
    args = argparser.parse_args()

    genbank_to_row = get_contig_genbank(args.assembly_report_path)

    log_cfg.add_stdout_handler()

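    # Note: using the psycopg2 connection as a context manager commits the transaction on success
    # and rolls it back on error; it does not close the connection itself.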
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", args.private_config_xml_file),
                          user="******") as pg_conn:
        eva_contigs, dbSNP_contigs = get_contigs_accessions_for(
            pg_conn, args.assembly_accession)

        for contig in eva_contigs:
            if contig not in genbank_to_row:
                logger.warning(
                    'For assembly {}, contig {} found in EVA is not a GenBank accession in the report {}'
                    .format(args.assembly_accession, contig,
                            args.assembly_report_path))
        for contig in dbSNP_contigs:
            if contig not in genbank_to_row:
                logger.warning(
                    'For assembly {}, contig {} found in dbSNP is not a GenBank accession in the report {}'
                    .format(args.assembly_accession, contig,
                            args.assembly_report_path))

    return 0
Example #20
def main():
    argparse = ArgumentParser(
        description=
        'Create a database with the provided name if it does not exist already'
    )
    argparse.add_argument('--database_name',
                          required=True,
                          type=str,
                          help='The database name')
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level')

    log_cfg.add_stdout_handler()
    args = argparse.parse_args()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    provision_new_database_for_variant_warehouse(args.database_name)
def main():
    parser = argparse.ArgumentParser(
        description=
        'Correct the assembly error in assembly GCA_015227675.1 by replacing it with GCA_015227675.2',
        add_help=False)
    parser.add_argument(
        "--mongo-source-uri",
        help=
        "Mongo Source URI (ex: mongodb://user:@mongos-source-host:27017/admin)",
        required=True)
    parser.add_argument(
        "--mongo-source-secrets-file",
        help=
        "Full path to the Mongo Source secrets file (ex: /path/to/mongo/source/secret)",
        required=True)
    parser.add_argument("--batch-size",
                        help="number of document processed at once",
                        required=False,
                        type=int,
                        default=1000)
    parser.add_argument("--debug",
                        help="Set the script to output debug message",
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    if args.debug:
        logging_config.add_stdout_handler(logging.DEBUG)
    else:
        logging_config.add_stdout_handler()

    mongo_source = MongoDatabase(uri=args.mongo_source_uri,
                                 secrets_file=args.mongo_source_secrets_file,
                                 db_name="eva_accession_sharded")
    replace_variant_entities(mongo_source, batch_size=int(args.batch_size))
    del mongo_source
Example #22
def main():
    validation_tasks = ['aggregation_check', 'assembly_check', 'vcf_check']
    forced_validation_tasks = ['metadata_check', 'sample_check']

    argparse = ArgumentParser(
        description='Prepare to process backlog study and validate VCFs.')
    argparse.add_argument('--eload',
                          required=True,
                          type=int,
                          help='The ELOAD number for this submission')
    argparse.add_argument(
        '--project_accession',
        required=False,
        type=str,
        help='Set this project instead of the one associated with this eload. '
        'Useful when the association is not set in the database. '
        'The project needs to exist in the DB.')
    argparse.add_argument(
        '--analysis_accessions',
        required=False,
        type=str,
        nargs='+',
        help=
        'Set these analyses instead of the ones associated with the project. '
        'Useful when only a subset of the analyses should be used. '
        'The analyses need to exist in the DB.')
    argparse.add_argument(
        '--force_config',
        action='store_true',
        default=False,
        help='Overwrite the configuration file after backing it up.')
    argparse.add_argument(
        '--keep_config',
        action='store_true',
        default=False,
        help=
        'Keep the configuration file as it is and only run the validation on it.'
    )
    argparse.add_argument(
        '--validation_tasks',
        required=False,
        type=str,
        nargs='+',
        default=validation_tasks,
        choices=validation_tasks,
        help='task or set of tasks to perform during validation')
    argparse.add_argument(
        '--merge_per_analysis',
        action='store_true',
        default=False,
        help='Whether to merge vcf files per analysis if possible.')
    argparse.add_argument(
        '--set_as_valid',
        action='store_true',
        default=False,
        help=
        'Set the script to consider all validation tasks performed as valid in the final '
        'evaluation. This does not affect the actual report but only changes the final '
        'evaluation.')
    argparse.add_argument(
        '--report',
        action='store_true',
        default=False,
        help=
        'Set the script to only report the results based on previously run preparation.'
    )
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadBacklog(
            args.eload,
            project_accession=args.project_accession,
            analysis_accessions=args.analysis_accessions) as preparation:
        # Pass the eload config object to validation so that the two objects share the same state
        with EloadValidation(args.eload, preparation.eload_cfg) as validation:
            if not args.report and not args.keep_config:
                preparation.fill_in_config(args.force_config)

            if not args.report:
                validation.validate(args.validation_tasks)
                # Also mark the other validation tasks as forced so they are all passable

                if args.set_as_valid:
                    forced_validation_tasks = validation.all_validation_tasks
                for validation_task in forced_validation_tasks:
                    validation.eload_cfg.set('validation',
                                             validation_task,
                                             'forced',
                                             value=True)
                validation.mark_valid_files_and_metadata(
                    args.merge_per_analysis)
                if args.merge_per_analysis:
                    preparation.copy_valid_config_to_brokering_after_merge()

            preparation.report()
            validation.report()
            logger.info(
                'Preparation complete; if files are valid please run ingestion as normal.'
            )
def main():
    argparse = ArgumentParser(
        description='Accession and ingest submission data into EVA')
    argparse.add_argument('--eload',
                          required=True,
                          type=int,
                          help='The ELOAD number for this submission.')
    argparse.add_argument(
        '--instance',
        required=False,
        type=int,
        choices=range(1, 13),
        help=
        'The instance id to use for accessioning. Only needed if running accessioning.'
    )
    # TODO infer aggregation from vcf files, VEP version & cache version from species
    argparse.add_argument('--aggregation',
                          required=False,
                          type=str.lower,
                          choices=['basic', 'none'],
                          help='The aggregation type (case insensitive).')
    action_vep_version = argparse.add_argument(
        '--vep_version',
        required=False,
        type=int,
        help=
        'VEP version to use for annotation. Only needed if running variant load.'
    )
    argparse.add_argument(
        '--vep_cache_version',
        required=False,
        type=int,
        help=
        'VEP cache version to use for annotation. Only needed if running variant load.'
    )
    argparse.add_argument(
        '--db_name',
        required=False,
        type=str,
        help=
        'Name of an existing variant database in MongoDB. Submission should have a single '
        'assembly accession. Only needed if adding a new database. ex: db_name'
    )
    argparse.add_argument(
        '--db_name_mapping',
        required=False,
        type=str,
        nargs='+',
        help=
        'List of mappings between assembly accessions and existing variant databases in MongoDB. '
        'Only needed if adding new databases. '
        'ex: GCA_000000001.1,db_name1 GCA_000000002.2,db_name2')
    argparse.add_argument(
        '--tasks',
        required=False,
        type=str,
        nargs='+',
        default=EloadIngestion.all_tasks,
        choices=EloadIngestion.all_tasks,
        help='Task or set of tasks to perform during ingestion.')
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level.')
    action_skip_annotation = argparse.add_argument(
        '--skip_annotation',
        action='store_true',
        default=False,
        help='Flag to skip VEP annotation running variant load.')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

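    # Cross-check the VEP-related options: --skip_annotation conflicts with explicitly provided
    # VEP versions, and --vep_version / --vep_cache_version must be given together.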
    if args.skip_annotation is True and (args.vep_version is not None or
                                         args.vep_cache_version is not None):
        raise ArgumentError(
            action_skip_annotation,
            "Can't provide both \"--skip_annotation\" and \"--vep_version and --vep_cache_version\". Remove VEP/Cache versions or the skip flag and try again."
        )
    if (args.vep_version is None and args.vep_cache_version is not None) or (
            args.vep_version is not None and args.vep_cache_version is None):
        raise ArgumentError(
            action_vep_version,
            "Both \"--vep_version and --vep_cache_version\" should be specified together. Skip both arguments for auto-detection of these versions."
        )

    # Load the config_file from default location
    load_config()

    ingestion = EloadIngestion(args.eload)
    ingestion.upgrade_config_if_needed()
    ingestion.ingest(aggregation=args.aggregation,
                     instance_id=args.instance,
                     vep_version=args.vep_version,
                     vep_cache_version=args.vep_cache_version,
                     skip_annotation=args.skip_annotation,
                     db_name=args.db_name,
                     db_name_mapping=args.db_name_mapping,
                     tasks=args.tasks)
def main():
    argparse = ArgumentParser(
        description='Accession and ingest submission data into EVA')
    argparse.add_argument('--eload',
                          required=True,
                          type=int,
                          help='The ELOAD number for this submission.')
    argparse.add_argument(
        '--instance',
        required=False,
        type=int,
        choices=range(1, 13),
        help=
        'The instance id to use for accessioning. Only needed if running accessioning.'
    )
    # TODO infer aggregation from vcf files, VEP version & cache version from species
    argparse.add_argument('--aggregation',
                          required=False,
                          type=str.lower,
                          choices=['basic', 'none'],
                          help='The aggregation type (case insensitive).')
    argparse.add_argument(
        '--vep_version',
        required=False,
        type=int,
        help=
        'VEP version to use for annotation. Only needed if running variant load.'
    )
    argparse.add_argument(
        '--vep_cache_version',
        required=False,
        type=int,
        help=
        'VEP cache version to use for annotation. Only needed if running variant load.'
    )
    argparse.add_argument(
        '--db_name',
        required=False,
        type=str,
        help=
        'Name of an existing variant database in MongoDB. Only needed if adding a new database.'
    )
    argparse.add_argument(
        '--tasks',
        required=False,
        type=str,
        nargs='+',
        default=EloadIngestion.all_tasks,
        choices=EloadIngestion.all_tasks,
        help='Task or set of tasks to perform during ingestion.')
    argparse.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Set the script to output logging information at debug level.')

    args = argparse.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    ingestion = EloadIngestion(args.eload)
    ingestion.ingest(aggregation=args.aggregation,
                     instance_id=args.instance,
                     vep_version=args.vep_version,
                     vep_cache_version=args.vep_cache_version,
                     db_name=args.db_name,
                     tasks=args.tasks)
Example #25
import argparse
import hashlib
import traceback

import pymongo
from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_common_pyutils.mongodb import MongoDatabase
from pymongo import WriteConcern
from pymongo.read_concern import ReadConcern

logger = logging_config.get_logger(__name__)
logging_config.add_stdout_handler()


def get_SHA1(variant_rec):
    """Calculate the SHA1 digest from the seq, study, contig, start, ref, and alt attributes of the variant"""
    h = hashlib.sha1()
    keys = ['seq', 'study', 'contig', 'start', 'ref', 'alt']
    h.update('_'.join([str(variant_rec[key]) for key in keys]).encode())
    return h.hexdigest().upper()


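# A minimal usage sketch of get_SHA1; the helper and the variant record values below are
# hypothetical, for illustration only:
def _example_get_sha1():
    variant_rec = {'seq': 'GCA_000001895.4', 'study': 'PRJEB00000', 'contig': 'KL567881.1',
                   'start': 12345, 'ref': 'A', 'alt': 'T'}
    # Returns the uppercase hex SHA1 of 'seq_study_contig_start_ref_alt'.
    return get_SHA1(variant_rec)
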
def get_contig_equivalents():
    return {
        '1_random.1': 'AABR07046142.1',
        '1_random.15': 'KL567881.1',
        '1_random.2': 'KL567884.1',
        '1_random.21': 'KL567886.1',
        '1_random.23': 'KL567887.1',
        '1_random.27': 'KL567889.1',
        '1_random.4': 'KL567892.1',
Example #26
def main():
    argparse = ArgumentParser()
    argparse.add_argument(
        '--input',
        help='Path to the file containing the taxonomies and assemblies',
        required=True)
    argparse.add_argument(
        '--properties_dir',
        help=
        'Path to the directory where the release1 application.properties are stored',
        required=True)
    argparse.add_argument(
        '--assembly_dirs',
        help=
        'Path to the directory containing pre-downloaded species assemblies',
        required=True,
        nargs='+')
    argparse.add_argument(
        '--download_dir',
        help=
        'Path to the temporary directory where additional species assemblies will be downloaded',
        required=True,
    )
    argparse.add_argument(
        '--release2_reference_folder',
        help=
        'Path to the directory where selected fasta and report will be copied',
        required=True)
    argparse.add_argument(
        '--output_assemblies_tsv',
        help=
        'Path to the tsv file that will contain the list of assemblies to process',
        required=True)
    argparse.add_argument(
        '--output_taxonomy_tsv',
        help=
        'Path to the tsv file that will contain the list of species to process',
        required=True)
    argparse.add_argument(
        '--eva_accession_path',
        help=
        'Path to the directory that contains the eva-accession code and the private JSON file.'
    )
    argparse.add_argument("--private_config_xml_file",
                          help="ex: /path/to/eva-maven-settings.xml",
                          required=True)
    argparse.add_argument('--debug',
                          help='Set logging level to debug',
                          action='store_true',
                          default=False)

    args = argparse.parse_args()
    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(level=logging.DEBUG)

    global eva_accession_path
    if args.eva_accession_path:
        eva_accession_path = args.eva_accession_path

    aggregate_list_of_species(args.input, args.properties_dir,
                              args.assembly_dirs, args.download_dir,
                              args.release2_reference_folder,
                              args.output_assemblies_tsv,
                              args.output_taxonomy_tsv,
                              args.private_config_xml_file)