def create_dataset(
        project,
        analysis_type,
        source_file_path,
        is_loaded=False,
        loaded_date=None,
        dataset_id=None):

    # compute a dataset_id based on source_file_path
    if dataset_id is None:
        file_stats = get_file_stats(source_file_path)
        dataset_id = "_".join(map(str, [
            datetime.datetime.fromtimestamp(float(file_stats.ctime)).strftime('%Y%m%d'),
            os.path.basename(source_file_path).split(".")[0][:20],
            file_stats.size,
        ]))

    # create the Dataset
    dataset = Dataset.objects.create(
        project=project,
        analysis_type=analysis_type,
        dataset_id=dataset_id,
        source_file_path=source_file_path,
        is_loaded=is_loaded,
        loaded_date=loaded_date,
    )

    return dataset

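# Example of the dataset_id format derived above (hypothetical values): a source
# file "/data/my_callset.vcf.gz" created on 2017-03-15 with a size of 123456
# bytes would yield
#
#   dataset_id == "20170315_my_callset_123456"
#
# i.e. the creation date, the file's base name truncated to 20 characters, and
# the file size in bytes, joined by underscores.
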
def _validate_dataset_path(dataset_path):
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            raise Exception('"{}" not found'.format(dataset_path))

        # check that dataset_path is accessible
        dataset_file_stats = get_file_stats(dataset_path)
        if dataset_file_stats is None:
            raise Exception('Unable to access "{}"'.format(dataset_path))
    except Exception as e:
        raise Exception("Dataset path error: " + str(e))

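# Usage sketch for _validate_dataset_path() (hypothetical path): it returns None
# on success and re-raises any failure as an Exception prefixed with
# "Dataset path error: ", so callers only need a single except clause:
#
#   try:
#       _validate_dataset_path("/data/my_callset.vcf.gz")
#   except Exception as e:
#       errors.append(str(e))
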
def validate_dataset(
        project,
        sample_type,
        analysis_type,
        genome_version,
        dataset_path,
        max_edit_distance=0,
        dataset_id=None):
    """Validates the given dataset.

    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
    Return:
        (errors, warnings, info) tuple

    Dataset.ANALYSIS_TYPE_VARIANT_CALLS
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([dataset_path.endswith(suffix) for suffix in ('.txt', '.tsv', '.xls', '.xlsx')]):
            errors.append("BAM / CRAM table must have a .txt, .tsv, .xls or .xlsx extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that dataset file exists
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            # check that dataset_path is accessible
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            sample_ids = _validate_vcf(
                dataset_path, sample_type=sample_type, genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
        )

        if len(matched_sample_id_to_sample_record) == 0:
            all_vcf_sample_id_count = len(sample_ids)
            all_project_sample_id_count = len(Sample.objects.filter(
                individual__family__project=project, sample_type=sample_type))
            errors.append(
                "None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
            return errors, warnings, info

        # if a Dataset record exists, retrieve it and check if it's already been loaded previously
        try:
            dataset = get_dataset(
                project=project,
                analysis_type=analysis_type,
                genome_version=genome_version,
                source_file_path=dataset_path,
                #elasticsearch_host=elasticsearch_host,
                #elasticsearch_index=elasticsearch_index,
                #is_loaded=is_loaded,
            )
        except ObjectDoesNotExist:
            # no existing Dataset record means there is nothing more to check
            logger.warning("No existing dataset found")
            return errors, warnings, info

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" % (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info

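# Usage sketch for validate_dataset(). The project object and argument values
# below are hypothetical, shown only to illustrate the (errors, warnings, info)
# contract a caller is expected to handle:
#
#   errors, warnings, info = validate_dataset(
#       project,
#       sample_type="WES",                    # assumed sample type label
#       analysis_type=Dataset.ANALYSIS_TYPE_VARIANT_CALLS,
#       genome_version="37",                  # assumed genome version label
#       dataset_path="/data/my_callset.vcf.gz",
#   )
#   if errors:
#       ...  # surface errors to the user and stop before loading
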
def add_dataset(
        project,
        sample_type,
        analysis_type,
        genome_version,
        dataset_path,
        max_edit_distance=0,
        dataset_id=None,
        name=None,
        description=None,
        ignore_extra_samples_in_callset=False):
    """Validates the given dataset and adds it to the project.

    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
        ignore_extra_samples_in_callset (bool):
    Return:
        (errors, warnings, info) tuple

    Dataset.ANALYSIS_TYPE_VARIANT_CALLS
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([dataset_path.endswith(suffix) for suffix in ('.txt', '.tsv', '.xls', '.xlsx')]):
            errors.append("BAM / CRAM table must have a .txt, .tsv, .xls or .xlsx extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that dataset file exists
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            # check that dataset_path is accessible
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            all_vcf_sample_ids = _validate_vcf(
                dataset_path, sample_type=sample_type, genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=all_vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_sample_records=True,
        )

        if not ignore_extra_samples_in_callset and len(matched_sample_id_to_sample_record) < len(all_vcf_sample_ids):
            errors.append(
                "Matches not found for VCF sample ids: " +
                ", ".join(set(all_vcf_sample_ids) - set(matched_sample_id_to_sample_record.keys())) +
                ". Select the 'Ignore extra samples in callset' checkbox to ignore this.")

        if len(matched_sample_id_to_sample_record) == 0:
            all_vcf_sample_id_count = len(all_vcf_sample_ids)
            errors.append(
                "None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
            return errors, warnings, info

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_elasticsearch_dataset(
            project=project,
            analysis_type=analysis_type,
            genome_version=genome_version,
            source_file_path=dataset_path,
            elasticsearch_index=dataset_id,
        )

        if dataset_id is not None:
            dataset.is_loaded = True
            dataset.loaded_date = timezone.now()

        dataset.name = name
        dataset.description = description
        dataset.save()

        link_dataset_to_sample_records(dataset, matched_sample_id_to_sample_record.values())

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" % (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info
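
# Usage sketch for add_dataset(). All values are hypothetical; passing a
# dataset_id (used as the elasticsearch index name) marks the Dataset as loaded
# and links it to the matched Sample records:
#
#   errors, warnings, info = add_dataset(
#       project,
#       sample_type="WES",                    # assumed sample type label
#       analysis_type=Dataset.ANALYSIS_TYPE_VARIANT_CALLS,
#       genome_version="37",                  # assumed genome version label
#       dataset_path="/data/my_callset.vcf.gz",
#       dataset_id="my_callset_index",        # hypothetical index name
#       ignore_extra_samples_in_callset=True,
#   )
#   for message in warnings + info:
#       logger.info(message)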