def _populate_lib_info(self, sample_dir):
    """Build an ``Info`` record from the QC output files of one sample directory.

    The sample name is the basename of ``sample_dir`` and the project name is
    the basename of its parent directory. Apart from the R1 fastq file, every
    input file is optional: when one is absent the matching elements are
    simply left out of the record (a critical message is logged for the
    high-depth yaml and callable bed files).

    Most metric files are searched for directly in ``sample_dir`` first and
    then in its ``.qc`` sub-directory.

    :param sample_dir: path to the sample directory (made absolute here).
    :return: the populated ``Info`` object.
    :raises IndexError: if no ``*_R1.fastq.gz`` file exists in ``sample_dir``.
    """
    lib_info = Info()
    sample_dir = os.path.abspath(sample_dir)
    sample_name = os.path.basename(sample_dir)
    project_name = os.path.basename(os.path.dirname(sample_dir))

    def _locate(file_pattern):
        """Return the first match in sample_dir, else in sample_dir/.qc, else None."""
        for folder in (sample_dir, os.path.join(sample_dir, '.qc')):
            matches = glob.glob(os.path.join(folder, file_pattern))
            if matches:
                return matches[0]
        return None

    lib_info[ELEMENT_SAMPLE_INTERNAL_ID] = sample_name
    lib_info[ELEMENT_LIBRARY_INTERNAL_ID] = sample_name
    lib_info[ELEMENT_PROJECT] = project_name
    plate, well = self.get_plate_id_and_well(sample_name)
    lib_info[ELEMENT_SAMPLE_PLATE] = plate
    lib_info[ELEMENT_SAMPLE_PLATE_WELL] = well

    # The external sample name is derived from the R1 fastq file name, so that
    # file is mandatory. Keep raising IndexError (what the bare [0] raised)
    # but say what is actually missing.
    fastq_suffix = "_R1.fastq.gz"
    fastq_files = glob.glob(os.path.join(sample_dir, "*" + fastq_suffix))
    if not fastq_files:
        raise IndexError('No *%s file found in %s' % (fastq_suffix, sample_dir))
    external_sample_name = os.path.basename(fastq_files[0])[:-len(fastq_suffix)]
    lib_info[ELEMENT_SAMPLE_EXTERNAL_ID] = external_sample_name

    fastqc_file = os.path.join(sample_dir, external_sample_name + "_R1_fastqc.html")
    if os.path.exists(fastqc_file):
        nb_reads = int(get_nb_sequence_from_fastqc_html(fastqc_file))
        lib_info[ELEMENT_NB_READS_PASS_FILTER] = nb_reads
        # NOTE(review): 300 is hard-coded -- presumably the number of bases
        # per read pair (e.g. 2 x 150); confirm against the sequencing setup.
        lib_info[ELEMENT_NB_BASE] = nb_reads * 300

    bamtools_path = _locate('bamtools_stats.txt')
    if bamtools_path:
        total_reads, mapped_reads, duplicate_reads, proper_pairs = parse_bamtools_stats(bamtools_path)
        lib_info[ELEMENT_NB_READS_IN_BAM] = int(total_reads)
        lib_info[ELEMENT_NB_MAPPED_READS] = int(mapped_reads)
        lib_info[ELEMENT_NB_DUPLICATE_READS] = int(duplicate_reads)
        lib_info[ELEMENT_NB_PROPERLY_MAPPED] = int(proper_pairs)

    yaml_metric_path = _locate('*%s-sort-highdepth-stats.yaml' % external_sample_name)
    if yaml_metric_path:
        lib_info[ELEMENT_MEDIAN_COVERAGE] = parse_highdepth_yaml_file(yaml_metric_path)
    else:
        logging.critical('Missing %s-sort-highdepth-stats.yaml', sample_name)

    bed_file_path = _locate('*%s-sort-callable.bed' % external_sample_name)
    if bed_file_path:
        coverage_per_type = parse_callable_bed_file(bed_file_path)
        total = sum(coverage_per_type.values())
        if total:
            # A bed file without any CALLABLE region means 0% callable, not
            # an error; float() keeps this a true ratio on Python 2 as well.
            callable_bases = coverage_per_type.get('CALLABLE', 0)
            lib_info[ELEMENT_PC_BASES_CALLABLE] = float(callable_bases) / total
        else:
            logging.critical('No coverage reported in %s', bed_file_path)
    else:
        logging.critical('Missing *%s-sort-callable.bed', sample_name)

    sex_file_path = _locate('%s.sex' % external_sample_name)
    if sex_file_path:
        with open(sex_file_path) as open_file:
            sex = open_file.read().strip()
        gender_from_lims = self.get_sex_from_lims(sample_name)
        lib_info[ELEMENT_GENDER] = match_gender(sex, gender_from_lims)

    # The genotype validation file is only looked up in sample_dir itself
    # (no '.qc' fallback).
    genotype_file_paths = glob.glob(
        os.path.join(sample_dir, '%s_genotype_validation.txt' % external_sample_name)
    )
    if genotype_file_paths:
        samples = parse_genotype_concordance(genotype_file_paths[0])
        counts = samples.get(sample_name)
        total_snps = sum(counts.values()) if counts else 0
        if total_snps:
            no_call = counts.get('no_call_seq', 0) + counts.get('no_call_chip', 0)
            matching = counts.get('matching_snps', 0)
            # The mismatch ratio used to be computed from the *matching* count.
            # NOTE(review): assumes every SNP that is neither matching nor a
            # no-call is a mismatch -- confirm against the categories returned
            # by parse_genotype_concordance.
            mismatching = total_snps - matching - no_call
            lib_info[ELEMENT_GENOTYPE_PC_NOCALL] = float(no_call) / float(total_snps)
            lib_info[ELEMENT_GENOTYPE_PC_MISMATCH] = float(mismatching) / float(total_snps)
        else:
            logging.critical(
                'No genotype counts for %s in %s', sample_name, genotype_file_paths[0]
            )

    return lib_info
def test_parse_bamtools_stats(self):
    """Check the tuple parse_bamtools_stats returns for the fixture stats file.

    Elsewhere in this file the returned 4-tuple is unpacked as
    (total_reads, mapped_reads, duplicate_reads, proper_pairs).
    """
    expected_counts = (988805087, 975587288, 171911966, 949154225)
    parsed_counts = parse_bamtools_stats(self.bamtools_stat_file)
    self.assertEqual(parsed_counts, expected_counts)