def sample_sheet(sspath):
    """Map sample IDs to their selection-criteria value from a sample sheet.

    Args:
        sspath: Path to an Illumina sample sheet readable by ``SampleSheet``.

    Returns:
        dict: ``{sample_id: criteria}`` where ``sample_id`` is the first
        '_'-separated token of ``Sample_Name`` and ``criteria`` is an int.
        Controls and already-uploaded (re-sequenced) samples are excluded.
    """
    sheet = SampleSheet(sspath)
    data = {}
    for sample in sheet.samples:
        sample_name = sample['Sample_Name']
        # Split once and reuse; the original split the same string four times.
        fields = sample['Description'].split("_")
        # Criterion 7 is the sentinel for missing information
        # ('Information saknas') when the description is malformed.
        if len(fields) != 15 or fields[-1] == 'null':
            sample_criteria = 7
        else:
            sample_criteria = int(fields[-1])
        # Skip controls and eurofins samples
        if sample_name.startswith(('PosCtrl', 'PosKon', 'NegCtrl', 'NegKon')):
            continue
        # Skip samples set to runType = 01 (desc. field 3).
        # These are samples which have been re-sequenced, so they have
        # already been uploaded to GENSAM.
        if fields[2] == '01':
            continue
        # Populate a dictionary with samples and their selection criteria value.
        sample_id = sample_name.split('_')[0]
        data[sample_id] = sample_criteria
    return data
def sample_sheet_criteria(path, inhouse_dict, week, run_name):
    """Tally samples into selection-criteria buckets for one run.

    Args:
        path: Path to the sample sheet.
        inhouse_dict: Nested mapping ``{week: {run_name: {sample_id: ...}}}``
            of known in-house samples.
        week: Key into ``inhouse_dict``.
        run_name: Key into ``inhouse_dict[week]``.

    Returns:
        dict: Counts under ``'general'`` (criteria 1/8/9), ``'unknown'``
        (criterion 7 or unparsable), and ``'selection'`` (everything else
        with a criterion <= 10).
    """
    sheet = SampleSheet(path)
    data = {'general': 0, 'selection': 0, 'unknown': 0}
    for sample in sheet.samples:
        sample_name = sample['Sample_ID']
        description = sample['Description']
        # Skip controls, or samples missing from the in-house lookup.
        if sample_name.lower().startswith(('neg', 'pos')):
            continue
        if sample_name not in inhouse_dict[week][run_name]:
            continue
        # Check if the last element is the selection criterion.
        # NOTE(review): this is a pretty weak check.
        criterion = description.split("_")[-1]
        if criterion.isdigit() and int(criterion) <= 10:
            # Convert once instead of re-parsing in every comparison.
            value = int(criterion)
            if value in (1, 8, 9):
                data['general'] += 1
            elif value == 7:
                data['unknown'] += 1
            else:
                data['selection'] += 1
        else:
            data['unknown'] += 1
    return data
def writeSammpleSheets(sample_list, sheet_path, template_sheet):
    """Write one custom sample sheet per group in *sample_list*.

    Args:
        sample_list: Mapping of grouping keys ``(dataset, idx_len1, idx_len2)``
            to lists of samples.
        sheet_path: Path of the original sample sheet; outputs are written
            next to it as ``<name>.custom.<n>.<dataset>``.
        template_sheet: SampleSheet whose Header/Reads/Settings are copied
            into every custom sheet.

    Returns:
        str: ``"success"``, or ``"failure"`` if any sheet failed to write.
    """
    samplesheet_name = os.path.basename(sheet_path)
    samplesheet_dir = os.path.dirname(os.path.realpath(sheet_path))
    exit_status = "success"
    for count, key in enumerate(sample_list, start=1):
        logger.debug(
            f"{len(sample_list[key])} samples with idx lengths {key[1]}/{key[2]} for {key[0]} dataset"
        )
        # Clone the template's metadata sections into a fresh sheet.
        custom_sheet = SampleSheet()
        custom_sheet.Header = template_sheet.Header
        custom_sheet.Reads = template_sheet.Reads
        custom_sheet.Settings = template_sheet.Settings
        for sample in sample_list[key]:
            custom_sheet.add_sample(sample)
        new_sample_sheet_file = os.path.join(
            samplesheet_dir, f"{samplesheet_name}.custom.{count}.{key[0]}")
        logger.info(f"Creating custom sample sheet: {new_sample_sheet_file}")
        try:
            with open(new_sample_sheet_file, "w") as ss_writer:
                custom_sheet.write(ss_writer)
        except Exception as error:
            logger.error(f"Exception writing new sample sheet: {error}")
            exit_status = "failure"
        logger.debug(f"Created custom sample sheet: {new_sample_sheet_file}")
    return exit_status
def main(samplesheet_file_path, check_only):
    """Load a sample sheet and, unless *check_only* is set, split it into
    one or more custom sheets by index/technology.

    Consistency checks are currently disabled (kept below for reference).
    """
    logger.info(f"Checking SampleSheet {samplesheet_file_path}")
    original_sample_sheet = SampleSheet(samplesheet_file_path)

    # Run some consistency checks (disabled):
    # import_library_sheet_from_google('2019')
    # TODO: replace has_error return with enum and expand to error, warning, info?
    # has_header_error = checkSampleSheetMetadata(original_sample_sheet)
    # has_id_error = checkSampleAndLibraryIdFormat(original_sample_sheet)
    # has_index_error = checkSampleSheetForIndexClashes(original_sample_sheet)
    # has_metadata_error = checkMetadataCorrespondence(original_sample_sheet)
    # Only fail on metadata or id errors:
    # if has_header_error or has_id_error or has_index_error or has_metadata_error:
    #     raise ValueError(f"Validation detected errors. Please review the error logs!")

    # Split and write individual SampleSheets, based on indexes and technology (10X).
    if not check_only:
        # Sort samples by technology (truseq/10X and/or index length),
        # replacing N indexes with "".
        sorted_samples = getSortedSamples(original_sample_sheet)
        # Write one or more custom sample sheets (which may be the same as
        # the original if no processing was necessary).
        logger.info(f"Writing {len(sorted_samples)} sample sheets.")
        writeSammpleSheets(sample_list=sorted_samples,
                           sheet_path=samplesheet_file_path,
                           template_sheet=original_sample_sheet)

    logger.info("All done.")
def sample_sheet(path, run):
    """Extract per-sample metadata from a sample sheet and dump it as JSON.

    Args:
        path: Path to the sample sheet.
        run: Run name; selects the output directory and file name under
            ``/medstore/results/clinical/SARS-CoV-2-typing/nextseq_data/``.

    Side effects:
        Writes ``{run}_metadata.json``; returns nothing.
    """
    sheet = SampleSheet(path)
    # Order matters: it defines the key order in the emitted JSON.
    field_names = ('referensnummer', 'date', 'runtype', 'age', 'gender',
                   'lab_reference', 'postalcode', 'ct_value')
    data = {}
    for sample in sheet.samples:
        sample_name = sample['Sample_ID']
        # Skip controls
        if sample_name.startswith(('NegCtrl', 'PosCtrl', 'PosKon', 'NegKon')):
            continue
        # Split once; the original re-split the description for every field.
        # Indexing (rather than zip) preserves the original IndexError on
        # descriptions with fewer than 8 fields.
        fields = sample['Description'].split("_")
        data[sample_name] = [
            {name: fields[i] for i, name in enumerate(field_names)}
        ]
    with open(f"/medstore/results/clinical/SARS-CoV-2-typing/nextseq_data/{run}/metadata/{run}_metadata.json", 'w') as outfile:
        json.dump(data, outfile, indent=4)
def parse_samplesheet(args):
    """Build a per-sample DataFrame from a SampleSheet, annotated with CLI args.

    Exits with status 1 when lanes are missing (and --merge-lanes is not set)
    or when neither Sample_ID nor Sample_Name is present.
    """
    sheet = SampleSheet(args.input)
    df = pd.DataFrame([s.to_json() for s in sheet.samples])
    df["is_umi"] = args.is_umi
    df["fwd_adapter"] = args.fwd_adapter
    df["rev_adapter"] = args.rev_adapter
    if args.merge_lanes:
        log.info("Merging samples across all lanes!")
        df["Lane"] = "all"
    elif "Lane" not in df:
        log.error("No lanes specified in SampleSheet.csv; use --merge-lanes or update sample sheet.")
        sys.exit(1)
    df["Sample_Project"] = args.project_name
    df["library_type"] = args.library_type
    # Mirror whichever of Sample_ID / Sample_Name exists into the other column.
    if "Sample_ID" in df:
        df["Sample_Name"] = df.Sample_ID
    elif "Sample_Name" in df:
        df["Sample_ID"] = df.Sample_Name
    else:
        log.error("Samplesheet must specify Sample_ID or Sample_Name!")
        sys.exit(1)
    # Merging lanes can produce duplicate rows; drop them.
    return df.drop_duplicates()
def create_investigators(samplesheet_path):
    """Build Investigator objects from the sheet header's investigator list.

    Args:
        samplesheet_path: Path to a sample sheet whose Header contains an
            'Investigator Name' field with ';'-separated initials.

    Returns:
        dict: ``{initials: Investigator(initials)}`` for each entry.
    """
    header_info = SampleSheet(samplesheet_path).Header
    initials_list = header_info['Investigator Name'].split(";")
    # Dict comprehension replaces the manual build loop; later duplicates
    # overwrite earlier ones, exactly as the original assignment did.
    return {initials: Investigator(initials) for initials in initials_list}
def __parse_sample_sheet(self, path_to_sample_sheet):
    """Parse a sample sheet file into a normalized, lower-cased dict.

    Args:
        path_to_sample_sheet: Path passed to ``SampleSheet``.

    Returns:
        dict with keys 'header' (selected fields, None when absent),
        'reads' (list), 'settings' (lower/snake-cased keys), and
        'data' (per-sample dicts with lower-cased keys). On a parse
        failure the defaults (all None/empty) are returned.
    """
    parsed_data = {
        "header": {
            "experiment_name": None,
            "instrument_type": None,
            "investigator_name": None,
            "workflow": None,
            "chemistry": None,
        },
        "reads": [],
        "settings": {},
        "data": [],
    }
    try:
        sample_sheet = json.loads(
            SampleSheet(path_to_sample_sheet).to_json())
    except Exception as e:
        # BUG FIX: the original printed the error and fell through to a
        # NameError on the unbound 'sample_sheet'; return the defaults
        # instead so callers get a well-formed (empty) structure.
        print(e)
        return parsed_data

    header_key_map = {
        'Experiment Name': 'experiment_name',
        'Instrument Type': 'instrument_type',
        'Investigator Name': 'investigator_name',
        'Workflow': 'workflow',
        'Chemistry': 'chemistry',
    }
    for sheet_key, message_key in header_key_map.items():
        try:
            parsed_data['header'][message_key] = sample_sheet['Header'][
                sheet_key]
        except KeyError as e:
            # Missing header fields are tolerated; keep the None default.
            # (Narrowed from 'except Exception' — only key lookups can fail here.)
            print(e)

    for read in sample_sheet['Reads']:
        parsed_data['reads'].append(read)
    for key, val in sample_sheet['Settings'].items():
        # Normalize setting names: ReverseComplement -> reverse_complement,
        # everything else just lower-cased.
        key = 'reverse_complement' if key == 'ReverseComplement' else key.lower()
        parsed_data['settings'][key] = val
    for sample in sample_sheet['Data']:
        parsed_data['data'].append(
            {key.lower(): val for key, val in sample.items()})
    return parsed_data
def setUp(self):
    """Build a five-sample sheet and wire it into the handler fixtures."""
    qc_config = {
        'name': 'UnidentifiedIndexHandler',
        'significance_threshold': 1,
        'white_listed_indexes': ['.*N.*', 'G{8,}'],
    }
    self.unidentifiedIndexHandler = UnidentifiedIndexHandler(qc_config)

    conversion_results_key = "ConversionResults"
    conversion_results = get_stats_json()["ConversionResults"]
    samplesheet_key = "samplesheet"

    self.samplesheet = SampleSheet()
    sample_specs = [
        dict(Lane=1, Sample_ID='1823A', Sample_Name='1823A-tissue', index='AAAA'),
        dict(Lane=2, Sample_ID='1823B', Sample_Name='1823B-tissue', index='TTTT'),
        dict(Lane=3, Sample_ID='1823C', Sample_Name='1823C-tissue',
             index='AAAA', index2='TTTT'),
        dict(Lane=4, Sample_ID='1823D', Sample_Name='1823D-tissue',
             index='GGGG', index2='CCCC'),
        # NOTE(review): Sample_Name repeats '1823D-tissue' here, exactly as in
        # the original fixture — presumably intentional; confirm if surprising.
        dict(Lane=6, Sample_ID='1823E', Sample_Name='1823D-tissue', index='ATCG'),
    ]
    for spec in sample_specs:
        self.samplesheet.add_sample(Sample(spec))

    self.unidentifiedIndexHandler.collect(
        (conversion_results_key, conversion_results))
    self.unidentifiedIndexHandler.collect(
        (samplesheet_key, self.samplesheet))
    self.samplesheet_searcher = _SamplesheetSearcher(self.samplesheet)
def build_samplesheet(df, args):
    """Convert DataFrame rows into a SampleSheet.

    Each row contributes Sample_ID/Sample_Name/Sample_Project/index/index2;
    Lane is included only when lanes were not merged.
    """
    sheet = SampleSheet()
    for _, row in df.iterrows():
        record = {
            'Sample_ID': row.Sample_ID,
            'Sample_Name': row.Sample_Name,
            'Sample_Project': row.Sample_Project,
            # 'index' clashes with DataFrame.index, so use item access.
            'index': row["index"],
            'index2': row.index2,
        }
        if not args.merge_lanes:
            record["Lane"] = row.Lane
        sheet.add_sample(Sample(record))
    return sheet
def make_sample_sheet(body: Mapping[str, Any], adapter_result_type=None) -> SampleSheet:
    """Assemble a SampleSheet from the workflow activity described in *body*.

    Args:
        body: Message payload containing a workflow activity with its samples.
        adapter_result_type: Forwarded to ``sample_records``.

    Returns:
        A SampleSheet populated with records for every sample in the activity.
    """
    activity = body[WORKFLOW_ACTIVITY]
    activity_id = activity[ID]
    sheet = SampleSheet()
    for sample in activity[WORKFLOW][SAMPLES]:
        records = sample_records(activity_id, sample,
                                 adapter_result_type=adapter_result_type)
        sheet.add_samples(records)
    return sheet
def get_sample_sheet_info(sample_sheet_path, header_name, data_name):
    """Flatten a sample sheet's Header/Reads/Settings/Data into one DataFrame.

    Args:
        sample_sheet_path: Path to the sample sheet; path components 3 and 4
            are also used to derive the run barcode — assumes a fixed
            directory layout (TODO confirm against callers).
        header_name: Header columns that must exist (filled with NaN if absent).
        data_name: Data columns that must exist (filled with NaN if absent).

    Returns:
        DataFrame with one row per sample plus broadcast header/settings/reads
        columns; all missing values replaced by '-'.
    """
    sample_sheet = SampleSheet(sample_sheet_path)

    ## Header section
    header = pd.DataFrame(list(sample_sheet.Header.values()),
                          index=list(sample_sheet.Header.keys())).transpose()
    for col in header_name:
        if col not in header.columns:
            header[col] = np.nan
    header = header.rename(columns={'Description': 'Header.Description'})

    ## Reads section — assumes exactly two read lengths (paired-end);
    ## TODO confirm for single-end sheets.
    reads = pd.DataFrame(sample_sheet.Reads, index=['Read1', 'Read2']).transpose()

    ## Settings section
    if len(sample_sheet.Settings) == 0:
        setting = pd.DataFrame({'Adapter': [np.nan]})
    else:
        setting = pd.DataFrame(list(sample_sheet.Settings.values()),
                               index=list(
                                   sample_sheet.Settings.keys())).transpose()
        setting = setting.rename(columns={'adapter': 'Adapter'})

    ## Data section
    path_parts = sample_sheet_path.split('/')
    run_name = '_'.join([path_parts[3], path_parts[4].split('_')[2]])
    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0 (and
    # was quadratic when called in a loop); build all rows in one call instead.
    data = pd.DataFrame([dict(s) for s in sample_sheet.samples])
    for col in data_name:
        if col not in data.columns:
            data[col] = np.nan
    data['Run_barcode'] = run_name  # + '_' + data['Sample_ID'].astype(str)
    data = data.rename(columns={'Description': 'Data.Description'})

    ## Combine all sections (outer-joined on the index, as before)
    data = pd.concat([data, header], axis=1)
    data = pd.concat([data, setting], axis=1)
    data = pd.concat([data, reads], axis=1)
    data = data.fillna('-')
    return data
def main(samplesheet_file_path, check_only):
    """Validate a sample sheet and, unless *check_only* is set, split it
    into one or more custom sheets by index/technology.

    Raises:
        ValueError: when header, id, or metadata validation fails.
    """
    logger.info(f"Checking SampleSheet {samplesheet_file_path}")
    original_sample_sheet = SampleSheet(samplesheet_file_path)

    # Run some consistency checks: fetch the library tracking sheets for
    # every year referenced by the sample IDs.
    years = get_years_from_samplesheet(original_sample_sheet)
    logger.info(f"Samplesheet contains IDs from {len(years)} years: {years}")
    for year in years:
        library_tracking_spreadsheet[year] = get_library_sheet_from_google(year)
    import_library_sheet_validation_from_google()

    # TODO: replace has_error return with enum and expand to error, warning, info?
    has_header_error = checkSampleSheetMetadata(original_sample_sheet)
    has_id_error = checkSampleAndLibraryIdFormat(original_sample_sheet)
    has_index_error = checkSampleSheetForIndexClashes(original_sample_sheet)
    has_metadata_error = checkMetadataCorrespondence(original_sample_sheet)

    # Index clashes are reported but do not fail the run.
    if has_index_error:
        print(
            "Index errors detected. Note: the pipeline will ignore those, please make sure to review those errors!"
        )
    # Only fail on header, id, or metadata errors.
    if has_header_error or has_id_error or has_metadata_error:
        raise ValueError(
            "Pipeline breaking validation detected errors. Please review the error logs!"
        )

    # Split and write individual SampleSheets, based on indexes and technology (10X).
    if not check_only:
        # Sort samples by technology (truseq/10X and/or index length),
        # replacing N indexes with "".
        sorted_samples = getSortedSamples(original_sample_sheet)
        # Write one or more custom sample sheets (which may be the same as
        # the original if no processing was necessary).
        logger.info(f"Writing {len(sorted_samples)} sample sheets.")
        writeSammpleSheets(sample_list=sorted_samples,
                           sheet_path=samplesheet_file_path,
                           template_sheet=original_sample_sheet)

    logger.info("All done.")
'SampleSheet.csv') else: logger.info( "Processing successful run. Using generated sample sheet(s).") samplesheet_path_pattern = os.path.join(runfolder_base_dir, runfolder, 'SampleSheet.csv.custom.*') samplesheet_paths = glob(samplesheet_path_pattern) if len(samplesheet_paths) < 1: raise ValueError("No sample sheets found!") logger.info(f"Using {len(samplesheet_paths)} sample sheet(s).") for samplesheet in samplesheet_paths: logger.info(f"Processing samplesheet {samplesheet}") name, extension = os.path.splitext(samplesheet) samples = SampleSheet(samplesheet).samples logger.info(f"Found {len(samples)} samples.") for sample in samples: logger.debug( f"Looking up metadata with {sample.Sample_Name} for samplesheet.Sample_ID (UMCCR SampleID); " + f"{sample.Sample_ID} and samplesheet.sample_Name (UMCCR LibraryID): {sample.Sample_Name}" ) column_values = get_meta_data_by_library_id(sample.Sample_Name) fastq_pattern = os.path.join(bcl2fastq_base_dir, runfolder, sample.Sample_Project, sample.Sample_ID, sample.Sample_Name + "*.fastq.gz") s3_fastq_pattern = os.path.join(fastq_hpc_base_dir, runfolder, sample.Sample_Project,
def scan_data(sample_sheet_path):
    """Return the Data-section column names (keys of the first sample).

    NOTE(review): raises IndexError on a sheet with no samples — presumably
    callers guarantee at least one; confirm.
    """
    first_sample = SampleSheet(sample_sheet_path).samples[0]
    return list(dict(first_sample).keys())
def scan_header(sample_sheet_path):
    """Return the Header-section field names for the given sample sheet.

    The original built a one-row pandas DataFrame and transposed it just to
    read its column labels; those labels are exactly the header's keys, so
    list them directly (same result, no pandas round-trip).
    """
    sample_sheet = SampleSheet(sample_sheet_path)
    return list(sample_sheet.Header.keys())