import re
import sys

def readAnnoFile(anno_filename):
    libraries_by_master_id = {}
    remaps = {}
    with open(anno_filename, errors='surrogateescape') as anno_file:
        headers = anno_file.readline().split('\t')
        num_headers = len(headers)
        # find header indices, and map them to correct IDs
        master_id_index = headers.index('Master ID')
        libraries_index = headers.index('LibraryID(s)')
        for line in anno_file:
            try:
                fields = re.split('\t|\n', line)
                master_id = fields[master_id_index]
                libraries = fields[libraries_index].split(',')
                master_id_number = int(master_id[1:])  # remove leading 'I'
                libraries_by_master_id[master_id_number] = libraries
                for library in libraries:
                    library_id = LibraryID(library)
                    # each library sample must map to exactly one master ID
                    if library_id.sample in remaps and master_id_number != remaps[library_id.sample]:
                        raise ValueError('{} maps to {} and {}'.format(library_id.sample, master_id_number, remaps[library_id.sample]))
                    remaps[library_id.sample] = master_id_number
            except Exception:
                # report unparseable anno lines instead of aborting the whole read
                print(line, file=sys.stderr)
    return libraries_by_master_id, remaps
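# readAnnoFile and the code below lean on a LibraryID helper that parses a
# library ID string and exposes its numeric sample component. A minimal
# sketch, assuming a hypothetical ID form 'S<sample>.E<extract>.L<library>'
# such as 'S1234.E1.L1'; the real class may parse more fields.
class LibraryID:
    def __init__(self, library_id_string):
        match = re.fullmatch(r'S(\d+)\.E(\d+)\.L(\d+)', library_id_string.strip())
        if match is None:
            raise ValueError('not a library ID: {}'.format(library_id_string))
        self.sample = int(match.group(1))   # numeric sample component, e.g. 1234
        self.extract = int(match.group(2))
        self.library = int(match.group(3))

    def __str__(self):
        return 'S{:d}.E{:d}.L{:d}'.format(self.sample, self.extract, self.library)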
def read_pipeline_analysis_report(pipeline_report_filename, library_headers, library_info, sample_headers, sample_info, field_failures):
    with open(pipeline_report_filename) as f:
        f.readline()  # first line is read count
        header_line = f.readline()  # second line is header fields
        headers = re.split('\t|\n', header_line)
        report_library_id_index = headers.index('library_id')
        experiment_index = headers.index('experiment')
        # each line is one library
        # iterate through report libraries and update corresponding library info
        for line in f:
            fields = re.split('\t|\n', line)
            library_id = fields[report_library_id_index]
            experiment = fields[experiment_index]
            if library_id.startswith('S'):  # skip blank IDs and controls such as 'Contl.Capture'
                current_library = library_info[library_id]
                # sample file data
                sample_id = None  # bound before the try so the except block can report it
                try:
                    sample_id = LibraryID(library_id).sample
                    for sample_header in header_mapping_sample:
                        index = sample_headers.index(sample_header)
                        value = sample_info['S' + str(sample_id)][index]
                        if value != '..':
                            mapped_header = header_mapping_sample[sample_header]
                            current_library[library_headers.index(mapped_header)] = value
                except Exception as exception:
                    print(exception, file=sys.stderr)
                    print('missing: {}'.format(sample_id), file=sys.stderr)
                if len(fields) == len(headers):  # lines with no data have fewer fields than headers
                    if '1240k' in experiment:
                        replace_capture_fields(current_library, library_headers, fields, headers, field_failures)
                    elif 'Raw' in experiment:
                        replace_shotgun_fields(current_library, library_headers, fields, headers, field_failures)
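# header_mapping_sample above is a module-level dict mapping sample-file
# headers to the anno library headers they populate. A hypothetical example of
# its shape only; the real key/value pairs come from the project's sample sheet:
header_mapping_sample = {
    'Skeletal code': 'Skeletal Code',
    'Country': 'Country',
    'Locality': 'Locality',
}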
    libraries.append(library_info)

# deduplicate
pool = Pool(processes=args.num_threads)
results = []
for library in libraries:
    source = str(prefix_path / library.input_bam)
    destination = library.deduplicated_bam
    print('{}\t{}'.format(source, destination), file=sys.stderr)
    result = pool.apply_async(deduplicate_bam, args=(source, destination, args.picard))
    results.append(result)
pool.close()
pool.join()
for result in results:
    result.get()  # re-raise any exception from the worker

# read groups
pool = Pool(processes=args.num_threads)
results = []
for library in libraries:
    lib_obj = LibraryID(library.library_id)
    individual = 'I{:04d}'.format(lib_obj.sample)
    print('{}'.format(library.library_id), file=sys.stderr)
    result = pool.apply_async(add_read_groups, args=(args.adna_jar, library.deduplicated_bam, library.final_bam, library.date, library.label, library.library_id, individual, args.working_directory, jvm_mem_string, leniency))
    results.append(result)
pool.close()
pool.join()
for result in results:
    result.get()
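# A minimal sketch of the deduplicate_bam worker queued above: it shells out
# to Picard MarkDuplicates. The option list is an assumption; the pipeline's
# real worker may add aDNA-specific options (barcode handling, metrics paths).
import subprocess

def deduplicate_bam(source, destination, picard_jar):
    subprocess.run([
        'java', '-jar', picard_jar, 'MarkDuplicates',
        'I={}'.format(source),
        'O={}'.format(destination),
        'M={}.dedup_stats'.format(destination),
        'REMOVE_DUPLICATES=true',
    ], check=True)  # check=True makes a failed job surface through result.get()
    return destination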
            library_info[library_id][library_headers.index('mtDNA_Haplogroup')]
        ]
        instance.mt_contamination_by_library = [
            library_info[library_id][library_headers.index('mtDNA_Consensus_Match')]
        ]
        logfile, bamfile = library_id_in_pulldown(library_id, args.names, args.pulldown_dir_root)
        instance.pulldown_logfile = logfile
        instance.pulldown_1_sample_id = library_id
        instance.pulldown_3_bam = str(bamfile)
        sample_id = 'S{:d}'.format(LibraryID(library_id).sample)
        instance.sample_id = sample_id
        instance_list[library_id] = instance

# Read in merge lists. Each merge gets an anno entry.
if args.merge_lists is not None:
    for merge_list in args.merge_lists:
        with open(merge_list) as f:
            for line in f:
                fields = re.split('\t|\n', line.rstrip())
                instance_id = fields[0]
                individual_id = fields[1]
                libraries = fields[2:]
                if '' in libraries:  # flag merge entries with empty library fields
                    print(line, file=sys.stderr)
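# The merge-list parsing above implies one tab-separated entry per line:
# instance ID, then individual ID, then one or more library IDs. A
# hypothetical example line (all IDs invented):
#
#   I1234_all	I1234	S1234.E1.L1	S1234.E1.L2
#
# An empty field among the libraries is reported to stderr rather than merged.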
parser.add_argument("-r", "--reference", help="For example: hg19, rsrs", default='hg19')
parser.add_argument("-e", "--experiment", help="Examples: 1240k, BigYoruba", default='1240k')
parser.add_argument("--version_policy", choices=[ONLY, LATEST], default=LATEST, help='Policy for pipeline bams. ONLY raises an exception if there is more than one version.')
parser.add_argument("requested_ids", help="Individual IDs to process from command line", nargs='*')
parser.add_argument("-c", "--copy", action='store_true', help="Copy files to the current directory")
args = parser.parse_args()

requestedIDs = []
if args.filename:
    with open(args.filename) as f:
        for line in f:
            if args.library_filter:  # restrict to IDs that parse as library IDs
                try:
                    libraryID = LibraryID(line)
                    requestedIDs.append(libraryID)
                except Exception:
                    pass  # skip lines that are not library IDs
            else:
                requestedIDs.append(line.strip())
requestedIDs.extend(args.requested_ids)

parent_directory = args.parent_directory
if args.reference == 'rsrs':
    parent_directory = MT_default_dir  # MT bams live under the rsrs-specific directory
requestedIDDict = {x: x for x in requestedIDs}
bam_paths = {}
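# ONLY and LATEST above are module-level policy constants. A minimal sketch of
# how such a policy could gate version selection when several pipeline bam
# versions exist; the constant values and selection logic are assumptions, only
# the names come from the argparse choices:
ONLY = 'only'
LATEST = 'latest'

def apply_version_policy(versions, policy):
    if policy == ONLY and len(versions) > 1:
        raise ValueError('multiple bam versions found: {}'.format(versions))
    return max(versions)  # LATEST: the newest version wins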
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Augment the bam list for a release with a prior existing version of the library")
    parser.add_argument("-a", "--anno", help="Anno file to use for bam hints and read groups", required=True)
    parser.add_argument("-l", "--label", help="Label to apply to instance names", required=True)
    parser.add_argument("libraries", help="Bam list files of library IDs to augment", nargs='+')
    args = parser.parse_args()

    libraries_by_master_id, remaps = readAnnoFile(args.anno)

    master_ids = {}
    for libraries_file in args.libraries:
        with open(libraries_file) as f:
            f.readline()  # skip header
            for line in f:
                fields = re.split('\t|\n', line)
                library_id = LibraryID(fields[0])
                master_id_number = library_id.sample
                if master_id_number in remaps:
                    master_id_number = remaps[master_id_number]
                if master_id_number not in libraries_by_master_id:
                    libraries_by_master_id[master_id_number] = []
                if str(library_id) not in libraries_by_master_id[master_id_number]:
                    libraries_by_master_id[master_id_number].append(str(library_id))
                master_ids[master_id_number] = len(libraries_by_master_id[master_id_number])

    for master_id_number, count in master_ids.items():
        if count > 1:
            print('I{:04d}_{}\t{}'.format(master_id_number, args.label, '\t'.join(libraries_by_master_id[master_id_number])))
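# Worked example of the final print above (values hypothetical): if master
# individual 5678 ends up with two libraries after remapping and --label v1,
# the script emits one tab-separated line:
#
#   I5678_v1	S5678.E1.L1	S5678.E1.L2
#
# Individuals whose count stays at 1 are skipped, since a single library
# needs no merged instance entry.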