Example #1
0
def readAnnoFile(anno_filename):
	"""Read a tab-delimited anno file and build library/master-ID mappings.

	Returns a tuple (libraries_by_master_id, remaps):
	- libraries_by_master_id: {master ID number (int) -> list of library ID strings}
	- remaps: {sample number (from LibraryID) -> master ID number}

	A sample that maps to two different master IDs is treated as a parse
	failure: the ValueError raised for it is caught by the handler below and
	the offending line is reported to stderr (best-effort parsing; bad lines
	are skipped).
	"""
	libraries_by_master_id = {}
	remaps = {}
	# surrogateescape tolerates undecodable bytes in the anno file
	with open(anno_filename, errors='surrogateescape') as anno_file:
		headers = anno_file.readline().split('\t')
		# find header indices, and map them to correct IDs
		master_id_index = headers.index('Master ID')
		libraries_index = headers.index('LibraryID(s)')

		for line in anno_file:
			try:
				fields = re.split('\t|\n', line)
				master_id = fields[master_id_index]
				libraries = fields[libraries_index].split(',')

				master_id_number = int(master_id[1:]) # remove leading 'I'
				libraries_by_master_id[master_id_number] = libraries
				for library in libraries:
					library_id = LibraryID(library)
					if library_id.sample in remaps and master_id_number != remaps[library_id.sample]:
						raise ValueError('{} maps to {} and {}'.format(library_id.sample, master_id_number, remaps[library_id.sample]))
					remaps[library_id.sample] = master_id_number
			except Exception as exception:
				# was a bare except: narrowed so KeyboardInterrupt/SystemExit
				# propagate, and the cause is reported along with the line
				print(exception, file=sys.stderr)
				print(line, file=sys.stderr)

	return libraries_by_master_id, remaps
def read_pipeline_analysis_report(pipeline_report_filename, library_headers,
                                  library_info, sample_headers, sample_info,
                                  field_failures):
    """Update library_info in place from a pipeline analysis report.

    The report's first line is a read count and its second line holds the
    header fields; each subsequent line describes one library.  For each
    report library whose ID starts with 'S' (a real sample library), copy the
    mapped sample fields into that library's entry in library_info, then
    merge capture or shotgun fields depending on the experiment type.
    """
    with open(pipeline_report_filename) as f:
        f.readline()  # first line is read count
        header_line = f.readline()  # second line is header fields
        headers = re.split('\t|\n', header_line)
        report_library_id_index = headers.index('library_id')
        experiment_index = headers.index('experiment')

        # each line is one library
        # iterate through report libraries and update corresponding library info
        for line in f:
            fields = re.split('\t|\n', line)

            library_id = fields[report_library_id_index]
            experiment = fields[experiment_index]
            if library_id.startswith(
                    'S'):  # is not '' and library_id is not 'Contl.Capture':
                current_library = library_info[library_id]
                # sample file data
                # defined before the try so the except handler below cannot
                # hit a NameError when LibraryID() itself raises
                sample_id = None
                try:
                    sample_id = LibraryID(library_id).sample
                    for sample_header in header_mapping_sample:
                        index = sample_headers.index(sample_header)
                        value = sample_info['S' + str(sample_id)][index]
                        if value != '..':  # '..' marks missing data; keep existing value
                            mapped_header = header_mapping_sample[
                                sample_header]
                            current_library[library_headers.index(
                                mapped_header)] = value
                except Exception as exception:
                    print(exception, file=sys.stderr)
                    print('missing: {}'.format(sample_id), file=sys.stderr)

                if len(fields) == len(
                        headers
                ):  # no data will have fewer fields than headers
                    if '1240k' in experiment:
                        replace_capture_fields(current_library,
                                               library_headers, fields,
                                               headers, field_failures)
                    elif 'Raw' in experiment:
                        replace_shotgun_fields(current_library,
                                               library_headers, fields,
                                               headers, field_failures)
Example #3
0
			libraries.append(library_info)

	# deduplicate: run dedup (via Picard) over each library's input bam, in parallel
	pool = Pool(processes=args.num_threads)
	results = []
	for library in libraries:
		source = str(prefix_path / library.input_bam)
		destination = library.deduplicated_bam
		print('{}\t{}'.format(source, destination), file=sys.stderr)
		result = pool.apply_async(deduplicate_bam, args=(source, destination, args.picard))
		results.append(result)
	pool.close()
	pool.join()
	for result in results:
		result.get()  # re-raises any exception from the worker

	# read groups: add read-group metadata to each deduplicated bam, in parallel
	pool = Pool(processes=args.num_threads)
	results = []
	for library in libraries:
		destination = fields[1]  # NOTE(review): 'fields' is not assigned in this loop (stale from earlier code?) and 'destination' is unused below — looks like copy-paste leftover; confirm and remove
		lib_obj = LibraryID(library.library_id)
		individual = 'I{:04d}'.format(lib_obj.sample)  # sample number -> individual ID, e.g. I0042
		print('{}'.format(library.library_id), file=sys.stderr)
		result = pool.apply_async(add_read_groups, args=(args.adna_jar, library.deduplicated_bam, library.final_bam, library.date, library.label, library.library_id, individual, args.working_directory, jvm_mem_string, leniency))
		results.append(result)
	pool.close()
	pool.join()
	for result in results:
		result.get()  # re-raises any exception from the worker
Example #4
0
                                library_info[library_id][library_headers.index(
                                    'mtDNA_Haplogroup')]
                            ]
                            instance.mt_contamination_by_library = [
                                library_info[library_id][library_headers.index(
                                    'mtDNA_Consensus_Match')]
                            ]

                            logfile, bamfile = library_id_in_pulldown(
                                library_id, args.names, args.pulldown_dir_root)
                            instance.pulldown_logfile = logfile
                            instance.pulldown_1_sample_id = library_id
                            instance.pulldown_3_bam = str(bamfile)

                            sample_id = 'S{:d}'.format(
                                LibraryID(library_id).sample)
                            instance.sample_id = sample_id

                            instance_list[library_id] = instance

    # Read in merge lists. Each merge gets an anno entry.
    # Expected tab-delimited format per line: instance_id, individual_id,
    # then one or more library IDs.
    if args.merge_lists is not None:
        for merge_list in args.merge_lists:
            with open(merge_list) as f:
                for line in f:
                    fields = re.split('\t|\n', line.rstrip())
                    instance_id = fields[0]
                    individual_id = fields[1]
                    libraries = fields[2:]
                    if '' in libraries:
                        # an empty library field indicates a malformed line; report it
                        print(line, file=sys.stderr)
Example #5
0
	parser.add_argument("-r", "--reference", help="For example: hg19, rsrs", default='hg19')
	parser.add_argument("-e", "--experiment", help="Examples: 1240k, BigYoruba", default='1240k')
	parser.add_argument("--version_policy", choices=[ONLY, LATEST], default=LATEST, help='Policy for pipeline bams. Only will raise exception if there is more than one version.')
	parser.add_argument("requested_ids", help="Individual IDs to process from command line", nargs='*')
	parser.add_argument("-c", "--copy", action='store_true', help="Copy file to current directory")
	args = parser.parse_args()

	# requested IDs may come from a file (args.filename, set up in unseen
	# argument definitions above) and/or from positional arguments
	requestedIDs = []

	if args.filename:
		with open(args.filename) as f:
			for line in f:
				if args.library_filter: # restrict to library format
					try:
						libraryID = LibraryID(line)
						requestedIDs.append(libraryID)
					except:
						# NOTE(review): bare except used as a filter to skip
						# non-library lines; narrowing to the exception that
						# LibraryID raises (ValueError, presumably) would avoid
						# masking real errors — confirm against LibraryID
						pass
				else:
					requestedIDs.append(line.strip())

	requestedIDs.extend(args.requested_ids)

	parent_directory = args.parent_directory
	# mitochondrial reference uses its own default directory
	if args.reference == 'rsrs':
		parent_directory = MT_default_dir

	# map each requested ID to itself; values presumably replaced by later lookups
	requestedIDDict = {x : x for x in requestedIDs}

	bam_paths = {}
Example #6
0
	return libraries_by_master_id, remaps

if __name__ == "__main__":
	parser = argparse.ArgumentParser(description="Augment the bam list for a release with a prior existing version of the library")
	parser.add_argument("-a", "--anno", help="Use anno file for bam hints and read groups", required=True)
	parser.add_argument("-l", "--label", help="Label to apply to instance names", required=True)
	# help text was copy-pasted from --anno; describe the positional argument itself
	parser.add_argument("libraries", help="Library list file(s): tab-delimited with a header line, library ID in the first column", nargs='+')
	args = parser.parse_args()

	libraries_by_master_id, remaps = readAnnoFile(args.anno)

	# per master ID, how many libraries it has after merging in the list files
	master_ids = {}
	for libraries_file in args.libraries:
		with open(libraries_file) as f:
			f.readline() # skip header
			for line in f:
				fields = re.split('\t|\n', line)
				library_id = LibraryID(fields[0])
				# follow the anno remapping from sample number to master ID number
				master_id_number = remaps.get(library_id.sample, library_id.sample)
				libraries = libraries_by_master_id.setdefault(master_id_number, [])
				if str(library_id) not in libraries:
					libraries.append(str(library_id))
				master_ids[master_id_number] = len(libraries)

	# emit only individuals with more than one library version
	for master_id_number, count in master_ids.items():
		if count > 1:
			print('I{:04d}_{}\t{}'.format(master_id_number, args.label, '\t'.join(libraries_by_master_id[master_id_number])))