Example #1
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    # Unzip files
    print('Processing raw fastq files')
    annotated_files = []
    for i, f in enumerate(input_files):
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)
        annotated_f = igfft.igfft_multiprocess(f,
                                               file_type='FASTQ',
                                               species=species,
                                               locus=loci,
                                               parsing_settings={
                                                   'isotype':
                                                   isotyping_barcodes,
                                                   'remove_insertions':
                                                   remove_insertions
                                               },
                                               num_processes=number_threads,
                                               delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    output_file_list = ','.join(annotated_files)
    print(output_file_list)
    return output_file_list
def run_gglab_pipeline(input_files, species, loci, group_name=''):
	# Unzip files
	print('Processing raw fastq files')
	processed_files = []
	for i, f in enumerate(input_files):
		folder_path = os.path.dirname(f)
		if f.endswith('.gz'):
			print('Unzipping: ', f)
			f = useful.gunzip_python(f)

		# Run trimmomatic
		trimming_parameters = {
			'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),				
			'MINLEN': min_read_len_post_trim
		}
		method = 'SE'		
		trimmedf = processing.run_trimmomatic(f, folder_path, method, phred_encode, trimming_parameters)[0]		
		# Run quality filtering
		filtered_trimmed_file = fastx.Run_Quality_Filter(trimmedf, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases)		
		os.remove(trimmedf)
		processed_files.append(filtered_trimmed_file)
	
	print('Annotating processed fastq files')
	annotated_files = []
	for i, f in enumerate(processed_files):
		annotated_f = igfft.igfft_multiprocess(f, file_type='FASTQ', species=species, locus=loci, parsing_settings={'isotype': isotyping_barcodes, 'remove_insertions': remove_insertions}, num_processes=number_threads, delete_alignment_file=True)			
		annotated_files.append(annotated_f[0])
	
	print('Pairing sequences')	
	output_dir = os.path.dirname(annotated_files[0])
	pairing.RunPairing(annotated_files, annotated_file_formats='TAB', analysis_method='GEORGIOU_INHOUSE', output_folder_path=output_dir, prefix_output_files=group_name, cluster_cutoff=cluster_setting, annotation_cluster_setting=annotation_cluster_cutoff)
	print('Pipeline complete')
def Run_Quality_Filter(files, output_dir, quality, percent, encoding='-Q33'):

    if not type(files) is list:
        files = [files]

    for i, each_file in enumerate(files):
        if each_file.endswith('.gz'):
            print "Unzipping file: {0}...".format(each_file)
            files[i] = useful.gunzip_python(each_file)
            print "Unzipping complete"

    file_list = 'cat ' + ' '.join(['"' + f + '"' for f in files]) + ' | '

    outfile = os.path.join(
        output_dir,
        os.path.basename(files[0]).replace(
            '.fastq', '')) + '.filtered.{0}.fastq'.format('q' + str(quality) +
                                                          'p' + str(percent))

    print "Running filtering..."
    subprocess.check_output('{3} {5} -v {4} -o "{0}" -q {1} -p {2}'.format(
        outfile, str(quality), str(percent), file_list, encoding,
        fastq_quality_filter_location),
                            shell=True)
    print "filtering complete"

    return outfile
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    # Unzip files
    print('Processing raw fastq files')
    processed_files = []
    for i, f in enumerate(input_files):
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)

        # Run trimmomatic
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        method = 'SE'
        trimmedf = processing.run_trimmomatic(f, folder_path, method,
                                              phred_encode,
                                              trimming_parameters)[0]
        # Run quality filtering
        filtered_trimmed_file = fastx.Run_Quality_Filter(
            trimmedf,
            output_dir=folder_path,
            quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmedf)
        processed_files.append(filtered_trimmed_file)

    print('Annotating processed fastq files')
    annotated_files = []
    for i, f in enumerate(processed_files):
        annotated_f = igfft.igfft_multiprocess(f,
                                               file_type='FASTQ',
                                               species=species,
                                               locus=loci,
                                               parsing_settings={
                                                   'isotype':
                                                   isotyping_barcodes,
                                                   'remove_insertions':
                                                   remove_insertions
                                               },
                                               num_processes=number_threads,
                                               delete_alignment_file=True)
        annotated_files.append(annotated_f[0])

    print('Pairing sequences')
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(annotated_files,
                       annotated_file_formats='TAB',
                       analysis_method='GEORGIOU_INHOUSE',
                       output_folder_path=output_dir,
                       prefix_output_files=group_name,
                       cluster_cutoff=cluster_setting,
                       annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')
Example #5
def run_gglab_pipeline(input_files, species, loci, group_name=''):
	# Unzip files
	print('Processing raw fastq files')
	processed_files = []
	
	for pair_of_files in input_files:		
		folder_path = os.path.dirname(pair_of_files[0])
		for i, f in enumerate(pair_of_files):		
			if f.endswith('.gz'):
				print('Unzipping: ', f)
				pair_of_files[i] = useful.gunzip_python(f)

		# Run trimmomatic
		if trim_seqs:
			print('Trimming low quality bases')
			trimming_parameters = {
				'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),				
				'MINLEN': min_read_len_post_trim
			}
			method = 'PE'		
			input_files = processing.run_trimmomatic(pair_of_files, folder_path, method, phred_encode, trimming_parameters)
		else:
			input_files = pair_of_files

		# Stitch R1-R2 files
		pairing_parameters = {
			'v': min_overlap_length,
			'm': max_assembly_length,
			'n': min_assembly_length,
			'u': max_fraction_uncalled,					
		}
		print('Stitching R1-R2 reads')
		pear_results = processing.run_pear(input_files[0], input_files[1], working_directory=folder_path, parameters=pairing_parameters, num_threads=number_threads, memory=pear_memory)[0]		
		# Run quality filtering
		filtered_file = fastx.Run_Quality_Filter(pear_results, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases)		
		os.remove(pear_results)
		processed_files.append(filtered_file)
	
	print('Annotating processed fastq files')
	annotated_files = []
	for i, f in enumerate(processed_files):
		output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
		output_file_annotation = useful.removeFileExtension(f) + '.mixcr.annotation'
		# Run MIXCR file
		print('Running MIXCR')
		[annotated_f, command_val] = mixcr.RunMixcr(f, output_file, filetype='FASTQ', loci=[], species='', exportPrettyAlignment=False, num_threads=number_threads)
		# Parse MIXCR file
		print('Parsing MIXCR')
		annotated_file = mixcr.parseMIXCR(f, output_file, 'FASTQ', output_file_annotation, command_val=command_val)  # again, annotated_file should be equal to outfile_annotation
		annotated_files.append(annotated_file[0])
	print('Pipeline complete')
Example #6
def run_gglab_pipeline(input_files, species, loci, group_name=''):
	# Unzip files
	print('Processing raw fastq files')
	annotated_files = []
	for i, f in enumerate(input_files):
		folder_path = os.path.dirname(f)
		if f.endswith('.gz'):
			print('Unzipping: ', f)
			f = useful.gunzip_python(f)
		annotated_f = igfft.igfft_multiprocess(f, file_type='FASTQ', species=species, locus=loci, parsing_settings={'isotype': isotyping_barcodes, 'remove_insertions': remove_insertions}, num_processes=number_threads, delete_alignment_file=True)			
		annotated_files.append(annotated_f[0])
	output_file_list = ','.join(annotated_files)
	print(output_file_list)
	return output_file_list
Example #7
def run_gglab_pipeline(input_files, species, loci, group_name=''):
	# Unzip files
	print('Processing raw fastq files')
	processed_files = []
	
	for pair_of_files in input_files:		
		folder_path = os.path.dirname(pair_of_files[0])
		for i, f in enumerate(pair_of_files):		
			if f.endswith('.gz'):
				print('Unzipping: ', f)
				pair_of_files[i] = useful.gunzip_python(f)

		# Run trimmomatic
		if trim_seqs:
			print('Trimming low quality bases')
			trimming_parameters = {
				'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),				
				'MINLEN': min_read_len_post_trim
			}
			method = 'PE'		
			input_files = processing.run_trimmomatic(pair_of_files, folder_path, method, phred_encode, trimming_parameters)
		else:
			input_files = pair_of_files

		# Stitch R1-R2 files
		pairing_parameters = {
			'v': min_overlap_length,
			'm': max_assembly_length,
			'n': min_assembly_length,
			'u': max_fraction_uncalled,					
		}
		print('Stitching R1-R2 reads')
		pear_results = processing.run_pear(input_files[0], input_files[1], working_directory=folder_path, parameters=pairing_parameters, num_threads=number_threads, memory=pear_memory)[0]		
		# Run quality filtering
		filtered_file = fastx.Run_Quality_Filter(pear_results, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases)		
		os.remove(pear_results)
		processed_files.append(filtered_file)	
	
	print('Annotating processed fastq files')
	annotated_files = []
	for i, f in enumerate(processed_files):
		annotated_f = igfft.igfft_multiprocess(f, species=species, locus=loci, parsing_settings={'isotype': isotyping_barcodes, 'remove_insertions': remove_insertions}, num_processes=number_threads, delete_alignment_file=True)			
		annotated_files.append(annotated_f[0])
	print('Pipeline complete')
Example #8
def run_gglab_pipeline(input_files, species, loci, group_name=""):
    # Unzip files
    print ("Processing raw fastq files")
    annotated_files = []
    for i, f in enumerate(input_files):
        folder_path = os.path.dirname(f)
        if f.endswith(".gz"):
            print ("Unzipping: ", f)
            f = useful.gunzip_python(f)
        annotated_f = igfft.igfft_multiprocess(
            f,
            file_type="FASTQ",
            species=species,
            locus=loci,
            parsing_settings={"isotype": isotyping_barcodes, "remove_insertions": remove_insertions},
            num_processes=number_threads,
            delete_alignment_file=True,
        )
        annotated_files.append(annotated_f[0])
    output_file_list = ",".join(annotated_files)
    print(output_file_list)
    return output_file_list
def Run_Quality_Filter(files, output_dir, quality, percent, encoding='-Q33'):
		
	if not type(files) is list:
		files = [files]
			
	for i, each_file in enumerate(files):
		if each_file.endswith('.gz'):
			print "Unzipping file: {0}...".format(each_file)
			files[i] = useful.gunzip_python(each_file)				
			print "Unzipping complete"
				
	
	file_list = 'cat '+' '.join(['"'+f+'"' for f in files]) +' | '
	
	outfile = os.path.join(output_dir, os.path.basename(files[0]).replace('.fastq', '')) + '.filtered.{0}.fastq'.format('q' + str(quality) + 'p' + str(percent))	

	print "Running filtering..."
	subprocess.check_output('{3} {5} -v {4} -o "{0}" -q {1} -p {2}'.format(outfile,str(quality),str(percent),file_list,encoding, fastq_quality_filter_location), shell=True)
	print "filtering complete"

	
	return outfile
def run_gglab_pipeline(input_files, species, loci, group_name=''):
	# Unzip files
	print('Processing raw fastq files')
	processed_files = []
	for i, f in enumerate(input_files):
		folder_path = os.path.dirname(f)
		if f.endswith('.gz'):
			print('Unzipping: ', f)
			f = useful.gunzip_python(f)

		# Run trimmomatic
		trimming_parameters = {
			'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),				
			'MINLEN': min_read_len_post_trim
		}
		method = 'SE'		
		trimmedf = processing.run_trimmomatic(f, folder_path, method, phred_encode, trimming_parameters)[0]		
		# Run quality filtering
		filtered_trimmed_file = fastx.Run_Quality_Filter(trimmedf, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases)		
		os.remove(trimmedf)
		processed_files.append(filtered_trimmed_file)
	
	print('Annotating processed fastq files')
	annotated_files = []
	for i, f in enumerate(processed_files):
		output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
		output_file_annotation = useful.removeFileExtension(f) + '.mixcr.annotation'
		# Run MIXCR file
		print('Running MIXCR')
		[annotated_f, command_val] = mixcr.RunMixcr(f, output_file, filetype='FASTQ', loci=[], species='', exportPrettyAlignment=False, num_threads=number_threads)
		# Parse MIXCR file
		print('Parsing MIXCR')
		annotated_file = mixcr.parseMIXCR(f, output_file, 'FASTQ', output_file_annotation, command_val=command_val)  # again, annotated_file should be equal to outfile_annotation
		annotated_files.append(annotated_file)	
	print('Pairing sequences')	
	output_dir = os.path.dirname(annotated_files[0])
	pairing.RunPairing(annotated_files, annotated_file_formats='TAB', analysis_method='MIXCR', output_folder_path=output_dir, prefix_output_files=group_name, cluster_cutoff=cluster_setting, annotation_cluster_setting=annotation_cluster_cutoff)
	print('Pipeline complete')
Example #11
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    # Unzip files
    print('Processing raw fastq files')
    processed_files = []

    for pair_of_files in input_files:
        folder_path = os.path.dirname(pair_of_files[0])
        for i, f in enumerate(pair_of_files):
            if f.endswith('.gz'):
                print('Unzipping: ', f)
                pair_of_files[i] = useful.gunzip_python(f)

        # Run trimmomatic
        if trim_seqs:
            print('Trimming low quality bases')
            trimming_parameters = {
                'SLIDINGWINDOW':
                str(window_trim) + ':' + str(quality_cutoff_trim),
                'MINLEN': min_read_len_post_trim
            }
            method = 'PE'
            input_files = processing.run_trimmomatic(pair_of_files,
                                                     folder_path, method,
                                                     phred_encode,
                                                     trimming_parameters)
        else:
            input_files = pair_of_files

        # Stitch R1-R2 files
        pairing_parameters = {
            'v': min_overlap_length,
            'm': max_assembly_length,
            'n': min_assembly_length,
            'u': max_fraction_uncalled,
        }
        print('Stitching R1-R2 reads')
        pear_results = processing.run_pear(input_files[0],
                                           input_files[1],
                                           working_directory=folder_path,
                                           parameters=pairing_parameters,
                                           num_threads=number_threads,
                                           memory=pear_memory)[0]
        # Run quality filtering
        filtered_file = fastx.Run_Quality_Filter(pear_results,
                                                 output_dir=folder_path,
                                                 quality=quality_cutoff,
                                                 percent=percent_bases)
        os.remove(pear_results)
        processed_files.append(filtered_file)

    print('Annotating processed fastq files')
    annotated_files = []
    for i, f in enumerate(processed_files):
        annotated_f = igfft.igfft_multiprocess(f,
                                               species=species,
                                               locus=loci,
                                               parsing_settings={
                                                   'isotype':
                                                   isotyping_barcodes,
                                                   'remove_insertions':
                                                   remove_insertions
                                               },
                                               num_processes=number_threads,
                                               delete_alignment_file=True)
        annotated_files.append(annotated_f[0])
    print('Pipeline complete')
def Run_FASTX_Barcode_Splitter(files,output_dir,settings={'orientation':'bol'},search_reverse_complement=True):	
	
	parameters = copy.deepcopy(settings)
	if 'orientation' not in parameters:
		raise Exception('"Orientation" is required in the parameters field')
	
	if not type(files) is list:
		files = [files]
	
	for i,each_file in enumerate(files):
		if each_file.endswith('.gz'):
			print "Unzipping file: {0}...".format(each_file)
			files[i] = useful.gunzip_python(each_file)		
			files[i] = each_file[:-3]			
			print "Unzipping complete"		
	
	suffix = parameters.pop('suffix') if 'suffix' in parameters else ''
	
	barcode_splitter_command = 'cat '+' '.join(files)+' | '
	
	if output_dir[-1] == '/':
		output_dir = output_dir[:-1]
	
		
	if 'prefix' in parameters and parameters['prefix'] != '':
		prefix = output_dir+'/'+parameters['prefix'] 
	else:
	 	prefix = output_dir+'/'	
	
	parameters.pop('prefix',None)
	
	additional_folders = os.path.dirname(prefix)
		
	if not os.path.isdir(additional_folders):
		os.mkdir(additional_folders)		
		
	orientation = parameters.pop('orientation',None)
	
	barcode_splitter_command += 'fastx_barcode_splitter.pl '
	for p in parameters:
		barcode_splitter_command+='--{0} {1} '.format(p,parameters[p])
	
	barcode_splitter_command +='--prefix '+prefix+ ' --suffix '+suffix+' --'+orientation
	
	output = useful.get_stdout(barcode_splitter_command).rstrip(' \n').split('\n')#output = subprocess.check_output(barcode_splitter_command,shell=True).rstrip(' \n').split('\n')

	if output[0].lower().startswith('error'):
		raise Exception("Error found in barcode split program: "+output[0])
	
	
	result = {'barcodes':defaultdict(int)}
	for line in output[1:-2]:
		line = line.split('\t')
		result['barcodes'][line[2]] = int(line[1])
	result['total'] = int(output[-1].split('\t')[1])
	result['unmatched'] = int(output[-2].split('\t')[1])
	
	
	if search_reverse_complement:		
		initial_file = []
		new_file = []
		map_barcode_to_file = {}
		with open(parameters['bcfile']) as file:
			lines=file.readlines()
			new_bcfile=open(settings['bcfile']+'rc','w')
			for l in lines:
				c = l.split('\t')
				initial_file.append(c[0].strip())
				new_file.append(c[0].strip()+'rev')
				map_barcode_to_file[c[0].strip()+'rev'] = prefix+c[0].strip()+suffix
				new_bcfile.write(c[0].strip()+'rev'+'\t'+Reverse_Complement(c[1].strip())+'\n')
			new_bcfile.close()
		
		shutil.copyfile(prefix + 'unmatched' + suffix, prefix + 'unmatched' + suffix + '.temp')		
		files = [prefix + 'unmatched' + suffix + '.temp']
		parameters['bcfile'] +='rc'
		
		if orientation == 'eol':
			orientation='bol'
		elif orientation == 'bol':
			orientation= 'eol'
			
		barcode_splitter_command = 'cat "'+' '.join(files)+'" | '

		barcode_splitter_command += barcode_split_perl_script
		for p in parameters:
			barcode_splitter_command+='--{0} {1} '.format(p,parameters[p])
		
		barcode_splitter_command += '--prefix ' + prefix + ' --suffix ' + suffix + ' --' + orientation
		
		output = useful.get_stdout(barcode_splitter_command).rstrip(' \n').split('\n')  # subprocess.check_output(barcode_splitter_command,shell=True).rstrip(' \n').split('\n')
		
		for i, line in enumerate(output[1:-2]):
			line = line.split('\t')
			result['barcodes'][map_barcode_to_file[line[0].strip()]] += int(line[1])
		
		result['unmatched'] = int(output[-2].split('\t')[1])
				
		cleanup_command = ''
		for i, each_bc_file in enumerate(initial_file):
			cleanup_command += "mv '{0}{1}{3}' '{0}{1}{3}.temp';cat '{0}{1}{3}.temp' '{0}{2}{3}' > '{0}{1}{3}'; rm '{0}{1}{3}.temp';rm '{0}{2}{3}';".format(prefix, each_bc_file, new_file[i], suffix)				
		cleanup_command += "rm '{0}{1}'; ".format(prefix, 'unmatched' + suffix + '.temp')
		subprocess.call(cleanup_command, shell=True)
		
		
	return result
def run_trimmomatic(files,
                    output_directory=None,
                    method='SE',
                    phred=None,
                    optional_parameters={}):
    '''
		Wrapper function for running trimmomatic program within python
		Trimmomatic will remove low quality bases from the ends of NGS reads using an average quality score in a given window size
		
		Parameters
		----------
		files : string or list of strings
			List of input filenames (fastq or fastq.gz) for the MISEQ files. We either accept a single string or a list of two strings.
		output_directory : string, default None
			Pathname of the desired output directory; defaults to the parent directory of the first input file
		method : SE or PE, default 'SE'
			String representing whether to treat input files as single (SE) or paired-end files (PE)
		phred : integer, default None
			If None, then will rely on trimmomatic to guess the quality encoding. If a number, then will pass this value into the phred field.
		optional_parameters : dict, default empty parameters
			An optional dict of all parameters you would like to pass to trimmomatic 
			http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf		
	'''

    method = method.upper()
    if method not in ['SE', 'PE']:
        raise Exception(
            'Incorrect value provided for parameter "method". Provided value: '
            + method)

    if not isinstance(files, list):
        files = [files]
    if len(files) > 2:
        raise Exception(
            str(len(files)) +
            ' total files have been passed to the function. We only accept 1 or 2 filepaths representing the R1/R2 reads'
        )

    for i, f in enumerate(files):
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            files[i] = useful.gunzip_python(f)

    output_directory = useful.get_parent_dir(
        files[0]) if not output_directory else os.path.abspath(
            output_directory)
    return_file_names = []
    command_loops = []

    if method == 'SE':
        # Trim each file at a time
        for f in files:
            input_file_names = []
            output_file_names = []
            input_file_names.append('"' + f + '"')
            out = f[:-6] if f.endswith('.fastq') else f
            output_file_names.extend(['"' + out + '.trimmed.fastq"'])
            return_file_names.append(out + '.trimmed.fastq')
            command_loops.append([input_file_names, output_file_names])
    else:
        input_file_names = []
        output_file_names = []
        # trim all files simultaneously
        for f in files:
            input_file_names.append('"' + f + '"')
            out = f[:-6] if f.endswith('.fastq') else f
            output_file_names.extend([
                '"' + out + '.trimmed.fastq"',
                '"' + out + '.trimmed.unpaired.fastq"'
            ])
            return_file_names.append(out + '.trimmed.fastq')
        command_loops.append([input_file_names, output_file_names])
    phred_var = '-phred' + str(phred) if phred else ''

    # We should change the java folder to recognize /usr/local/bin...
    for loops in command_loops:
        inputs = loops[0]
        outputs = loops[1]
        trim_command = 'java -jar {5} {0} {4} -threads 2 {1} {2} {3}'.format(
            method, ' '.join(inputs), ' '.join(outputs), ' '.join([
                key + ':' + str(value)
                for key, value in optional_parameters.iteritems()
            ]), phred_var, trimmomatic_location)
        worked = subprocess.call(trim_command, shell=True)
        if worked > 0:
            raise Exception('Trimmomatic failed')
    return return_file_names
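A minimal single-end usage sketch for the wrapper above; the input filename, output directory, and cutoff values are hypothetical placeholders, and Trimmomatic must already be available at whatever path `trimmomatic_location` points to in the calling module.
# Hypothetical SE call: trim with a 4-base sliding window at an average quality of Q20
# and drop reads shorter than 100 bp, mirroring how the pipeline builds trimming_parameters.
example_trimming_parameters = {
    'SLIDINGWINDOW': '4:20',
    'MINLEN': 100
}
trimmed_files = run_trimmomatic('sample_R1.fastq',
                                output_directory='trimmed_output',
                                method='SE',
                                phred=33,
                                optional_parameters=example_trimming_parameters)
# On success, trimmed_files would be ['sample_R1.trimmed.fastq']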
Example #14
def run_gglab_pipeline(input_files, species, loci, group_name=""):
    # Unzip files
    print("Processing raw fastq files")
    processed_files = []

    for pair_of_files in input_files:
        folder_path = os.path.dirname(pair_of_files[0])
        for i, f in enumerate(pair_of_files):
            if f.endswith(".gz"):
                print("Unzipping: ", f)
                pair_of_files[i] = useful.gunzip_python(f)

        # Run trimmomatic
        if trim_seqs:
            print("Trimming low quality bases")
            trimming_parameters = {
                "SLIDINGWINDOW": str(window_trim) + ":" + str(quality_cutoff_trim),
                "MINLEN": min_read_len_post_trim,
            }
            method = "PE"
            input_files = processing.run_trimmomatic(
                pair_of_files, folder_path, method, phred_encode, trimming_parameters
            )
        else:
            input_files = pair_of_files

        # Stitch R1-R2 files
        pairing_parameters = {
            "v": min_overlap_length,
            "m": max_assembly_length,
            "n": min_assembly_length,
            "u": max_fraction_uncalled,
        }
        print("Stitching R1-R2 reads")
        pear_results = processing.run_pear(
            input_files[0],
            input_files[1],
            working_directory=folder_path,
            parameters=pairing_parameters,
            num_threads=number_threads,
            memory=pear_memory,
        )[0]
        # Run quality filtering
        filtered_file = fastx.Run_Quality_Filter(
            pear_results, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases
        )
        os.remove(pear_results)
        processed_files.append(filtered_file)

    print("Annotating processed fastq files")
    annotated_files = []
    for i, f in enumerate(processed_files):
        output_file = useful.removeFileExtension(f) + ".mixcr.alignment"
        output_file_annotation = useful.removeFileExtension(f) + ".mixcr.annotation"
        # Run MIXCR file
        print("Running MIXCR")
        [annotated_f, command_val] = mixcr.RunMixcr(
            f,
            output_file,
            filetype="FASTQ",
            loci=[],
            species="",
            exportPrettyAlignment=False,
            num_threads=number_threads,
        )
        # Parse MIXCR file
        print("Parsing MIXCR")
        annotated_file = mixcr.parseMIXCR(
            f, output_file, "FASTQ", output_file_annotation, command_val=command_val
        )  # again, annotated_file should be equal to outfile_annotation
        annotated_files.append(annotated_file[0])
    print("Pipeline complete")
def run_flash(r1file,
              r2file,
              working_directory,
              outfile='',
              parameters={},
              suffix=''):
    r1_path = useful.get_parent_dir(r1file)  # '/'.join(r1file.split('/')[:-1])
    r2_path = useful.get_parent_dir(r2file)  # '/'.join(r2file.split('/')[:-1])

    if not parameters:
        print "PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400"
        parameters = {'r': 300, 'f': 400}

    if r1file.endswith('.gz'):
        print "Unzipping R1 File.."
        r1file = useful.gunzip_python(r1file)

    if r2file.endswith('.gz'):
        print "Unzipping R2 File.."
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    if r1_path != working_directory:
        os.rename(r1file,
                  os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file,
                  os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                r_pos = subs.index("_R1")
                outfile[p] = subs[:r_pos]
                break
            elif '_R2' in subs:
                r_pos = subs.index("_R2")
                outfile[p] = subs[:r_pos]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)

    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile += '.flashed' + suffix

    if os.path.isfile(os.path.join(working_directory,
                                   outfile)):  # in resulting_files:
        print(
            'WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'
            .format(working_directory + '/' + outfile))

    r1file = os.path.join(working_directory, os.path.basename(
        r1file))  # working_directory+'/'+os.path.basename(r1file)
    r2file = os.path.join(working_directory, os.path.basename(
        r2file))  # working_directory+'/'+os.path.basename(r2file)

    flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)

    parameters['o'] = outfile
    parameters['d'] = working_directory

    for p, val in parameters.iteritems():
        flash_command += ' -{0} {1}'.format(p, str(val))

    flash_command += ' -q'  # run in quiet mode
    # os.system(flash_command)
    worked = subprocess.call(flash_command, shell=True)
    if worked > 0:
        raise Exception('Flash failed')
    os.rename(
        os.path.join(working_directory, outfile + '.extendedFrags.fastq'),
        os.path.join(working_directory, outfile))

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))

    try:
        read_count_flashed_file = useful.file_line_count(
            os.path.join(working_directory, outfile))
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))
    resulting_counts = (os.path.join(working_directory,
                                     outfile), read_count_flashed_file / 4,
                        read_count_r1_file / 4, float(100) *
                        (read_count_flashed_file / float(read_count_r1_file)))

    return resulting_counts
def run_flash(r1file, r2file, working_directory, outfile='', parameters={}, suffix=''):
	r1_path = useful.get_parent_dir(r1file)  # '/'.join(r1file.split('/')[:-1])	
	r2_path = useful.get_parent_dir(r2file)  # '/'.join(r2file.split('/')[:-1])
	
	if not parameters:
		print "PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400"
		parameters = {'r': 300, 'f': 400}
	
	if r1file.endswith('.gz'):
		print "Unzipping R1 File.."				
		r1file = useful.gunzip_python(r1file)
	
	if r2file.endswith('.gz'):		
		print "Unzipping R2 File.."
		r2file = useful.gunzip_python(r2file)
		
	working_directory = os.path.abspath(working_directory)
	if r1_path != working_directory:
		os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))		
	if r2_path != working_directory:	
		os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))
		
	if outfile == '':		
		outfile = os.path.basename(r1file).split('.')					
		for p, subs in enumerate(outfile):
			if '_R1' in subs:
				r_pos = subs.index("_R1")
				outfile[p] = subs[:r_pos]				
				break
			elif '_R2' in subs:
				r_pos = subs.index("_R2")
				outfile[p] = subs[:r_pos]				
				break
		outfile = '.'.join(outfile)
	else:		
		outfile = os.path.basename(outfile)
		
	outfile = outfile.replace('.fastq', '').replace('.fasta', '')
	outfile += '.flashed' + suffix		
			
	if os.path.isfile(os.path.join(working_directory, outfile)):  # in resulting_files:		
		print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))
							
	r1file = os.path.join(working_directory, os.path.basename(r1file))  # working_directory+'/'+os.path.basename(r1file)
	r2file = os.path.join(working_directory, os.path.basename(r2file))  # working_directory+'/'+os.path.basename(r2file)

	flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)
	
	parameters['o'] = outfile
	parameters['d'] = working_directory

	for p, val in parameters.iteritems():
		flash_command += ' -{0} {1}'.format(p, str(val))
	
	flash_command += ' -q'  # run in quiet mode
	# os.system(flash_command)
	worked = subprocess.call(flash_command, shell=True)
	if worked > 0:
		raise Exception('Flash failed')
	os.rename(os.path.join(working_directory, outfile + '.extendedFrags.fastq'), os.path.join(working_directory, outfile))
	
	try:
		read_count_r1_file = useful.file_line_count(r1file)
	except Exception as e:
		read_count_r1_file = 1
		print("Could not get number of lines in read file: " + str(e))
	
	try:
		read_count_flashed_file = useful.file_line_count(os.path.join(working_directory, outfile))
	except Exception as e:
		read_count_flashed_file = 1
		print("Could not get number of lines in outfile read file: " + str(e))
	resulting_counts = (
		os.path.join(working_directory, outfile),
		read_count_flashed_file / 4,
		read_count_r1_file / 4,
		float(100) * (read_count_flashed_file / float(read_count_r1_file))
	)
	
	return resulting_counts
def run_pear(r1file, r2file, working_directory, outfile='', parameters={}, suffix='', num_threads=1, memory='1G'):
	r1_path = useful.get_parent_dir(r1file)
	r2_path = useful.get_parent_dir(r2file)

	if r1file.endswith('.gz'):
		print("Unzipping R1 File..")
		r1file = useful.gunzip_python(r1file)
	
	if r2file.endswith('.gz'):		
		print("Unzipping R2 File..")
		r2file = useful.gunzip_python(r2file)
				
	working_directory = os.path.abspath(working_directory)
	if r1_path != working_directory:
		os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))		
	if r2_path != working_directory:	
		os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))		
		
	if outfile == '':		
		outfile = os.path.basename(r1file).split('.')					
		for p, subs in enumerate(outfile):
			if '_R1' in subs:
				r_pos = subs.index("_R1")
				outfile[p] = subs[:r_pos]				
				break
			elif '_R2' in subs:
				r_pos = subs.index("_R2")
				outfile[p] = subs[:r_pos]				
				break
		outfile = '.'.join(outfile)
	else:		
		outfile = os.path.basename(outfile)
	
	outfile = outfile.replace('.fastq', '').replace('.fasta', '')
	
	outfile = os.path.join(working_directory, outfile)
	if os.path.isfile(os.path.join(working_directory, outfile)):  # in resulting_files:
		print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))
							
	r1file = os.path.join(working_directory, os.path.basename(r1file))
	r2file = os.path.join(working_directory, os.path.basename(r2file))

	pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)
	
	parameters['o'] = outfile
	parameters['y'] = memory
	parameters['j'] = num_threads
	
	for p, val in parameters.iteritems():
		pear_command += ' -{0} {1}'.format(p, str(val))
			
	worked = subprocess.call(pear_command, shell=True)
	
	if worked > 0:
		raise Exception('Error in pear program')
	
	try:
		read_count_r1_file = useful.file_line_count(r1file)
	except Exception as e:
		read_count_r1_file = 1
		print("Could not get number of lines in read file: " + str(e))
	
	try:
		read_count_flashed_file = useful.file_line_count(outfile + '.assembled.fastq')
	except Exception as e:
		read_count_flashed_file = 1
		print("Could not get number of lines in outfile read file: " + str(e))

	resulting_counts = (
		outfile + '.assembled.fastq',
		read_count_flashed_file / 4,
		read_count_r1_file / 4,
		float(100) * (read_count_flashed_file / float(read_count_r1_file))
	)
	
	return resulting_counts
def run_trimmomatic(files, output_directory=None, method='SE', phred=None, optional_parameters={}):
	'''
		Wrapper function for running trimmomatic program within python
		Trimmomatic will remove low quality bases from the ends of NGS reads using an average quality score in a given window size
		
		Parameters
		----------
		files : string or list of strings
			List of input filenames (fastq or fastq.gz) for the MISEQ files. We either accept a single string or a list of two strings.
		output_directory : string, default None
			Pathname of the desired output directory; defaults to the parent directory of the first input file
		method : SE or PE, default 'SE'
			String representing whether to treat input files as single (SE) or paired-end files (PE)
		phred : integer, default None
			If None, then will rely on trimmomatic to guess the quality encoding. If a number, then will pass this value into the phred field.
		optional_parameters : dict, default empty parameters
			An optional dict of all parameters you would like to pass to trimmomatic 
			http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf		
	'''
	
	method = method.upper()
	if method not in ['SE', 'PE']:
		raise Exception('Incorrect value provided for parameter "method". Provided value: ' + method)
	
	if not isinstance(files, list):
		files = [files]
	if len(files) > 2:
		raise Exception(str(len(files)) + ' total files have been passed to the function. We only accept 1 or 2 filepaths representing the R1/R2 reads')
	
	for i, f in enumerate(files):
		if f.endswith('.gz'):
			print('Unzipping: ', f)
			files[i] = useful.gunzip_python(f)		
	
	output_directory = useful.get_parent_dir(files[0]) if not output_directory else os.path.abspath(output_directory)		
	return_file_names = []	
	command_loops = []
	
	if method == 'SE':
		# Trim each file at a time
		for f in files:
			input_file_names = []
			output_file_names = []
			input_file_names.append('"' + f + '"')			
			out = f[:-6] if f.endswith('.fastq') else f
			output_file_names.extend(['"' + out + '.trimmed.fastq"'])
			return_file_names.append(out + '.trimmed.fastq')
			command_loops.append([input_file_names, output_file_names])
	else:
		input_file_names = []
		output_file_names = []
		# trim all files simultaneously		
		for f in files:
			input_file_names.append('"' + f + '"')			
			out = f[:-6] if f.endswith('.fastq') else f
			output_file_names.extend(['"' + out + '.trimmed.fastq"', '"' + out + '.trimmed.unpaired.fastq"'])
			return_file_names.append(out + '.trimmed.fastq')
		command_loops.append([input_file_names, output_file_names])
	phred_var = '-phred' + str(phred) if phred else ''	
	
	# We should change the java folder to recognize /usr/local/bin...	
	for loops in command_loops:
		inputs = loops[0]
		outputs = loops[1]
		trim_command = 'java -jar {5} {0} {4} -threads 2 {1} {2} {3}'.format(method, ' '.join(inputs), ' '.join(outputs), ' '.join([key + ':' + str(value) for key, value in optional_parameters.iteritems()]), phred_var, trimmomatic_location)		
		worked = subprocess.call(trim_command, shell=True)
		if worked > 0:
			raise Exception('Trimmomatic failed')
	return return_file_names
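A minimal paired-end sketch under the same assumptions (hypothetical R1/R2 filenames, Trimmomatic installed at `trimmomatic_location`); in 'PE' mode both mates are trimmed in a single Trimmomatic invocation and the surviving paired reads are returned as two '.trimmed.fastq' paths.
# Hypothetical PE call using the same sliding-window settings the pipeline applies,
# here run on an R1/R2 pair together.
pe_trimmed = run_trimmomatic(['sample_R1.fastq', 'sample_R2.fastq'],
                             output_directory='trimmed_output',
                             method='PE',
                             phred=33,
                             optional_parameters={'SLIDINGWINDOW': '4:20',
                                                  'MINLEN': 100})
# On success, pe_trimmed would be ['sample_R1.trimmed.fastq', 'sample_R2.trimmed.fastq']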
def run_pear(r1file,
             r2file,
             working_directory,
             outfile='',
             parameters={},
             suffix='',
             num_threads=1,
             memory='1G'):
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)

    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)

    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    if r1_path != working_directory:
        os.rename(r1file,
                  os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file,
                  os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                r_pos = subs.index("_R1")
                outfile[p] = subs[:r_pos]
                break
            elif '_R2' in subs:
                r_pos = subs.index("_R2")
                outfile[p] = subs[:r_pos]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)

    outfile = outfile.replace('.fastq', '').replace('.fasta', '')

    outfile = os.path.join(working_directory, outfile)
    if os.path.isfile(os.path.join(working_directory,
                                   outfile)):  # in resulting_files:
        print(
            'WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'
            .format(working_directory + '/' + outfile))

    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))

    pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)

    parameters['o'] = outfile
    parameters['y'] = memory
    parameters['j'] = num_threads

    for p, val in parameters.iteritems():
        pear_command += ' -{0} {1}'.format(p, str(val))

    worked = subprocess.call(pear_command, shell=True)

    if worked > 0:
        raise Exception('Error in pear program')

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))

    try:
        read_count_flashed_file = useful.file_line_count(outfile +
                                                         '.assembled.fastq')
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))

    resulting_counts = (outfile + '.assembled.fastq',
                        read_count_flashed_file / 4, read_count_r1_file / 4,
                        float(100) *
                        (read_count_flashed_file / float(read_count_r1_file)))

    return resulting_counts
def Run_FASTX_Barcode_Splitter(files,
                               output_dir,
                               settings={'orientation': 'bol'},
                               search_reverse_complement=True):

    parameters = copy.deepcopy(settings)
    if 'orientation' not in parameters:
        raise Exception('"Orientation" is required in the parameters field')

    if not type(files) is list:
        files = [files]

    for i, each_file in enumerate(files):
        if each_file.endswith('.gz'):
            print "Unzipping file: {0}...".format(each_file)
            files[i] = useful.gunzip_python(each_file)
            files[i] = each_file[:-3]
            print "Unzipping complete"

    suffix = parameters.pop('suffix') if 'suffix' in parameters else ''

    barcode_splitter_command = 'cat ' + ' '.join(files) + ' | '

    if output_dir[-1] == '/':
        output_dir = output_dir[:-1]

    if 'prefix' in parameters and parameters['prefix'] != '':
        prefix = output_dir + '/' + parameters['prefix']
    else:
        prefix = output_dir + '/'

    parameters.pop('prefix', None)

    additional_folders = os.path.dirname(prefix)

    if not os.path.isdir(additional_folders):
        os.mkdir(additional_folders)

    orientation = parameters.pop('orientation', None)

    barcode_splitter_command += 'fastx_barcode_splitter.pl '
    for p in parameters:
        barcode_splitter_command += '--{0} {1} '.format(p, parameters[p])

    barcode_splitter_command += '--prefix ' + prefix + ' --suffix ' + suffix + ' --' + orientation

    output = useful.get_stdout(barcode_splitter_command).rstrip(' \n').split(
        '\n'
    )  #output = subprocess.check_output(barcode_splitter_command,shell=True).rstrip(' \n').split('\n')

    if output[0].lower().startswith('error'):
        raise Exception("Error found in barcode split program: " + output[0])

    result = {'barcodes': defaultdict(int)}
    for line in output[1:-2]:
        line = line.split('\t')
        result['barcodes'][line[2]] = int(line[1])
    result['total'] = int(output[-1].split('\t')[1])
    result['unmatched'] = int(output[-2].split('\t')[1])

    if search_reverse_complement:
        initial_file = []
        new_file = []
        map_barcode_to_file = {}
        with open(parameters['bcfile']) as file:
            lines = file.readlines()
            new_bcfile = open(settings['bcfile'] + 'rc', 'w')
            for l in lines:
                c = l.split('\t')
                initial_file.append(c[0].strip())
                new_file.append(c[0].strip() + 'rev')
                map_barcode_to_file[c[0].strip() +
                                    'rev'] = prefix + c[0].strip() + suffix
                new_bcfile.write(c[0].strip() + 'rev' + '\t' +
                                 Reverse_Complement(c[1].strip()) + '\n')
            new_bcfile.close()

        shutil.copyfile(prefix + 'unmatched' + suffix,
                        prefix + 'unmatched' + suffix + '.temp')
        files = [prefix + 'unmatched' + suffix + '.temp']
        parameters['bcfile'] += 'rc'

        if orientation == 'eol':
            orientation = 'bol'
        elif orientation == 'bol':
            orientation = 'eol'

        barcode_splitter_command = 'cat "' + ' '.join(files) + '" | '

        barcode_splitter_command += barcode_split_perl_script
        for p in parameters:
            barcode_splitter_command += '--{0} {1} '.format(p, parameters[p])

        barcode_splitter_command += '--prefix ' + prefix + ' --suffix ' + suffix + ' --' + orientation

        output = useful.get_stdout(barcode_splitter_command).rstrip(
            ' \n'
        ).split(
            '\n'
        )  # subprocess.check_output(barcode_splitter_command,shell=True).rstrip(' \n').split('\n')

        for i, line in enumerate(output[1:-2]):
            line = line.split('\t')
            result['barcodes'][map_barcode_to_file[line[0].strip()]] += int(
                line[1])

        result['unmatched'] = int(output[-2].split('\t')[1])

        cleanup_command = ''
        for i, each_bc_file in enumerate(initial_file):
            cleanup_command += "mv '{0}{1}{3}' '{0}{1}{3}.temp';cat '{0}{1}{3}.temp' '{0}{2}{3}' > '{0}{1}{3}'; rm '{0}{1}{3}.temp';rm '{0}{2}{3}';".format(
                prefix, each_bc_file, new_file[i], suffix)
        cleanup_command += "rm '{0}{1}'; ".format(
            prefix, 'unmatched' + suffix + '.temp')
        subprocess.call(cleanup_command, shell=True)

    return result
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    # Unzip files
    print('Processing raw fastq files')
    processed_files = []
    for i, f in enumerate(input_files):
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)

        # Run trimmomatic
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        method = 'SE'
        trimmedf = processing.run_trimmomatic(f, folder_path, method,
                                              phred_encode,
                                              trimming_parameters)[0]
        # Run quality filtering
        filtered_trimmed_file = fastx.Run_Quality_Filter(
            trimmedf,
            output_dir=folder_path,
            quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmedf)
        processed_files.append(filtered_trimmed_file)

    print('Annotating processed fastq files')
    annotated_files = []
    for i, f in enumerate(processed_files):
        output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
        output_file_annotation = useful.removeFileExtension(
            f) + '.mixcr.annotation'
        # Run MIXCR file
        print('Running MIXCR')
        [annotated_f,
         command_val] = mixcr.RunMixcr(f,
                                       output_file,
                                       filetype='FASTQ',
                                       loci=[],
                                       species='',
                                       exportPrettyAlignment=False,
                                       num_threads=number_threads)
        # Parse MIXCR file
        print('Parsing MIXCR')
        annotated_file = mixcr.parseMIXCR(
            f,
            output_file,
            'FASTQ',
            output_file_annotation,
            command_val=command_val
        )  # again, annotated_file should be equal to outfile_annotation
        annotated_files.append(annotated_file)
    print('Pairing sequences')
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(annotated_files,
                       annotated_file_formats='TAB',
                       analysis_method='MIXCR',
                       output_folder_path=output_dir,
                       prefix_output_files=group_name,
                       cluster_cutoff=cluster_setting,
                       annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')