Example 1
def run_gglab_pipeline(input_files, species, loci, group_name=''):
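	"""Paired-end GGLab preprocessing and annotation pipeline.

	Summary inferred from the body below: input_files is a list of
	[R1, R2] fastq(.gz) path pairs. Each pair is gunzipped if needed,
	optionally trimmed (Trimmomatic PE, gated by the module-level
	trim_seqs flag), stitched with PEAR, quality filtered, then aligned
	and parsed with MiXCR. Module-level settings such as window_trim,
	quality_cutoff and number_threads must be defined elsewhere.
	"""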
	# Unzip files
	print('Processing raw fastq files')
	processed_files = []
	
	for pair_of_files in input_files:		
		folder_path = os.path.dirname(pair_of_files[0])
		for i, f in enumerate(pair_of_files):		
			if f.endswith('.gz'):
				print('Unzipping: ', f)
				pair_of_files[i] = useful.gunzip_python(f)

		# Run trimmomatic
		if trim_seqs:
			print('Trimming low quality bases')
			trimming_parameters = {
				'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),				
				'MINLEN': min_read_len_post_trim
			}
			method = 'PE'		
			read_files = processing.run_trimmomatic(pair_of_files, folder_path, method, phred_encode, trimming_parameters)
		else:
			read_files = pair_of_files

		# Stitch R1-R2 files
		pairing_parameters = {
			'v': min_overlap_length,
			'm': max_assembly_length,
			'n': min_assembly_length,
			'u': max_fraction_uncalled,					
		}
		print('Stitching R1-R2 reads')
		pear_results = processing.run_pear(read_files[0], read_files[1], working_directory=folder_path, parameters=pairing_parameters, num_threads=number_threads, memory=pear_memory)[0]
		# Run quality filtering
		filtered_file = fastx.Run_Quality_Filter(pear_results, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases)		
		os.remove(pear_results)
		processed_files.append(filtered_file)
	
	print('Annotating processed fastq files')
	annotated_files = []
	for i, f in enumerate(processed_files):
		output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
		output_file_annotation = useful.removeFileExtension(f) + '.mixcr.annotation'
		# Run MIXCR file
		print('Running MIXCR')
		[annotated_f, command_val] = mixcr.RunMixcr(f, output_file, filetype='FASTQ', loci=loci, species=species, exportPrettyAlignment=False, num_threads=number_threads)
		# Parse MIXCR file
		print('Parsing MIXCR')
		annotated_file = mixcr.parseMIXCR(f, output_file, 'FASTQ', output_file_annotation, command_val=command_val)  # parseMIXCR returns the annotation file path (should equal output_file_annotation)
		annotated_files.append(annotated_file[0])
	print('Pipeline complete')
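Example 2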
def run_gglab_pipeline(input_files, species, loci, group_name=''):
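	"""Single-end GGLab preprocessing and annotation pipeline.

	Summary inferred from the body below: input_files is a flat list of
	fastq(.gz) paths. Each file is gunzipped if needed, trimmed
	(Trimmomatic SE), quality filtered, then aligned and parsed with
	MiXCR; the annotated files are finally paired via pairing.RunPairing.
	Module-level settings such as window_trim, cluster_setting and
	number_threads must be defined elsewhere.
	"""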
	# Unzip files
	print('Processing raw fastq files')
	processed_files = []
	for i, f in enumerate(input_files):
		folder_path = os.path.dirname(f)
		if f.endswith('.gz'):
			print('Unzipping: ', f)
			f = useful.gunzip_python(f)

		# Run trimmomatic
		trimming_parameters = {
			'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),				
			'MINLEN': min_read_len_post_trim
		}
		method = 'SE'		
		trimmedf = processing.run_trimmomatic(f, folder_path, method, phred_encode, trimming_parameters)[0]		
		# Run quality filtering
		filtered_trimmed_file = fastx.Run_Quality_Filter(trimmedf, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases)		
		os.remove(trimmedf)
		processed_files.append(filtered_trimmed_file)
	
	print('Annotating processed fastq files')
	annotated_files = []
	for i, f in enumerate(processed_files):
		output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
		output_file_annotation = useful.removeFileExtension(f) + '.mixcr.annotation'
		# Run MIXCR file
		print('Running MIXCR')
		[annotated_f, command_val] = mixcr.RunMixcr(f, output_file, filetype='FASTQ', loci=loci, species=species, exportPrettyAlignment=False, num_threads=number_threads)
		# Parse MIXCR file
		print('Parsing MIXCR')
		annotated_file = mixcr.parseMIXCR(f, output_file, 'FASTQ', output_file_annotation, command_val=command_val)  # parseMIXCR returns the annotation file path (should equal output_file_annotation)
		annotated_files.append(annotated_file)	
	print('Pairing sequences')	
	output_dir = os.path.dirname(annotated_files[0])
	pairing.RunPairing(annotated_files, annotated_file_formats='TAB', analysis_method='MIXCR', output_folder_path=output_dir, prefix_output_files=group_name, cluster_cutoff=cluster_setting, annotation_cluster_setting=annotation_cluster_cutoff)
	print('Pipeline complete')
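
A minimal invocation sketch for the single-end variant above. The file names and the species/loci values are illustrative only, and the module-level settings the function reads (window_trim, quality_cutoff_trim, min_read_len_post_trim, phred_encode, quality_cutoff, percent_bases, number_threads, cluster_setting, annotation_cluster_cutoff) must already be defined:

run_gglab_pipeline(['donor1.fastq.gz', 'donor2.fastq.gz'],
                   species='human', loci=['IGH'], group_name='donor_run')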
Example 3
def isotype_sequences(input_file,
                      input_file_type,
                      barcode_file='',
                      output_file=None,
                      output_format='TAB',
                      seq_var='sequence',
                      header_var='header',
                      helper_fields={},
                      alignment_settings={},
                      analysis_name=None):
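    """Assign an antibody isotype to each sequence in input_file.

    Summary inferred from the body below: each sequence (field seq_var)
    is aligned against isotype barcodes (from barcode_file, or built-in
    defaults) using fft_tools.BarcodeAligner, and results are written to
    output_file in output_format ('TAB'/'CSV'). Recognized
    alignment_settings keys: penalize_truncations,
    minimum_alignment_length, search_rc, allowed_mismatches_in_alignment
    and strand_corrected. Returns the output file path.
    """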
    #####OVERHEAD SETUP

    help_1 = defaultdict(str, copy.deepcopy(helper_fields))
    recombination_var = help_1['recombination_var']
    strand_field = help_1['strand_field']
    end_of_ab_field = help_1['end_of_ab_field']

    al_1 = copy.deepcopy(alignment_settings)

    penalize_truncations = al_1.get('penalize_truncations', True)

    minimum_alignment_length = al_1.get('minimum_alignment_length', 15)

    #0 => only consider barcodes as provided
    #1 => only consider the reverse complement of the barcodes provided
    #2 => consider both strands
    search_rc = al_1.get('search_rc', 2)

    allowed_mismatches_in_alignment = al_1.get('allowed_mismatches_in_alignment', 2)

    #the sequence field provided is the sequence of the SENSE AB gene, not the antisense
    #when False, both the forward and reverse complement of the sequence are considered
    strand_corrected = al_1.get('strand_corrected', False)

    #file locations
    seq_fasta_location = input_file  #location of the input file

    translator_field = copy.deepcopy(translator)

    if analysis_name:
        translator_field['ANALYSIS_NAME'] = analysis_name.upper()

    translator_field = {translation_var: translator_field}
    if output_file is None or output_file == input_file:
        output_file = useful.removeFileExtension(
            input_file) + '.isotype.annotation'

    output_file_location = output_file

    output_file_format = output_format

    outHandle = open(output_file_location, 'w')
    outHandle.write(
        descriptor_symbol + json.dumps(translator_field) + '\n'
    )  #write a translator line to this file so that we know how to add results to database
    if output_format == 'TAB' or output_format == 'CSV':
        outHandle.write('\t'.join(FileDelimFields) + '\n')

    if not barcode_file:
        #fall back to the built-in default barcodes
        barcodeSeqList = defaultBarcodes()
    elif not os.path.isfile(barcode_file):
        print('Barcode file not found! Using default barcodes')
        #fall back to the built-in default barcodes
        barcodeSeqList = defaultBarcodes()
    else:
        barcodeSeqList = readBarcodeFile(barcode_file)

    command_string = json.dumps({
        'Barcodes': barcodeSeqList,
        'mismatch_cutoff': allowed_mismatches_in_alignment,
        'penalize_truncations': penalize_truncations,
        'minimum_length_cutoff': minimum_alignment_length
    })

    iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,
                                      filetype=input_file_type)

    #get maximum length of sequences in file
    [maxLen, numSeq] = maxSeqLen(iffile, seq_var)

    #set up the aligner for matching sequences to isotype barcodes
    guessed_num_bases_after_jgene = 60
    isotype_predictor = fft_tools.BarcodeAligner(
        barcodeSeqList,
        penalize_truncations,
        search_rc,
        allowed_mismatches_in_alignment,
        minimum_alignment_length,
        nmax=maxLen,
        nmin=guessed_num_bases_after_jgene)

    ###END OF OVERHEAD SETUP

    #now let's read through the sequences and start aligning
    algnLim = 10
    currentSeq = 0
    overlap_len = 10

    counter = 0
    startPer = 0

    num_isotype_found = {}
    total_isotype_found = 0
    total_found_score = 0
    total_notfound_score = 0

    print("Starting isotyping analysis for {0} sequences".format(numSeq))

    totaltime = 0
    a = int(round(time.time()))
    found = 0

    iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,
                                      filetype=input_file_type)
    summary_data = {
        'found': 0,
        'top_isotype': defaultdict(int),
        'average_mismatch': 0,
        'average_num_isotype_found': 0
    }

    for line_row in iffile.read():
        jsonVar = {}
        if not line_row:
            continue

        if header_var in line_row:
            if idIdentifier in line_row:
                jsonVar[idIdentifier] = line_row[idIdentifier]
                jsonVar['Header'] = line_row[header_var]
            else:
                [header, seq_id] = GrabAdditionalHeaderInfo(line_row[header_var])
                jsonVar[idIdentifier] = seq_id
                jsonVar['Header'] = header

        if seq_var not in line_row or line_row[seq_var] == '':
            jsonVar['Sequence'] = ''
            jsonVar['Notes'] = 'No sequence found'
            writeSeqResult(outHandle, jsonVar, output_format)
            continue

        #allow the user to monitor what percent of the sequences have been processed
        startPer = useful.LoopStatus(counter, numSeq, 10, startPer)

        bestScore = 0
        bestBarcode = -1

        jsonVar['Sequence'] = line_row[seq_var]
        jsonVar['Command'] = command_string
        counter += 1

        seqFwd = jsonVar['Sequence']

        if strand_corrected:
            all_seqs = [seqFwd]
        else:
            all_seqs = [seqFwd, str(Seq(seqFwd).reverse_complement())]

        found_strand = ''
        for pos, each_seq in enumerate(all_seqs):
            #determine if we should take a substring of the sequence
            #basically, only consider nucleotides AFTER the end of the ab field
            if end_of_ab_field in line_row and line_row[end_of_ab_field] != '':
                try:
                    end_of_ab = int(line_row[end_of_ab_field])
                except (ValueError, TypeError):
                    end_of_ab = 0
                #take substring
                if 0 <= end_of_ab - overlap_len < len(each_seq):
                    each_seq = each_seq[end_of_ab:]

            isotypes_results = isotype_predictor.AlignToSeq(each_seq)
            if isotypes_results:
                found_strand = strand_orientation_list[pos]
                break

        if isotypes_results:
            found += 1
            jsonVar.update(isotypes_results)

            jsonVar['Sequence strand'] = found_strand

            if recombination_var in line_row and line_row[recombination_var]:
                #always trust the recombination type from input file IF provided
                jsonVar['Recombination type'] = line_row[recombination_var]
            else:
                #if there is no result, attempt to guess it ourselves
                jsonVar['Recombination type'] = GuessRecombType(
                    jsonVar['Isotype'][0])

            summary_data['top_isotype'][jsonVar['Isotype'][0]] += 1
            summary_data['average_num_isotype_found'] += len(
                jsonVar['Isotype'])
            summary_data['average_mismatch'] += jsonVar['Mismatches'][0]
        else:
            if recombination_var in line_row and line_row[recombination_var]:
                #always trust the recombination type from input file IF provided
                jsonVar['Recombination type'] = line_row[recombination_var]

            jsonVar['Isotype'] = ''
            jsonVar['Notes'] = 'Could not identify isotype with alignment score above threshold'
            summary_data['top_isotype']['NotFound'] += 1

        writeSeqResult(outHandle, jsonVar, output_format)

    b = int(round(time.time()))

    summary_data['found'] = found
    if found:
        summary_data['average_mismatch'] = summary_data[
            'average_mismatch'] / float(found)
        summary_data['average_num_isotype_found'] = summary_data[
            'average_num_isotype_found'] / float(found)

    totaltime = (b - a)

    print "time: "
    print totaltime

    print "Summary of identified isotypes:"
    print summary_data

    #if total_isotype_found>0:
    #	print "\nAverage score for identified isotypes:"
    #	print str(total_found_score/float(total_isotype_found))

    #if numSeq-total_isotype_found>0:
    #	print "\nAverage score for unidentified isotypes:"
    #	print str(total_notfound_score/float(numSeq-total_isotype_found))

    outHandle.close()
    #if output_file_format=="txt":
    #	JSON_to_TXT(output_file_location, output_file_location, True,{'Header':1,'Seq':2,'dir':3,'isotype':4,'algnPos':5,'maxscore':6,'bestscore':7})
    return output_file
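
A minimal usage sketch for the function above. The input path is hypothetical, 'FASTQ' is assumed to be an accepted immunogrepFile filetype, and the alignment_settings keys shown are the ones read in the overhead section:

out_path = isotype_sequences('reads.fastq', 'FASTQ',
                             alignment_settings={'allowed_mismatches_in_alignment': 1,
                                                 'search_rc': 2},
                             analysis_name='ISOTYPING')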
Example 4
def run_gglab_pipeline(input_files, species, loci, group_name=""):
    # Unzip files
    print("Processing raw fastq files")
    processed_files = []

    for pair_of_files in input_files:
        folder_path = os.path.dirname(pair_of_files[0])
        for i, f in enumerate(pair_of_files):
            if f.endswith(".gz"):
                print("Unzipping: ", f)
                pair_of_files[i] = useful.gunzip_python(f)

        # Run trimmomatic
        if trim_seqs:
            print("Trimming low quality bases")
            trimming_parameters = {
                "SLIDINGWINDOW": str(window_trim) + ":" + str(quality_cutoff_trim),
                "MINLEN": min_read_len_post_trim,
            }
            method = "PE"
            read_files = processing.run_trimmomatic(
                pair_of_files, folder_path, method, phred_encode, trimming_parameters
            )
        else:
            read_files = pair_of_files

        # Stitch R1-R2 files
        pairing_parameters = {
            "v": min_overlap_length,
            "m": max_assembly_length,
            "n": min_assembly_length,
            "u": max_fraction_uncalled,
        }
        print("Stitching R1-R2 reads")
        pear_results = processing.run_pear(
            read_files[0],
            read_files[1],
            working_directory=folder_path,
            parameters=pairing_parameters,
            num_threads=number_threads,
            memory=pear_memory,
        )[0]
        # Run quality filtering
        filtered_file = fastx.Run_Quality_Filter(
            pear_results, output_dir=folder_path, quality=quality_cutoff, percent=percent_bases
        )
        os.remove(pear_results)
        processed_files.append(filtered_file)

    print("Annotating processed fastq files")
    annotated_files = []
    for i, f in enumerate(processed_files):
        output_file = useful.removeFileExtension(f) + ".mixcr.alignment"
        output_file_annotation = useful.removeFileExtension(f) + ".mixcr.annotation"
        # Run MIXCR file
        print("Running MIXCR")
        [annotated_f, command_val] = mixcr.RunMixcr(
            f,
            output_file,
            filetype="FASTQ",
            loci=loci,
            species=species,
            exportPrettyAlignment=False,
            num_threads=number_threads,
        )
        # Parse MIXCR file
        print("Parsing MIXCR")
        annotated_file = mixcr.parseMIXCR(
            f, output_file, "FASTQ", output_file_annotation, command_val=command_val
        )  # parseMIXCR returns the annotation file path (should equal output_file_annotation)
        annotated_files.append(annotated_file[0])
    print("Pipeline complete")
def Descriptive_Statistics(list_of_files,input_file_type,analysis_name='',exp_names = [],output_file_prefix='',fields={},statistics_to_run=['ab_aa','cdr3','vgene','jgene','vjgene']):
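	"""Compute descriptive repertoire statistics over annotated sequence files.

	Summary inferred from the body below: reads each annotated file,
	tallies full-length, CDR3, stop-codon, and VDJ/VJ counts, and,
	depending on statistics_to_run ('ab_aa', 'cdr3', 'vgene', 'jgene',
	'vjgene'), writes a unique amino-acid file, unique CDR3 counts,
	V/J gene tables and distribution plots. Returns a dict of the
	'files' and 'figures' generated.
	"""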
	analysis_name = analysis_name.upper()
	if input_file_type=='IMGT' and not isinstance(list_of_files[0],list):
		list_of_files = [list_of_files]
	elif not isinstance(list_of_files,list):
		list_of_files = [list_of_files]
		
	if len(exp_names)!=len(list_of_files):
		exp_names = []
	
	#by default, save results to the same folder as the input file
	if not output_file_prefix:
		first_file = list_of_files[0][0] if isinstance(list_of_files[0], list) else list_of_files[0]
		output_file_prefix = useful.removeFileExtension(first_file)
	
	supported_analyses = fields_for_analysis.keys()
	if (not analysis_name or analysis_name=='CUSTOM' or analysis_name not in supported_analyses) and not fields:
		raise Exception('The required fields for the provided analysis, {0}, are not currently automated. Please explicitly provide the field names'.format(str(analysis_name)))
	
	#first we use the default fields defined here
	if analysis_name in supported_analyses:
		fields_to_use = copy.deepcopy(fields_for_analysis[analysis_name])
	else:
		fields_to_use = {}
	#next we add in user-defined fields in case there are any changes/mistakes
	for f,name in fields.items():
		fields_to_use[f] = name
	
	
	
	filenames_to_use = [f[0] if isinstance(f,list) else f for f in list_of_files]
	print('Performing descriptive statistics at {0}.'.format(str(datetime.datetime.now())))
	print('Analyzing the following files:\n\t {0}'.format('\n\t'.join(filenames_to_use)))
	unique_aa_file = None 
	unique_cdr3_file = None 	
	v_gene_analysis = None
	j_gene_analysis = None
	vj_gene_analysis = None
	gene_analysis_plot = output_file_prefix
	plots_created = []
	gene_summary_file = output_file_prefix+'.summary_of_stats.txt'
	
	
	output_file_names = {}
	
	aa_files = ['AB AA SEQUENCE','RECOMBINATION_TYPE','LOCUS','CDR1','CDR2','CDR3','STOP CODONS','PRODUCTIVE','VGENES','DGENES','JGENES','TOTAL COUNTS']
	fields_order = ['full_len_ab','recomb','locus','cdr1','cdr2','cdr3','stopc','functionality','vgene','dgene','jgene']
	num_exp= len(list_of_files)
	if not exp_names:
		if input_file_type=='IMGT':
			pass
		else:			
			exp_names = []
			for file in list_of_files:
				count = 1
				str_file = os.path.basename(file)
				while True:					
					if str_file in exp_names:
						str_file = os.path.basename(file)+'_'+str(count)
						count+=1
					else:
						exp_names.append(str_file)
						break			
		
	if 'ab_aa' in statistics_to_run:
		intermediate_file = output_file_prefix+'.unique_aa_file_temp'
		#first we will write to a temp/intermediate file
		output_file_names['ab_aa'] = open(intermediate_file,'w')
		#output_file_names['ab_aa'].write('\t'.join(aa_files)+'\n')
	
	
	cdr3analysis = 'cdr3' in statistics_to_run
	aaanalysis = 'ab_aa' in statistics_to_run
	
	vjgene_dict=defaultdict(lambda:defaultdictgenes(num_exp))
	
	#cdr3_dict=defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_vdj = defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_vj = defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_unk = defaultdict(lambda:defaultdictcdr3(num_exp))	
	
	use_these_fields = list(fields_to_use.values())
	fields_to_use['stopc'] = 'stopc'
	num_results = [0]*(num_exp)
	num_cdr3 = [0]*(num_exp)
	num_stop_codon = [0]*(num_exp)
	num_vdj = [0]*(num_exp)
	num_vj = [0]*(num_exp)
	num_sequences = [0]*(num_exp)
	
	
	if not fields_to_use.get('recomb'):
		#the user never defined a field for the recombination type; we will have to guess it using the recomb_call variable at the top of the script
		recomb_not_defined = True
		fields_to_use['recomb'] = 'recomb'
	else:
		recomb_not_defined = False
	
	
	print('Reading through sequences in file(s)')
	seqnum=1
	#go through all of the files and report the relevant fields
	#if we are creating a unique amino acid file, then report these fields to the temp file
	for fnum,each_file in enumerate(list_of_files):				
		annotated_file = readfile.immunogrepFile(each_file,input_file_type,field_names = use_these_fields)
		#loop through each file 
		for seq_lines in annotated_file.read():						
			if not seq_lines:
				continue
			if seqnum%500000==0:
				print('Read {0} sequences'.format(str(seqnum)))
			seqnum+=1
			num_sequences[fnum]+=1			
			seq_lines = defaultdict(str,seq_lines)
			if seq_lines[fields_to_use['full_len_ab']]:
				#full length antibody sequence found
				num_results[fnum]+=1
												
			#only select the first gene in the list; also remove the allele suffix ('*')
			seq_lines[fields_to_use['vgene']] = seq_lines[fields_to_use['vgene']].split(',')[0].split('*')[0]
			seq_lines[fields_to_use['dgene']] = seq_lines[fields_to_use['dgene']].split(',')[0].split('*')[0]
						
			#IF NO RECOMBINATION TYPE IS FOUND or provided, THEN guess it using the vgene or jgene call
			if recomb_not_defined or not seq_lines[fields_to_use['recomb']]:
				r = '' #not sure what the recombination type is yet
				#try to guess the recombination type
				if seq_lines[fields_to_use['vgene']]:
					#use vgene if present
					#look at the first three characters in vgene to predict the recombination type
					gn = ProcessGene(seq_lines[fields_to_use['vgene']])
					if gn[:3] in recomb_call:
						r = recomb_call[gn[:3]]
					elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST reports TA rather than TRA, for example)
						r = recomb_call[gn[:2]]
				if not r and seq_lines[fields_to_use['jgene']]:
					#still no r found, so use jgene
					gn = ProcessGene(seq_lines[fields_to_use['jgene']])
					if gn[:3] in recomb_call:
						r = recomb_call[gn[:3]]
					elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST reports TA rather than TRA, for example)
						r = recomb_call[gn[:2]]
				
				#update recomb result 
				seq_lines[fields_to_use['recomb']] = r								
			
			if not seq_lines[fields_to_use['recomb']]:
				continue
								
			if seq_lines[fields_to_use['recomb']] == 'VDJ':
				num_vdj[fnum]+=1								
			elif seq_lines[fields_to_use['recomb']] == 'VJ':				
				num_vj[fnum]+=1
				
			
			seq_lines[fields_to_use['jgene']] = seq_lines[fields_to_use['jgene']].split(',')[0].split('*')[0]
			seq_lines['stopc'] = 'YES' if '*' in seq_lines[fields_to_use['full_len_ab']] else 'NO'			
			if seq_lines['stopc'] == 'YES':
				num_stop_codon[fnum]+=1
			if aaanalysis:
				exp_str = str(fnum+1)
				#make an intermediate file where we only put the fields we want, in the proper order, from any file
				#we will use this file for sorting afterwards
				#also output exp_num to track which sequence came from which experiment
				output_file_names['ab_aa'].write('\t'.join([seq_lines[fields_to_use[f]] for f in fields_order])+'\t'+str(exp_str)+'\n')
			if seq_lines[fields_to_use['vgene']] or seq_lines[fields_to_use['jgene']]:							
				key_v =delim.join([seq_lines[fields_to_use['vgene']],seq_lines[fields_to_use['jgene']],seq_lines[fields_to_use['recomb']]])
				vjgene_dict[key_v][fnum]+=1
			
			if not seq_lines[fields_to_use['cdr3']]:
				#no cdr3 found 	
				continue
			
			#add unique cdr3_recomb and vjgene info to dictionaries
			num_cdr3[fnum]+=1
			
			if cdr3analysis:
				key = seq_lines[fields_to_use['cdr3']]
				#key_cdr3 = delim.join([],seq_lines[fields_to_use['recomb']]])
				if seq_lines[fields_to_use['recomb']]=='VDJ':
					cdr3_dict_vdj[key][fnum]+=1
				elif seq_lines[fields_to_use['recomb']]=='VJ':
					cdr3_dict_vj[key][fnum]+=1					
				else:
					print('unknown recombination type: ',seq_lines[fields_to_use['recomb']])
					cdr3_dict_unk[key][fnum]+=1 
									  
				
					
	if aaanalysis:
		output_file_names['ab_aa'].close()
		print('Generating a file of unique AB amino acid sequences')
		unique_aa_file = output_file_prefix+'.unique_aa_file.txt'
		#use some bash (sort, then awk) to build the unique amino-acid file
		GenerateAAFile(intermediate_file,unique_aa_file,aa_files,exp_names)
		#number of amino acid sequences observed
		if not os.path.isfile(unique_aa_file):
			num_unique_aa = 0 
		else:
			num_unique_aa = useful.file_line_count(unique_aa_file)-1 #-1 => remove header row count
	
	#Now have some fun with pandas 	
	if set(['vgene','jgene','vjgene']) & set(statistics_to_run):
		#vjgene_dict format = {
		#	key = vgene, jgene, and recombtype joined by delim
		#	value = [count, ...] => counts for that key in EACH provided file/experiment (length = number of experiments)
		#}
		gene_df = pd.DataFrame(vjgene_dict).transpose()
		if 'VGENE' not in gene_df.columns:
			gene_df['VGENE'] = ''
		if 'JGENE' not in gene_df.columns:
			gene_df['JGENE'] = ''
		if 'recomb' not in gene_df.columns:
			gene_df['recomb'] = ''
		gene_df['TOTAL_COUNTS'] = gene_df.sum(axis=1)		
		gene_df = gene_df.reset_index()				
		gene_df = gene_df.apply(ModifyPDTable,axis=1,args=(['VGENE','JGENE','recomb'],delim))
		
		
		new_names = {}
		for f,v in enumerate(exp_names):
			new_names[f]=v
			#key = experiment index number
			#value = new name

		#rename the columns 0,1,...num experiments to match the experiment names 
		gene_df = gene_df.rename(columns=new_names)
		
		#format of gene_df:
			#index => no index set, just use default numbers
			#columns => start with column for each experiment, then add the following columns: VGENE, JGENE, recomb, TOTAL_COUNTS

		if 'vgene' in statistics_to_run:
			print('Performing V gene analysis')
			
			v_gene_analysis = output_file_prefix+'.vgenes.txt'
			#group elements by VH gene calls and VL gene calls
			sorted_v_counts = gene_df.groupby(['recomb','VGENE']).sum()

			#find out which level of the multilevel index corresponds to 'VGENE' => it should be level 1 ('recomb' should be level 0)
			vgene_level = sorted_v_counts.index.names.index('VGENE')
			
			#remove results where VGENE is empty
			if '' in list(sorted_v_counts.index.levels[vgene_level]):
				sorted_v_counts = sorted_v_counts.drop('',level='VGENE')			
			
			ignore_counts = ['TOTAL_COUNTS','JGENE']
			keep_col = [n for n in sorted_v_counts.columns if n not in ignore_counts]
			g = sorted_v_counts[keep_col]			
			
			#NOW PLOT the FREQUENCY for every experiment
			if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]):
				
				vdj_g = g.xs('VDJ',level='recomb')
				
				PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.vgenes','VH Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)
				
				plots_created.append(gene_analysis_plot+'.vdj.vgenes.png') #.png extension is added in the function plotgenedist
				
			if 'VJ' in list(g.index.levels[g.index.names.index('recomb')]):
				
				vj_g = g.xs('VJ',level='recomb')
				PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.vgenes','VL Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)			
				plots_created.append(gene_analysis_plot+'.vj.vgenes.png') #.png extension is added in the function plotgenedist
			sorted_v_counts.reset_index().sort_values(['recomb','TOTAL_COUNTS'],ascending=[True,False]).iloc[:,:-1].to_csv(v_gene_analysis,sep='\t',index=False)
			
		
		#do the same as above, except for J genes this time 
		if 'jgene' in statistics_to_run:
			print('Performing J gene analysis')
			j_gene_analysis = output_file_prefix+'.jgenes.txt'
			sorted_j_counts = gene_df.groupby(['recomb','JGENE']).sum()
			jgene_level = sorted_j_counts.index.names.index('JGENE')			
			if '' in list(sorted_j_counts.index.levels[jgene_level]):
				sorted_j_counts.drop('',level='JGENE',inplace=True)			
			ignore_counts = ['TOTAL_COUNTS','VGENE']
			keep_col = [n for n in sorted_j_counts.columns if n not in ignore_counts]
			g = sorted_j_counts[keep_col]			
			sorted_j_counts.reset_index().sort_values(['recomb','TOTAL_COUNTS'],ascending=[True,False]).iloc[:,:-1].to_csv(j_gene_analysis,sep='\t',index=False)
			
			#NOW CALCULATE FREQUENCY for every experiment
			if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]):
				vdj_g = g.xs('VDJ',level='recomb')
				PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.jgenes','JH Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
				plots_created.append(gene_analysis_plot+'.vdj.jgenes.png') #.png extension is added in the function plotgenedist
			if 'VJ' in list(g.index.levels[g.index.names.index('recomb')]):
				vj_g = g.xs('VJ',level='recomb')			
				PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.jgenes','JL Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
				plots_created.append(gene_analysis_plot+'.vj.jgenes.png') #.png extension is added in the function plotgenedist		
		
		#now perform a V-J gene analysis (heat map) for each experiment 
		if 'vjgene' in statistics_to_run:
			print('Performing V-J gene analysis')
			vj_gene_analysis = output_file_prefix+'.v_and_jgene_analysis.txt'
			#group the dataframe by recombination, vgene, and jgene
			#first rename all V and J genes that are empty as 'No call'
			#then group H/L results by V and J genes and take the sum of each column in the group
			vj_df = gene_df.replace([''],[' No call']).groupby(['recomb','VGENE','JGENE']).sum()
			vj_df.to_csv(vj_gene_analysis,sep='\t')			
			
			#remove TOTAL_COUNTS			
			vj_df.drop('TOTAL_COUNTS', axis=1, inplace=True)
			
			#calculate frequency for each recomb type 
			if 'VDJ' in list(vj_df.index.levels[vj_df.index.names.index('recomb')]):
				v1 =  vj_df.loc['VDJ',:]/vj_df.loc['VDJ',:].sum()
				PlotVJGeneHeatMap(v1,gene_analysis_plot+'.vdj.v_and_jgene_analysis',max_val=None,min_val=None)
				plots_created.append(gene_analysis_plot+'.vdj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist
			if 'VJ' in list(vj_df.index.levels[vj_df.index.names.index('recomb')]):
				v2 =  vj_df.loc['VJ',:]/vj_df.loc['VJ',:].sum()
				PlotVJGeneHeatMap(v2,gene_analysis_plot+'.vj.v_and_jgene_analysis',max_val=None,min_val=None)																							
				plots_created.append(gene_analysis_plot+'.vj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist
			del vj_df
		del gene_df
		
	#let's do some cdr3 analysis
	cdr3_length_stats = {}
	diversity_measurements = {}
	if cdr3analysis:	
		unique_cdr3_file = output_file_prefix+'.unique_cdr3_counts.txt' 
		print('Performing CDR3 analysis')
		if sum(num_cdr3)>0:
			#again create a pandas dataframe but this time using the unique cdr3 calls 
			print('Loading CDR3s into a dataframe')
			cdr3_df_list = [pd.DataFrame.from_dict(c,orient='index') for c in [cdr3_dict_vdj,cdr3_dict_vj,cdr3_dict_unk]]
			#merge all dataframes together
			keys=['VDJ','VJ','UNK']
			cdr3_df = pd.concat(cdr3_df_list,keys=keys)
			#cdr3_df = pd.DataFrame(cdr3_dict).transpose()			
			cdr3_df['TOTAL_COUNTS'] = cdr3_df.sum(axis=1)
			print('Dataframe created')
			
			cdr3_df.index.names = ['recomb','CDR3']
			cdr3_df = cdr3_df.reset_index()				
			#cdr3_df['CDR3'] = ''
			#cdr3_df['recomb'] = ''
			#cdr3_df = cdr3_df.apply(ModifyPDTable,axis=1,raw=True,reduce=True,args=(['CDR3','recomb'],delim))			
			
			new_names = {}
			#add a CDR3 length column
			cdr3_df['CDR3_LENGTH'] = cdr3_df.CDR3.map(len)
			for f,v in enumerate(exp_names):
				new_names[f]=v
			#rename the columns to match the experiment names 
			
			cdr3_df = cdr3_df.rename(columns=new_names)
			cdr3_df.sort_values(['recomb','TOTAL_COUNTS'],ascending=[True,False],inplace=True)
			cdr3_df.set_index(['recomb','CDR3'],inplace=True)					
			
			#save the dataframe as a tab-delimited file
			cdr3_df.to_csv(unique_cdr3_file,sep='\t')									
			
			cdr3_length_stats = PlotCDR3Histogram(cdr3_df,gene_analysis_plot+'.cdr3_length_histogram')
			plots_created.append(gene_analysis_plot+'.cdr3_length_histogram.png')
			
			diversity_measurements = CalculateDiversities(cdr3_df,gene_analysis_plot+'.cdr3_diversity_plots')
			plots_created.append(gene_analysis_plot+'.cdr3_diversity_plots.png')			
			del cdr3_df
	
	print('Writing summary to file')
	#finally make a results text file that summarizes all the information	
	GenerateResultsSummaryFile(gene_summary_file,statistics_to_run,list_of_files,exp_names,unique_aa_file,unique_cdr3_file,v_gene_analysis,j_gene_analysis,vj_gene_analysis,plots_created,num_sequences,num_results,num_vdj,num_vj,num_cdr3,num_stop_codon,cdr3_length_stats,diversity_measurements)	
	
	files_generated = [gene_summary_file]
	if unique_aa_file:
		files_generated.append(unique_aa_file)
	if unique_cdr3_file:
		files_generated.append(unique_cdr3_file)
	if v_gene_analysis:
		files_generated.append(v_gene_analysis)
	if j_gene_analysis:
		files_generated.append(j_gene_analysis)
	if vj_gene_analysis:
		files_generated.append(vj_gene_analysis)
	
	print('Descriptive statistics completed at {0}.'.format(str(datetime.datetime.now())))
	
	gc.collect()

	
	return {'files':files_generated,'figures':plots_created}
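
A usage sketch under stated assumptions: the file and experiment names are hypothetical, 'TAB' is assumed to be an accepted input_file_type for readfile.immunogrepFile, and 'MIXCR' is assumed to be a key of the module-level fields_for_analysis dict (otherwise pass the required field names explicitly via fields):

results = Descriptive_Statistics(['run1.annotation', 'run2.annotation'], 'TAB',
                                 analysis_name='MIXCR',
                                 exp_names=['run1', 'run2'],
                                 statistics_to_run=['cdr3', 'vgene', 'jgene'])
print(results['files'], results['figures'])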
Example 6
def isotype_sequences(input_file,input_file_type,barcode_file='',output_file=None,output_format='TAB',seq_var='sequence',header_var='header',helper_fields = {},alignment_settings = {},analysis_name = None):		
	#####OVERHEAD SETUP
	
	help_1 = defaultdict(str,copy.deepcopy(helper_fields))
	recombination_var = help_1['recombination_var']
	strand_field = help_1['strand_field']
	end_of_ab_field = help_1['end_of_ab_field']
		
	
	al_1 = copy.deepcopy(alignment_settings)
	
	penalize_truncations = al_1.get('penalize_truncations', True)
	
	minimum_alignment_length = al_1.get('minimum_alignment_length', 15)
	
	#0 => only consider barcodes as provided
	#1 => only consider the reverse complement of the barcodes provided
	#2 => consider both strands
	search_rc = al_1.get('search_rc', 2)
	
	allowed_mismatches_in_alignment = al_1.get('allowed_mismatches_in_alignment', 2)
	
	#the sequence field provided is the sequence of the SENSE AB gene, not the antisense
	#when False, both the forward and reverse complement of the sequence are considered
	strand_corrected = al_1.get('strand_corrected', False)
		
				
	#file locations
	seq_fasta_location = input_file  #location of the input file
	
	translator_field = copy.deepcopy(translator)
	
	if analysis_name:
		translator_field['ANALYSIS_NAME'] = analysis_name.upper()
	
	
	translator_field = {translation_var:translator_field}
	if output_file is None or output_file==input_file:
		output_file = useful.removeFileExtension(input_file)+'.isotype.annotation'
	
	output_file_location = output_file
		
		
	output_file_format = output_format
		
	outHandle = open(output_file_location,'w')		
	outHandle.write(descriptor_symbol+json.dumps(translator_field)+'\n')#write a translator line to this file so that we know how to add results to database 
	if output_format == 'TAB' or output_format == 'CSV':
		outHandle.write('\t'.join(FileDelimFields)+'\n')
	
	if not barcode_file:
		#fall back to the built-in default barcodes
		barcodeSeqList = defaultBarcodes()
	elif not os.path.isfile(barcode_file):
		print('Barcode file not found! Using default barcodes')
		#fall back to the built-in default barcodes
		barcodeSeqList = defaultBarcodes()
	else:
		barcodeSeqList = readBarcodeFile(barcode_file)
		
	command_string = json.dumps({'Barcodes':barcodeSeqList,'mismatch_cutoff':allowed_mismatches_in_alignment,'penalize_truncations':penalize_truncations,'minimum_length_cutoff':minimum_alignment_length})
	
	
	
	iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,filetype=input_file_type)
		
	#get maximum length of sequences in file 
	[maxLen,numSeq] = maxSeqLen(iffile,seq_var) 	
	
	#set up the aligner for matching sequences to isotype barcodes
	guessed_num_bases_after_jgene = 60
	isotype_predictor =fft_tools.BarcodeAligner(barcodeSeqList,penalize_truncations,search_rc,allowed_mismatches_in_alignment,minimum_alignment_length,nmax=maxLen,nmin=guessed_num_bases_after_jgene)		
					
	###END OF OVERHEAD SETUP
	
	
	#now let's read through the sequences and start aligning
	algnLim = 10
	currentSeq = 0
	overlap_len = 10
	
	counter = 0
	startPer = 0
	
	num_isotype_found = {}
	total_isotype_found = 0
	total_found_score=0
	total_notfound_score=0
	
	print("Starting isotyping analysis for {0} sequences".format(numSeq))

	
	totaltime = 0
	a = int(round(time.time()))
	found = 0 
	
	iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,filetype=input_file_type)
	summary_data = {'found':0,'top_isotype':defaultdict(int),'average_mismatch':0,'average_num_isotype_found':0}
	
	for line_row in iffile.read():			
		jsonVar = {}
		if not line_row:
			continue
		
		if header_var in line_row:
			if idIdentifier in line_row:
				jsonVar[idIdentifier] = line_row[idIdentifier]
				jsonVar['Header'] = line_row[header_var]
			else:
				[header, seq_id] = GrabAdditionalHeaderInfo(line_row[header_var])
				jsonVar[idIdentifier] = seq_id
				jsonVar['Header'] = header
			
		
		if seq_var not in line_row or line_row[seq_var]=='':		
			jsonVar['Sequence']=''					
			jsonVar['Notes'] = 'No sequence found'			
			writeSeqResult(outHandle,jsonVar,output_format)			
			continue
								
		#allow the user to monitor what percent of the sequences have been processed					
		startPer = useful.LoopStatus(counter,numSeq,10,startPer)
		
		bestScore = 0
		bestBarcode = -1
			
		jsonVar['Sequence'] = line_row[seq_var]
		jsonVar['Command'] = command_string
		counter+=1		
				
		seqFwd = jsonVar['Sequence']
		
		if strand_corrected:
			all_seqs = [seqFwd]
		else:
			all_seqs = [seqFwd,str(Seq(seqFwd).reverse_complement())]
		
		
		found_strand =''
		for pos,each_seq in enumerate(all_seqs):										
			#determine if we should take a substring of the sequence 
			#basically, only consider nucleotides AFTER the end of the ab field 
			if end_of_ab_field in line_row and line_row[end_of_ab_field]!='':
				try:
					end_of_ab = int(line_row[end_of_ab_field])
				except (ValueError, TypeError):
					end_of_ab = 0
				#take substring
				if 0 <= end_of_ab-overlap_len < len(each_seq):
					each_seq = each_seq[end_of_ab:]
										
			isotypes_results = isotype_predictor.AlignToSeq(each_seq)
			if isotypes_results:
				found_strand = strand_orientation_list[pos]
				break
		
		
		if isotypes_results:
			found += 1 
			jsonVar.update(isotypes_results)
			
			jsonVar['Sequence strand'] = found_strand			
			
			
			if recombination_var in line_row and line_row[recombination_var]:
				#always trust the recombination type from input file IF provided
				jsonVar['Recombination type'] = line_row[recombination_var]
			else:
				#if there is no result, attempt to guess it ourselves
				jsonVar['Recombination type'] = GuessRecombType(jsonVar['Isotype'][0])
			
			summary_data['top_isotype'][jsonVar['Isotype'][0]]+=1
			summary_data['average_num_isotype_found']+=len(jsonVar['Isotype'])
			summary_data['average_mismatch']+=jsonVar['Mismatches'][0]
		else:
			if recombination_var in line_row and line_row[recombination_var]:
				#always trust the recombination type from input file IF provided
				jsonVar['Recombination type'] = line_row[recombination_var]
		
		
			jsonVar['Isotype'] = ''
			jsonVar['Notes'] = 'Could not identify isotype with alignment score above threshold'
			summary_data['top_isotype']['NotFound']+=1
				
		writeSeqResult(outHandle,jsonVar,output_format)
				
		
	
	b = int(round(time.time()))
	
	summary_data['found'] = found
	if found:
		summary_data['average_mismatch'] = summary_data['average_mismatch']/float(found) 
		summary_data['average_num_isotype_found'] = summary_data['average_num_isotype_found']/float(found)
		
	totaltime=(b-a)			
	
	print "time: "
	print totaltime
	
	print "Summary of identified isotypes:"
	print summary_data
	
	#if total_isotype_found>0:
	#	print "\nAverage score for identified isotypes:"	
	#	print str(total_found_score/float(total_isotype_found))		
	
	#if numSeq-total_isotype_found>0:	
	#	print "\nAverage score for unidentified isotypes:"	
	#	print str(total_notfound_score/float(numSeq-total_isotype_found))
			
	outHandle.close()	
	#if output_file_format=="txt":
	#	JSON_to_TXT(output_file_location, output_file_location, True,{'Header':1,'Seq':2,'dir':3,'isotype':4,'algnPos':5,'maxscore':6,'bestscore':7})
	return output_file 
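Example 7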
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    # Unzip files
    print('Processing raw fastq files')
    processed_files = []
    for i, f in enumerate(input_files):
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            print('Unzipping: ', f)
            f = useful.gunzip_python(f)

        # Run trimmomatic
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        method = 'SE'
        trimmedf = processing.run_trimmomatic(f, folder_path, method,
                                              phred_encode,
                                              trimming_parameters)[0]
        # Run quality filtering
        filtered_trimmed_file = fastx.Run_Quality_Filter(
            trimmedf,
            output_dir=folder_path,
            quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmedf)
        processed_files.append(filtered_trimmed_file)

    print('Annotating processed fastq files')
    annotated_files = []
    for i, f in enumerate(processed_files):
        output_file = useful.removeFileExtension(f) + '.mixcr.alignment'
        output_file_annotation = useful.removeFileExtension(
            f) + '.mixcr.annotation'
        # Run MIXCR file
        print('Running MIXCR')
        [annotated_f,
         command_val] = mixcr.RunMixcr(f,
                                       output_file,
                                       filetype='FASTQ',
                                       loci=loci,
                                       species=species,
                                       exportPrettyAlignment=False,
                                       num_threads=number_threads)
        # Parse MIXCR file
        print('Parsing MIXCR')
        annotated_file = mixcr.parseMIXCR(
            f,
            output_file,
            'FASTQ',
            output_file_annotation,
            command_val=command_val
        )  # parseMIXCR returns the annotation file path (should equal output_file_annotation)
        annotated_files.append(annotated_file)
    print('Pairing sequences')
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(annotated_files,
                       annotated_file_formats='TAB',
                       analysis_method='MIXCR',
                       output_folder_path=output_dir,
                       prefix_output_files=group_name,
                       cluster_cutoff=cluster_setting,
                       annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')