Code Example #1
def parseMIXCR(originalfileloc,resultfileloc,inputype,outfile=None,header_var='document_header',sequence_var='sequence',command_val = {}):
	"""Pair each row of a tab-delimited MiXCR alignment export (resultfileloc) with its
	corresponding read in the original input file (originalfileloc), and write one
	annotated, tab-delimited line per input sequence to outfile."""
	command_string = json.dumps(command_val) if command_val else json.dumps({'MIXCR V1.3': 'Unknown settings'})
	if not outfile:
		outfile = "%s-parsed.annotation"%resultfileloc
	
	print('Parsing mixcr file')
	number_of_annotation_lines = useful.file_line_count(resultfileloc)	
	
	seqfile=immunogrepFile(originalfileloc,inputype) #the original file used as an input file for mixcr annotation  	
	iffile=immunogrepFile(resultfileloc,'TAB',None) #the mixcr generated alignment file 
	
	parent_folder = '/'.join(resultfileloc.split('/')[:-1])+'/'
	error_file = open(resultfileloc+'.errorlog.txt','w')
	unfound_seqs = open(resultfileloc+'.notfound.txt','w') 
	notfound=0
	seq_num=0
	errors=0
	needcapture = True
	
	looper = useful.LoopStatusGen(number_of_annotation_lines,10)
	t1 = time.time()
	with open(outfile,"w") as f:
		f.write(descriptor_symbol+json.dumps(DatabaseTranslator())+'\n')#write a translator line to this file so that we know how to add results to database 
		f.write('\t'.join(presetlabels)+'\n')		
		
		#read each input sequence/file
		for fastseq in seqfile.read():			
			try:
				content={}
				if not fastseq:
					continue
				
				#read in the annotation information from mixcr
				seq = fastseq[sequence_var].upper()
				#extract sequence header and the SEQ_ID field from input file 
				[header, id] = GetHeaderInfo(fastseq,header_var)
				
				if needcapture:
					#we need to match this sequence to mixcr program output
					if iffile.IFclass.eof:
						mixcr_data = None
					else:
						mixcr_data = iffile.IFclass.read()
						#print percent status completed
						looper.next()
				
				#check whether mixcr data matches the current sequence 
				strand=''
				if mixcr_data:
					if 'Read id' in mixcr_data:
						if int(mixcr_data['Read id']) == seq_num:
							matched_seqs = True
						else:
							matched_seqs=False							
					elif 'Description R1' in mixcr_data:
						if mixcr_data['Description R1'].strip() == fastseq[header_var].strip():
							matched_seqs=True							
						else:
							matched_seqs = False							
					else:
						mixcr_data['Read(s) sequence'] = mixcr_data['Read(s) sequence'].upper()				
						[matched_seqs,strand] = match_sequence(seq,mixcr_data['Read(s) sequence'])					
					mixcr_seq = mixcr_data['Read(s) sequence']
				else:
					mixcr_seq = ''
					matched_seqs=False
					strand=''
					needcapture=True
								
				if not matched_seqs:
					#the current mixcr row does not belong to this input sequence, so this sequence
					#probably yielded no results; keep the same mixcr row for the next input sequence
					needcapture = False
					content['Sequence']=seq
					content['Seqheader']=header
					content['Notes'] = 'Sequence not found in mixcr file;'
					content[idIdentifier] = id
					unfound_seqs.write('\t'.join([content['Seqheader'],content['Sequence'],mixcr_seq])+'\n')														
					content['Command'] = command_string
					content = defaultdict(str,content)
					output_line = [str(content[lab]) for lab in presetlabels]
					f.write('\t'.join(output_line)+'\n')					
					notfound+=1
					seq_num+=1		
					continue
				
				seq_num+=1		
				#in the next iteration of the code, we will need to get a fresh mixcr result
				needcapture=True						
				content = mixcr_data
				content['Notes'] = ''
				content[idIdentifier] = id
				content['Seqheader'] = header
				r_j = ''
				r_v = ''
				chain_v = ''
				
				content['Sequence']=seq
				content['Strand corrected sequence'] = content['Read(s) sequence']				
				[content['Full NT'],content['5_Prime_Annotation'],content['3_Prime_Annotation'],missing_fields]=return_full_nt(content)				
				if missing_fields:
					content['Notes']+='The sequence is missing features between the 5 prime and 3 prime region;'
					content['3_Prime_Annotation']=content['3_Prime_Annotation']+'*'				
					content['Full length'] = 'FALSE'
				else:
					if content['5_Prime_Annotation'] == 'FR1' and content['3_Prime_Annotation'] == 'FR4':						
						content['Full length'] = 'TRUE'
					else:
						content['Full length'] = 'FALSE'
					
				[content['Full AA'],content['Productivity']] =GetFullAA(content,missing_fields)
				if content['AA. seq. CDR3'] and content['AA. seq. CDR3'] in content['Full AA']:
					content['CDR3_Junction_In_Frame']= 'TRUE'
				else:
					content['CDR3_Junction_In_Frame']= 'FALSE'
					
				if content['All V hits']:
					[vgenelist,vscorelist,vlocus,chain_v,r_v]=extractScores(content['All V hits'])
					content['All V hits']=','.join(vgenelist)
					content['All V scores']=','.join(vscorelist)
					content['FirstVgene']=vgenelist[0]
					content['Locus']=vlocus
																	
				else:
					content['All V hits']=''
					content['All V scores']=''
					content['FirstVgene']=''
					content['Locus']=''
					
				if content['All D hits']:
					[dgenelist,dscorelist,dlocus,chain,recomb]=extractScores(content['All D hits'])
					content['All D hits']=','.join(dgenelist)
					content['All D scores']=','.join(dscorelist)
					content['FirstDgene']=dgenelist[0]
				else:
					content['All D hits']=''
					content['All D scores']=''
					content['FirstDgene']=''
					
					
				if content['All J hits']:
					[jgenelist,jscorelist,jlocus,chain,r_j]=extractScores(content['All J hits'])
					content['All J hits']=','.join(jgenelist)
					content['All J scores']=','.join(jscorelist)
					content['FirstJgene']=jgenelist[0]
				else:
					r_j = r_v
					content['All J hits']=''
					content['All J scores']=''
					content['FirstJgene']=''
					
				if content['All C hits']:
					[cgenelist,cscorelist,clocus,chain,recomb]=extractScores(content['All C hits'])
					content['All C hits']=','.join(cgenelist)
					content['All C scores']=','.join(cscorelist)
				else:
					content['All C hits']=''
					content['All C scores']=''
				if r_j == r_v:
					content['Recombination Type'] = r_v
					content['Chain'] = chain_v
				else:
					content['Recombination Type'] = ''
					content['Chain'] = ''
				
				if content['All V alignment']:
					[query_start,query_end,germ_start,germ_end,algn_len,num_mismatch,num_ins,num_del,shm,alignment_string] = ParseAlignment(content['All V alignment'])
					content['VGENE: Query start'] = query_start
					content['VGENE: Query end'] = query_end
					content['VGENE: Germline start'] = germ_start
					content['VGENE: Germline end'] = germ_end
					content['VGENE: Shm.nt'] = num_ins+num_del+num_mismatch
					content['VGENE: Mismatch'] = num_mismatch
					content['VGENE: Insertion'] = num_ins
					content['VGENE: Deletion'] = num_del
					content['VGENE: Alignment'] = alignment_string
					content['VGENE: Shm.per'] = round(100*shm,3)
					content['VGENE: Alignment length'] = algn_len						
					content['AB end'] = query_end
					content['AB start'] = query_start
					
				if content['All J alignment']:
					[query_start,query_end,germ_start,germ_end,algn_len,num_mismatch,num_ins,num_del,shm,alignment_string] = ParseAlignment(content['All J alignment'])
					content['JGENE: Query start'] = query_start
					content['JGENE: Query end'] = query_end
					content['JGENE: Germline start'] = germ_start
					content['JGENE: Germline end'] = germ_end
					content['JGENE: Shm.nt'] = num_ins+num_del+num_mismatch
					content['JGENE: Mismatch'] = num_mismatch
					content['JGENE: Insertion'] = num_ins
					content['JGENE: Deletion'] = num_del
					content['JGENE: Alignment'] = alignment_string
					content['JGENE: Shm.per'] = round(100*shm,3)
					content['JGENE: Alignment length'] = algn_len
					content['AB end'] = query_end
					if 'AB start' not in content:
						content['AB start'] = query_start
								
				content['Orientation'] = guess_strand(content['Full NT'],content['Sequence'])
				content['Command'] = command_string
				content = defaultdict(str,content)
				output_line = [str(content[lab]) for lab in presetlabels]
				f.write('\t'.join(output_line)+'\n')
																				
			except Exception as e:
				errors+=1
				print('There was an error in sequence: '+str(seq_num))
				print('Error: '+str(e))				
				error_file.write('****ERROR FOUND IN SEQUENCE:{0}  ****\n'.format(str(seq_num)))
				error_file.write(useful.print_error_string(e)+'\n')
				error_file.write('MIXCR DATA: \n')
				error_file.write(json.dumps(content,indent=4)+'\n')
				error_file.write('*************END OF ERROR*********\n')
														
					
	iffile.IFclass.close()
	seqfile.IFclass.close()
	error_file.close()
	unfound_seqs.close()
	
	if errors==0:
		os.remove(resultfileloc+'.errorlog.txt')
	
	if notfound==0:
		os.remove(resultfileloc+'.notfound.txt')
	t2 =time.time()
	print('Parsing completed in %s seconds' % str(round(t2-t1,2)))
	return outfile
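
A minimal usage sketch for parseMIXCR follows. The file names, the 'FASTQ' type label, and the MiXCR command dictionary are assumptions for illustration; the type labels actually accepted are defined by the immunogrepFile reader elsewhere in the immunogrep toolkit.

#hypothetical call: pair a tab-delimited MiXCR alignment export with its input reads
annotated_file = parseMIXCR(
	'sample.fastq',                  #the original reads given to mixcr (assumed path)
	'sample.alignments.txt',         #tab-delimited export of the mixcr alignments (assumed path)
	'FASTQ',                         #inputype label understood by immunogrepFile (assumed)
	command_val={'mixcr align': '-s hsa sample.fastq sample.vdjca'})
print(annotated_file)                #=> 'sample.alignments.txt-parsed.annotation'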
Code Example #2
def GenerateResultsSummaryFile(output_file,analysis_requests,input_file_paths,exp_names,aa_file_location,cdr3_file_location,v_gene_file_location,j_gene_file_location,vj_gene_file_location,plots_created,num_sequences,num_results,num_vdj,num_vj,num_cdr3,num_stop_codon,cdr3_length_stats,diversity_measurements):
	
	with open(output_file,'w') as summary:
		summary.write('Data generated on: %s\n' %(datetime.datetime.now()))		
		summary.write("******************************************************************************\n")
		summary.write("******************Summary Report for Descriptive Statistics*******************\n")
		summary.write("******************************************************************************\n\n\n")
		summary.write('The following experiments were used in the analysis:\n')
		summary.write('\tExperiment Name\tFile path\n')
		exp_string = ""
		for num,exp in enumerate(exp_names):
			exp_string += '\t'+exp
			if isinstance(input_file_paths[num],list):
				exp_string+='\t'+input_file_paths[num][0]+'\n'
				for k in range(1,len(input_file_paths[num])):
					exp_string+='\t\t'+input_file_paths[num][k]+'\n'
			else:
				exp_string+='\t'+input_file_paths[num]+'\n'
		summary.write(exp_string+'\n\n')
		summary.write('The following analyses were requested: {0}'.format(','.join(analysis_requests))+'\n\n')
		
		total_unique_aa_count = 0
		if aa_file_location:
			total_unique_aa_count = useful.file_line_count(aa_file_location)-1				
				
		summary.write('General File Info\n')
		summary.write('\t'+'\t'.join(['Experiment name','Number of results found','Number sequences containing antibody sequence','Number sequences with stop codon','Num VDJ','Num VJ','Number sequences with CDR3 amino acid sequence'])+'\n')
		for ind,each_exp in enumerate(exp_names):
			num_total_seqs = num_sequences[ind]			
			res_string = str(num_results[ind])+' ('+str(round(100*float(num_results[ind])/num_total_seqs,3))+'%)'
			stop_string = str(num_stop_codon[ind])+' ('+str(round(100*float(num_stop_codon[ind])/num_total_seqs,3))+'%)'
			summary.write('\t'+'\t'.join([each_exp,str(num_total_seqs),res_string,stop_string,str(num_vdj[ind]),str(num_vj[ind]),str(num_cdr3[ind])])+'\n')
		if len(exp_names)>1:
			num_total_seqs = sum(num_sequences)
			res_string = str(sum(num_results))+' ('+str(round(100*float(sum(num_results))/num_total_seqs,3))+'%)'
			stop_string = str(sum(num_stop_codon))+' ('+str(round(100*float(sum(num_stop_codon))/num_total_seqs,3))+'%)'
			summary.write('\t'+'\t'.join(['Total',str(num_total_seqs),res_string,stop_string,str(sum(num_vdj)),str(sum(num_vj)),str(sum(num_cdr3))])+'\n')
		if total_unique_aa_count:
			summary.write('A total of {0} unique amino acid antibody sequences were found across all files\n'.format(str(total_unique_aa_count)))
		
		if 'cdr3' in analysis_requests:								
			summary.write('\nCDR3 Results Summary\n')			
			summary.write('*The average CDR3 length only considers CDR3 amino acid sequences with more than 2 amino acids\n')
			use_names = list(exp_names)  #copy so that appending 'TOTAL_COUNTS' does not mutate the caller's list
			if len(exp_names)>1:
				use_names.append('TOTAL_COUNTS')
			for rtype in ['VDJ','VJ']:				
				if rtype in cdr3_length_stats or rtype in diversity_measurements:
					summary.write('CDR3 - '+rtype+'\n')			
					summary.write('\t'+'\t'.join(['Experiment name','Average CDR3 length*','Standard deviation CDR3 length','Number unique CDR3 AA','Number unique CDR3 AA identified more than two times','Shannon diversity index','Gini-Simpson diversity index','Normalized Shannon diversity','Normalized Gini-Simpson diversity'])+'\n')
					
					for exp_num,each_exp in enumerate(use_names):															
						if each_exp == 'TOTAL_COUNTS':
							results = ['All experiments']
						else:
							results = [each_exp]																						
						if rtype in cdr3_length_stats and each_exp in cdr3_length_stats[rtype]['mean']:
							results.extend([str(cdr3_length_stats[rtype]['mean'][each_exp]),str(cdr3_length_stats[rtype]['std'][each_exp])])
						else:
							results.extend(['',''])
						#each diversity metric may be missing for an experiment, so fall back to an empty column
						metric_paths = [('unique_cdr3s',),('num_above_2',),
										('shannon_entropy','index'),('ginni_simpsons','index'),
										('shannon_entropy','true_diversity'),('ginni_simpsons','true_diversity')]
						for path in metric_paths:
							try:
								value = diversity_measurements[rtype]
								for key in path:
									value = value[key]
								results.append(str(value[each_exp]))
							except Exception:
								results.append('')
						summary.write('\t'+'\t'.join(results)+'\n')																																		
				summary.write('\n')
		
		summary.write('Result files created from program\n')
		if aa_file_location:
			summary.write('\tA file containing the unique antibody amino acid sequences and counts can be found in the following location:\n\t\t{0}\n'.format(aa_file_location)) 			
		if cdr3_file_location:
			summary.write('\tA file containing a list of unique cdr3 amino acid sequences and counts can be found in the following location:\n\t\t{0}\n'.format(cdr3_file_location)) 
		if v_gene_file_location:
			summary.write('\tA file summarizing VGENE usage can be found in the following location:\n\t\t{0}\n'.format(v_gene_file_location))
		if j_gene_file_location:
			summary.write('\tA file summarizing JGENE usage can be found in the following location:\n\t\t{0}\n'.format(j_gene_file_location))
		if vj_gene_file_location:
			summary.write('\tA file summarizing V and JGENE usage can be found in the following location:\n\t\t{0}\n'.format(vj_gene_file_location))
		if len(plots_created)>0:
			summary.write('\tThe following figures in both PNG and SVG format were generated during the analysis:\n')
			for fig in plots_created:
				summary.write('\t\t'+fig+'\n')		
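
GenerateResultsSummaryFile is normally called from Descriptive_Statistics (Code Example #4). The sketch below shows a hand-rolled, single-experiment call; every value is a placeholder chosen to illustrate the expected shape of each argument (per-experiment count lists, nested stats dictionaries), and it assumes the referenced result files exist so that useful.file_line_count can read them.

GenerateResultsSummaryFile(
	'run1.summary_of_stats.txt',                       #output_file
	['ab_aa', 'cdr3', 'vgene'],                        #analysis_requests
	['run1.annotation'],                               #input_file_paths (str or list per experiment)
	['Experiment 1'],                                  #exp_names
	'run1.unique_aa_file.txt',                         #aa_file_location (falsy if not generated)
	'run1.unique_cdr3_counts.txt',                     #cdr3_file_location
	'run1.vgenes.txt', None, None,                     #v-, j-, and vj-gene usage files
	['run1.cdr3_length_histogram.png'],                #plots_created
	[50000], [42000], [30000], [12000], [41000], [1500],  #num_sequences..num_stop_codon
	{'VDJ': {'mean': {'Experiment 1': 15.2}, 'std': {'Experiment 1': 2.1}}},  #cdr3_length_stats
	{'VDJ': {'unique_cdr3s': {'Experiment 1': 38000}}})   #diversity_measurements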
Code Example #3 (a reformatted variant of Example #1 that additionally accepts a quality_var parameter)
def parseMIXCR(originalfileloc,
               resultfileloc,
               inputype,
               outfile=None,
               header_var='document_header',
               sequence_var='sequence',
               quality_var='phred',
               command_val={}):

    command_string = json.dumps(command_val) if command_val else json.dumps(
        {'MIXCR V1.3': 'Unknown settings'})
    if not outfile:
        outfile = "%s-parsed.annotation" % resultfileloc

    print('Parsing mixcr file')
    number_of_annotation_lines = useful.file_line_count(resultfileloc)

    seqfile = immunogrepFile(
        originalfileloc, inputype
    )  #the original file used as an input file for mixcr annotation
    iffile = immunogrepFile(
        resultfileloc, 'TAB',
        None)  #,"\t",True,"r") #the mixcr generated alignment file

    parent_folder = '/'.join(resultfileloc.split('/')[:-1]) + '/'
    error_file = open(resultfileloc + '.errorlog.txt', 'w')
    unfound_seqs = open(resultfileloc + '.notfound.txt', 'w')
    notfound = 0
    seq_num = 0
    errors = 0
    needcapture = True

    looper = useful.LoopStatusGen(number_of_annotation_lines, 10)
    t1 = time.time()
    with open(outfile, "w") as f:
        f.write(
            descriptor_symbol + json.dumps(DatabaseTranslator()) + '\n'
        )  #write a translator line to this file so that we know how to add results to database
        f.write('\t'.join(presetlabels) + '\n')

        #read each input sequence/file
        for fastseq in seqfile.read():
            try:
                content = {}
                if not fastseq:
                    continue

                #read in the annotation information from mixcr
                seq = fastseq[sequence_var].upper()
                #extract sequence header and the SEQ_ID field from input file
                [header, id] = GetHeaderInfo(fastseq, header_var)

                if needcapture:
                    #we need to match this sequence to mixcr program output
                    if iffile.IFclass.eof:
                        mixcr_data = None
                    else:
                        mixcr_data = iffile.IFclass.read()
                        #print percent status completed
                        looper.next()

                #check whether mixcr data matches the current sequence
                strand = ''
                if mixcr_data:
                    if 'Read id' in mixcr_data:
                        if int(mixcr_data['Read id']) == seq_num:
                            matched_seqs = True
                        else:
                            matched_seqs = False
                    elif 'Description R1' in mixcr_data:
                        if mixcr_data['Description R1'].strip(
                        ) == fastseq[header_var].strip():
                            matched_seqs = True
                        else:
                            matched_seqs = False
                    else:
                        mixcr_data['Read(s) sequence'] = mixcr_data[
                            'Read(s) sequence'].upper()
                        [matched_seqs, strand
                         ] = match_sequence(seq,
                                            mixcr_data['Read(s) sequence'])
                    mixcr_seq = mixcr_data['Read(s) sequence']
                else:
                    mixcr_seq = ''
                    matched_seqs = False
                    strand = ''
                    needcapture = True

                if not matched_seqs:
                    #the current mixcr row does not belong to this input sequence, so this sequence
                    #probably yielded no results; keep the same mixcr row for the next input sequence
                    needcapture = False
                    content['Sequence'] = seq
                    content['Seqheader'] = header
                    if quality_var in fastseq:
                        content['Read(s) sequence qualities'] = fastseq[
                            quality_var]
                    content['Notes'] = 'Sequence not found in mixcr file;'
                    content[idIdentifier] = id
                    unfound_seqs.write('\t'.join([
                        content['Seqheader'], content['Sequence'], mixcr_seq
                    ]) + '\n')
                    content['Command'] = command_string
                    content = defaultdict(str, content)
                    output_line = [str(content[lab]) for lab in presetlabels]
                    f.write('\t'.join(output_line) + '\n')
                    notfound += 1
                    seq_num += 1
                    continue

                seq_num += 1
                #in the next iteration of the code, we will need to get a fresh mixcr result
                needcapture = True
                content = mixcr_data
                content['Notes'] = ''
                content[idIdentifier] = id
                content['Seqheader'] = header
                r_j = ''
                r_v = ''
                chain_v = ''

                content['Sequence'] = seq
                content['Strand corrected sequence'] = content[
                    'Read(s) sequence']
                [
                    content['Full NT'], content['5_Prime_Annotation'],
                    content['3_Prime_Annotation'], missing_fields
                ] = return_full_nt(content)
                if missing_fields:
                    content[
                        'Notes'] += 'The sequence is missing features between the 5 prime and 3 prime region;'
                    content['3_Prime_Annotation'] = content[
                        '3_Prime_Annotation'] + '*'
                    content['Full length'] = 'FALSE'
                else:
                    if content['5_Prime_Annotation'] == 'FR1' and content[
                            '3_Prime_Annotation'] == 'FR4':
                        content['Full length'] = 'TRUE'
                    else:
                        content['Full length'] = 'FALSE'

                [content['Full AA'],
                 content['Productivity']] = GetFullAA(content, missing_fields)
                if content['AA. Seq. CDR3'] and content[
                        'AA. Seq. CDR3'] in content['Full AA']:
                    content['CDR3_Junction_In_Frame'] = 'TRUE'
                else:
                    content['CDR3_Junction_In_Frame'] = 'FALSE'

                if content['All V hits']:
                    [vgenelist, vscorelist, vlocus, chain_v,
                     r_v] = extractScores(content['All V hits'])
                    content['All V hits'] = ','.join(vgenelist)
                    content['All V scores'] = ','.join(vscorelist)
                    content['FirstVgene'] = vgenelist[0]
                    content['Locus'] = vlocus

                else:
                    content['All V hits'] = ''
                    content['All V scores'] = ''
                    content['FirstVgene'] = ''
                    content['Locus'] = ''

                if content['All D hits']:
                    [dgenelist, dscorelist, dlocus, chain,
                     recomb] = extractScores(content['All D hits'])
                    content['All D hits'] = ','.join(dgenelist)
                    content['All D scores'] = ','.join(dscorelist)
                    content['FirstDgene'] = dgenelist[0]
                else:
                    content['All D hits'] = ''
                    content['All D scores'] = ''
                    content['FirstDgene'] = ''

                if content['All J hits']:
                    [jgenelist, jscorelist, jlocus, chain,
                     r_j] = extractScores(content['All J hits'])
                    content['All J hits'] = ','.join(jgenelist)
                    content['All J scores'] = ','.join(jscorelist)
                    content['FirstJgene'] = jgenelist[0]
                else:
                    r_j = r_v
                    content['All J hits'] = ''
                    content['All J scores'] = ''
                    content['FirstJgene'] = ''

                if content['All C hits']:
                    [cgenelist, cscorelist, clocus, chain,
                     recomb] = extractScores(content['All C hits'])
                    content['All C hits'] = ','.join(cgenelist)
                    content['All C scores'] = ','.join(cscorelist)
                else:
                    content['All C hits'] = ''
                    content['All C scores'] = ''
                if r_j == r_v:
                    content['Recombination Type'] = r_v
                    content['Chain'] = chain_v
                else:
                    content['Recombination Type'] = ''
                    content['Chain'] = ''

                if content['All V alignments']:
                    [
                        query_start, query_end, germ_start, germ_end, algn_len,
                        num_mismatch, num_ins, num_del, shm, alignment_string
                    ] = ParseAlignment(content['All V alignments'])
                    content['VGENE: Query start'] = query_start
                    content['VGENE: Query end'] = query_end
                    content['VGENE: Germline start'] = germ_start
                    content['VGENE: Germline end'] = germ_end
                    content['VGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                    content['VGENE: Mismatch'] = num_mismatch
                    content['VGENE: Insertion'] = num_ins
                    content['VGENE: Deletion'] = num_del
                    content['VGENE: Alignment'] = alignment_string
                    content['VGENE: Shm.per'] = round(100 * shm, 3)
                    content['VGENE: Alignment length'] = algn_len
                    content['AB end'] = query_end
                    content['AB start'] = query_start

                if content['All J alignments']:
                    [
                        query_start, query_end, germ_start, germ_end, algn_len,
                        num_mismatch, num_ins, num_del, shm, alignment_string
                    ] = ParseAlignment(content['All J alignments'])
                    content['JGENE: Query start'] = query_start
                    content['JGENE: Query end'] = query_end
                    content['JGENE: Germline start'] = germ_start
                    content['JGENE: Germline end'] = germ_end
                    content['JGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                    content['JGENE: Mismatch'] = num_mismatch
                    content['JGENE: Insertion'] = num_ins
                    content['JGENE: Deletion'] = num_del
                    content['JGENE: Alignment'] = alignment_string
                    content['JGENE: Shm.per'] = round(100 * shm, 3)
                    content['JGENE: Alignment length'] = algn_len
                    content['AB end'] = query_end
                    if 'AB start' not in content:
                        content['AB start'] = query_start

                content['Orientation'] = guess_strand(content['Full NT'],
                                                      content['Sequence'])
                content['Command'] = command_string
                content = defaultdict(str, content)
                output_line = [str(content[lab]) for lab in presetlabels]
                f.write('\t'.join(output_line) + '\n')

            except Exception as e:
                errors += 1
                print('There was an error in sequence: ' + str(seq_num))
                print('Error: ' + str(e))
                error_file.write(
                    '****ERROR FOUND IN SEQUENCE:{0}  ****\n'.format(
                        str(seq_num)))
                error_file.write(useful.print_error_string(e) + '\n')
                error_file.write('MIXCR DATA: \n')
                error_file.write(json.dumps(content, indent=4) + '\n')
                error_file.write('*************END OF ERROR*********\n')

    iffile.IFclass.close()
    seqfile.IFclass.close()
    error_file.close()
    unfound_seqs.close()
    if errors == 0:
        os.remove(resultfileloc + '.errorlog.txt')
    if notfound == 0:
        os.remove(resultfileloc + '.notfound.txt')
    t2 = time.time()
    print('Parsing completed in %s seconds' % str(round(t2 - t1, 2)))
    return outfile
Code Example #4
def Descriptive_Statistics(list_of_files,input_file_type,analysis_name='',exp_names = [],output_file_prefix='',fields={},statistics_to_run=['ab_aa','cdr3','vgene','jgene','vjgene']):
	analysis_name = analysis_name.upper()
	if input_file_type=='IMGT' and not isinstance(list_of_files[0],list):
		list_of_files = [list_of_files]
	elif not isinstance(list_of_files,list):
		list_of_files = [list_of_files]
		
	if len(exp_names)!=len(list_of_files):
		exp_names = []
	
	#by default, save results to the same folder as the input file
	if not output_file_prefix:
		#IMGT inputs are lists of files, so base the prefix on the first file of the first experiment
		first_file = list_of_files[0][0] if isinstance(list_of_files[0],list) else list_of_files[0]
		output_file_prefix = useful.removeFileExtension(first_file)
	
	supported_analyses = fields_for_analysis.keys()
	if (not analysis_name or analysis_name=='CUSTOM' or analysis_name not in supported_analyses) and not fields:
		raise Exception('The required fields for the provided analysis, {0}, are not currently automated. Please explicitly provide the field names'.format(str(analysis_name)))
	
	#first we use the default fields defined here
	if analysis_name in supported_analyses:
		fields_to_use = copy.deepcopy(fields_for_analysis[analysis_name])
	else:
		fields_to_use = {}
	#next we add in user defined fields just in case there are any changes/mistakes
	for f,name in fields.iteritems():
		fields_to_use[f] = name 
	
	
	
	filenames_to_use = [f[0] if isinstance(f,list) else f for f in list_of_files]
	print('Performing descriptive statistics at {0}.'.format(str(datetime.datetime.now())))
	print('Analyzing the following files:\n\t {0}'.format('\n\t'.join(filenames_to_use)))
	unique_aa_file = None 
	unique_cdr3_file = None 	
	v_gene_analysis = None
	j_gene_analysis = None
	vj_gene_analysis = None
	gene_analysis_plot = output_file_prefix
	plots_created = []
	gene_summary_file = output_file_prefix+'.summary_of_stats.txt'
	
	
	output_file_names = {}
	
	aa_files = ['AB AA SEQUENCE','RECOMBINATION_TYPE','LOCUS','CDR1','CDR2','CDR3','STOP CODONS','PRODUCTIVE','VGENES','DGENES','JGENES','TOTAL COUNTS']
	fields_order = ['full_len_ab','recomb','locus','cdr1','cdr2','cdr3','stopc','functionality','vgene','dgene','jgene']
	num_exp= len(list_of_files)
	if not exp_names:
		exp_names = []
		for file in list_of_files:
			count = 1
			#IMGT experiments are lists of files, so name those after their first file
			first_file = file[0] if isinstance(file,list) else file
			str_file = os.path.basename(first_file)
			while True:
				if str_file in exp_names:
					str_file = os.path.basename(first_file)+'_'+str(count)
					count+=1
				else:
					exp_names.append(str_file)
					break
		
	if 'ab_aa' in statistics_to_run:
		intermediate_file =  output_file_prefix+'.unique_aa_file_temp'
		#first we will use a temp/intermediate file
		output_file_names['ab_aa'] = open(intermediate_file,'w')
		#output_file_names['ab_aa'].write('\t'.join(aa_files)+'\n')
	
	
	cdr3analysis = 'cdr3' in statistics_to_run
	aaanalysis = 'ab_aa' in statistics_to_run
	
	vjgene_dict=defaultdict(lambda:defaultdictgenes(num_exp))
	
	cdr3_dict_vdj = defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_vj = defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_unk = defaultdict(lambda:defaultdictcdr3(num_exp))	
	
	use_these_fields = fields_to_use.values()
	fields_to_use['stopc'] = 'stopc'
	num_results = [0]*(num_exp)
	num_cdr3 = [0]*(num_exp)
	num_stop_codon = [0]*(num_exp)
	num_vdj = [0]*(num_exp)
	num_vj = [0]*(num_exp)
	num_sequences = [0]*(num_exp)
	
	
	if not fields_to_use.get('recomb'):
		#the user may never have defined a field for the recombination type; in that case we will
		#have to guess it using the recomb_call variable at the top of the script
		recomb_not_defined = True
		fields_to_use['recomb'] = 'recomb'
	else:
		recomb_not_defined = False
	
	
	print('Reading through sequences in file(s)')
	seqnum=1
	#go through all of the files and report the relevant fields
	#if we are creating a unique amino acid file, then report these fields to a temp file
	for fnum,each_file in enumerate(list_of_files):				
		annotated_file = readfile.immunogrepFile(each_file,input_file_type,field_names = use_these_fields)
		#loop through each file 
		for seq_lines in annotated_file.read():						
			if not seq_lines:
				continue
			if seqnum%500000==0:
				print('Read {0} sequences'.format(str(seqnum)))
			seqnum+=1
			num_sequences[fnum]+=1			
			seq_lines = defaultdict(str,seq_lines)
			if seq_lines[fields_to_use['full_len_ab']]:
				#a full length antibody sequence was found
				num_results[fnum]+=1
												
			#only select the first gene in the list; also remove the allele suffix ('*...')
			seq_lines[fields_to_use['vgene']] = seq_lines[fields_to_use['vgene']].split(',')[0].split('*')[0]
			seq_lines[fields_to_use['dgene']] = seq_lines[fields_to_use['dgene']].split(',')[0].split('*')[0]
						
			#IF NO RECOMBINATION TYPE IS FOUND or provided, THEN guess it using the vgene or jgene call
			if recomb_not_defined or not seq_lines[fields_to_use['recomb']]:
				r = '' #not sure what the recombination type is yet
				#try to guess the recombination type
				if seq_lines[fields_to_use['vgene']]:
					#use vgene if present
					#look at the first three characters in vgene to predict the recombination type
					gn = ProcessGene(seq_lines[fields_to_use['vgene']])
					if gn[:3] in recomb_call:
						r = recomb_call[gn[:3]]
					elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST reports TA rather than TRA, for example)
						r = recomb_call[gn[:2]]
				if not r and seq_lines[fields_to_use['jgene']]:
					#still no recombination type found, so use jgene
					gn = ProcessGene(seq_lines[fields_to_use['jgene']])
					if gn[:3] in recomb_call:
						r = recomb_call[gn[:3]]
					elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST reports TA rather than TRA, for example)
						r = recomb_call[gn[:2]]
				
				#update recomb result 
				seq_lines[fields_to_use['recomb']] = r								
			
			if not seq_lines[fields_to_use['recomb']]:
				continue
								
			if seq_lines[fields_to_use['recomb']] == 'VDJ':
				num_vdj[fnum]+=1								
			elif seq_lines[fields_to_use['recomb']] == 'VJ':				
				num_vj[fnum]+=1
				
			
			seq_lines[fields_to_use['jgene']] = seq_lines[fields_to_use['jgene']].split(',')[0].split('*')[0]
			seq_lines['stopc'] = 'YES' if '*' in seq_lines[fields_to_use['full_len_ab']] else 'NO'			
			if seq_lines['stopc'] == 'YES':
				num_stop_codon[fnum]+=1
			if aaanalysis:
				exp_str = str(fnum+1)
				#make an intermediate file where we only put the fields we want, in the proper order, from any file
				#we will use this file for sorting afterwards
				#also output exp_num to account for which sequence came from which experiment
				output_file_names['ab_aa'].write('\t'.join([seq_lines[fields_to_use[f]] for f in fields_order])+'\t'+str(exp_str)+'\n')
			if seq_lines[fields_to_use['vgene']] or seq_lines[fields_to_use['jgene']]:							
				key_v =delim.join([seq_lines[fields_to_use['vgene']],seq_lines[fields_to_use['jgene']],seq_lines[fields_to_use['recomb']]])
				vjgene_dict[key_v][fnum]+=1
			
			if not seq_lines[fields_to_use['cdr3']]:
				#no cdr3 found 	
				continue
			
			#add unique cdr3_recomb and vjgene info to dictionaries
			num_cdr3[fnum]+=1
			
			if cdr3analysis:
				key = seq_lines[fields_to_use['cdr3']]
				if seq_lines[fields_to_use['recomb']]=='VDJ':
					cdr3_dict_vdj[key][fnum]+=1
				elif seq_lines[fields_to_use['recomb']]=='VJ':
					cdr3_dict_vj[key][fnum]+=1					
				else:
					print('unknown recombination types: ',seq_lines[fields_to_use['recomb']])
					cdr3_dict_unk[key][fnum]+=1 
									  
				
					
	if aaanalysis:
		output_file_names['ab_aa'].close()
		print('Generating a file of unique AB amino acid sequences')
		unique_aa_file = output_file_prefix+'.unique_aa_file.txt'
		#Use some bash to make a unique amino acid file using sorting and then some awk 
		GenerateAAFile(intermediate_file,unique_aa_file,aa_files,exp_names)
		#number of amino acid sequences observed
		if not os.path.isfile(unique_aa_file):
			num_unique_aa = 0 
		else:
			num_unique_aa = useful.file_line_count(unique_aa_file)-1 #-1 => remove header row count
	
	#Now have some fun with pandas 	
	if set(['vgene','jgene','vjgene']) & set(statistics_to_run):
		#vjgene_dict format = {
		#	key = vgene + delim + jgene + delim + recombtype
		#	value = [count, count, ...] => a list of counts for the presence of that key in EACH provided file/experiment. Length of list = number of experiments
		#}
		gene_df = pd.DataFrame(vjgene_dict).transpose()
		if 'VGENE' not in gene_df.columns:
			gene_df['VGENE'] = ''
		if 'JGENE' not in gene_df.columns:
			gene_df['JGENE'] = ''
		if 'recomb' not in gene_df.columns:
			gene_df['recomb'] = ''
		gene_df['TOTAL_COUNTS'] = gene_df.sum(axis=1)		
		gene_df = gene_df.reset_index()				
		gene_df = gene_df.apply(ModifyPDTable,axis=1,args=(['VGENE','JGENE','recomb'],delim))
		
		
		new_names = {}
		for f,v in enumerate(exp_names):
			new_names[f]=v
			#key = experiment index number
			#value = new name

		#rename the columns 0,1,...num experiments to match the experiment names 
		gene_df = gene_df.rename(columns=new_names)
		
		#format of gene_df:
			#index => no index set, just use default numbers
			#columns => start with column for each experiment, then add the following columns: VGENE, JGENE, recomb, TOTAL_COUNTS

		if 'vgene' in statistics_to_run:
			print('Performing V gene analysis')
			
			v_gene_analysis = output_file_prefix+'.vgenes.txt'
			#group elements by VH GENE CALLS and VL gene calls 
			sorted_v_counts = gene_df.groupby(['recomb','VGENE']).sum()
			
			#find out which level in the multilevel index corresponds to 'VGENE' => from the grouping above, it should be level 1 ('recomb' should be level 0)
			vgene_level = sorted_v_counts.index.names.index('VGENE')			
			
			#remove results where VGENE is empty
			if '' in list(sorted_v_counts.index.levels[vgene_level]):
				sorted_v_counts = sorted_v_counts.drop('',level='VGENE')			
			
			ignore_counts = ['TOTAL_COUNTS','JGENE']
			keep_col = [n for n in sorted_v_counts.columns if n not in ignore_counts]
			g = sorted_v_counts[keep_col]			
			
			#NOW PLOT the FREQUENCY for every experiment
			if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]):
				
				vdj_g = g.xs('VDJ',level='recomb')
				
				PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.vgenes','VH Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)
				
				plots_created.append(gene_analysis_plot+'.vdj.vgenes.png') #.png extension is added in the function plotgenedist
				
			if 'VJ' in list(g.index.levels[g.index.names.index('recomb')]):
				
				vj_g = g.xs('VJ',level='recomb')
				PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.vgenes','VL Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)			
				plots_created.append(gene_analysis_plot+'.vj.vgenes.png') #.png extension is added in the function plotgenedist
			sorted_v_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(v_gene_analysis,sep='\t',index=False)			
			
		
		#do the same as above, except for J genes this time 
		if 'jgene' in statistics_to_run:
			print('Performing J gene analysis')
			j_gene_analysis = output_file_prefix+'.jgenes.txt'
			sorted_j_counts = gene_df.groupby(['recomb','JGENE']).sum()
			jgene_level = sorted_j_counts.index.names.index('JGENE')			
			if '' in list(sorted_j_counts.index.levels[jgene_level]):
				sorted_j_counts.drop('',level='JGENE',inplace=True)			
			ignore_counts = ['TOTAL_COUNTS','VGENE']
			keep_col = [n for n in sorted_j_counts.columns if n not in ignore_counts]
			g = sorted_j_counts[keep_col]			
			sorted_j_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(j_gene_analysis,sep='\t',index=False)
			
			#NOW CALCULATE FREQUENCY for every experiment
			if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]):
				vdj_g = g.xs('VDJ',level='recomb')
				PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.jgenes','JH Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
				plots_created.append(gene_analysis_plot+'.vdj.jgenes.png') #.png extension is added in the function plotgenedist
			if 'VJ' in list(g.index.levels[g.index.names.index('recomb')]):
				vj_g = g.xs('VJ',level='recomb')			
				PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.jgenes','JL Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
				plots_created.append(gene_analysis_plot+'.vj.jgenes.png') #.png extension is added in the function plotgenedist		
		
		#now perform a V-J gene analysis (heat map) for each experiment 
		if 'vjgene' in statistics_to_run:
			print('Performing V-J gene analysis')
			vj_gene_analysis = output_file_prefix+'.v_and_jgene_analysis.txt'
			#group the dataframe by recombination type, vgene, and jgene
			#first rename all V and J genes that are empty as ' No call'
			#then group H/L results by V and J genes and take the sum of each column in the group
			vj_df =  gene_df.replace([''],[' No call']).groupby(['recomb','VGENE','JGENE']).sum()
			vj_df.to_csv(vj_gene_analysis,sep='\t')			
			
			#remove TOTAL_COUNTS			
			vj_df.drop('TOTAL_COUNTS', axis=1, inplace=True)
			
			#calculate frequency for each recomb type 
			if 'VDJ' in list(vj_df.index.levels[vj_df.index.names.index('recomb')]):
				v1 =  vj_df.loc['VDJ',:]/vj_df.loc['VDJ',:].sum()
				PlotVJGeneHeatMap(v1,gene_analysis_plot+'.vdj.v_and_jgene_analysis',max_val=None,min_val=None)
				plots_created.append(gene_analysis_plot+'.vdj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist
			if 'VJ' in list(vj_df.index.levels[vj_df.index.names.index('recomb')]):
				v2 =  vj_df.loc['VJ',:]/vj_df.loc['VJ',:].sum()
				PlotVJGeneHeatMap(v2,gene_analysis_plot+'.vj.v_and_jgene_analysis',max_val=None,min_val=None)																							
				plots_created.append(gene_analysis_plot+'.vj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist
			del vj_df
		del gene_df
		
	#let's do some cdr3 analysis
	cdr3_length_stats = {}
	diversity_measurements = {}
	if cdr3analysis:	
		unique_cdr3_file = output_file_prefix+'.unique_cdr3_counts.txt' 
		print('Performing CDR3 analysis')
		if sum(num_cdr3)>0:
			#again create a pandas dataframe but this time using the unique cdr3 calls 
			print('Loading CDR3s into a dataframe')
			cdr3_df_list = [pd.DataFrame.from_dict(c,orient='index') for c in [cdr3_dict_vdj,cdr3_dict_vj,cdr3_dict_unk]]
			#merge all dataframes together
			keys=['VDJ','VJ','UNK']
			cdr3_df = pd.concat(cdr3_df_list,keys=keys)
			cdr3_df['TOTAL_COUNTS'] = cdr3_df.sum(axis=1)
			print('Dataframe created')
			
			cdr3_df.index.names = ['recomb','CDR3']
			cdr3_df = cdr3_df.reset_index()				
			
			new_names = {}
			#compute the length of each unique CDR3
			cdr3_df['CDR3_LENGTH'] = cdr3_df.CDR3.map(len)
			for f,v in enumerate(exp_names):
				new_names[f]=v
			#rename the columns to match the experiment names 
			
			cdr3_df = cdr3_df.rename(columns=new_names)
			cdr3_df.sort(['recomb','TOTAL_COUNTS'],ascending=[1,0],inplace=True)
			cdr3_df.set_index(['recomb','CDR3'],inplace=True)					
			
			#save dataframe as a tab-delimited file
			cdr3_df.to_csv(unique_cdr3_file,sep='\t')									
			
			cdr3_length_stats = PlotCDR3Histogram(cdr3_df,gene_analysis_plot+'.cdr3_length_histogram')
			plots_created.append(gene_analysis_plot+'.cdr3_length_histogram.png')
			
			diversity_measurements = CalculateDiversities(cdr3_df,gene_analysis_plot+'.cdr3_diversity_plots')
			plots_created.append(gene_analysis_plot+'.cdr3_diversity_plots.png')			
			del cdr3_df
	
	print('Writing summary to file')
	#finally make a results text file that summarizes all the information	
	GenerateResultsSummaryFile(gene_summary_file,statistics_to_run,list_of_files,exp_names,unique_aa_file,unique_cdr3_file,v_gene_analysis,j_gene_analysis,vj_gene_analysis,plots_created,num_sequences,num_results,num_vdj,num_vj,num_cdr3,num_stop_codon,cdr3_length_stats,diversity_measurements)	
	
	files_generated = [gene_summary_file]
	if unique_aa_file:
		files_generated.append(unique_aa_file)
	if unique_cdr3_file:
		files_generated.append(unique_cdr3_file)
	if v_gene_analysis:
		files_generated.append(v_gene_analysis)
	if j_gene_analysis:
		files_generated.append(j_gene_analysis)
	if vj_gene_analysis:
		files_generated.append(vj_gene_analysis)
	
	print('Descriptive statistics completed at {0}.'.format(str(datetime.datetime.now())))
	
	gc.collect()

	
	return {'files':files_generated,'figures':plots_created}
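
A sketch of a typical call to Descriptive_Statistics. The 'MIXCR' analysis label assumes a matching key in the module-level fields_for_analysis dictionary, 'TAB' assumes a type label accepted by readfile.immunogrepFile, and the annotation file names are placeholders.

results = Descriptive_Statistics(
	['donor1.annotation', 'donor2.annotation'],
	input_file_type='TAB',                        #assumed immunogrepFile type label
	analysis_name='MIXCR',                        #assumed key of fields_for_analysis
	exp_names=['Donor 1', 'Donor 2'],
	output_file_prefix='donor_comparison',
	statistics_to_run=['ab_aa', 'cdr3', 'vgene', 'jgene', 'vjgene'])
print(results['files'])                           #summary, unique AA/CDR3, and gene usage files
print(results['figures'])                         #PNG plots created during the analysis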
Code Example #5
def run_flash(r1file, r2file, working_directory, outfile='', parameters={}, suffix=''):
	r1_path = useful.get_parent_dir(r1file)  # '/'.join(r1file.split('/')[:-1])	
	r2_path = useful.get_parent_dir(r2file)  # '/'.join(r2file.split('/')[:-1])
	
	if not parameters:
		print "PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400"
		parameters = {'r': 300, 'f': 400}
	
	if r1file.endswith('.gz'):
		print "Unzipping R1 File.."				
		r1file = useful.gunzip_python(r1file)
	
	if r2file.endswith('.gz'):		
		print "Unzipping R2 File.."
		r2file = useful.gunzip_python(r2file)
		
	working_directory = os.path.abspath(working_directory)
	if r1_path != working_directory:
		os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))		
	if r2_path != working_directory:	
		os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))
		
	if outfile == '':		
		outfile = os.path.basename(r1file).split('.')					
		for p, subs in enumerate(outfile):
			if '_R1' in subs:
				r_pos = subs.index("_R1")
				outfile[p] = subs[:r_pos]				
				break
			elif '_R2' in subs:
				r_pos = subs.index("_R2")
				outfile[p] = subs[:r_pos]				
				break
		outfile = '.'.join(outfile)
	else:		
		outfile = os.path.basename(outfile)
		
	outfile = outfile.replace('.fastq', '').replace('.fasta', '')
	outfile += '.flashed' + suffix		
			
	if os.path.isfile(os.path.join(working_directory, outfile)):  # in resulting_files:		
		print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))
							
	r1file = os.path.join(working_directory, os.path.basename(r1file))  # working_directory+'/'+os.path.basename(r1file)
	r2file = os.path.join(working_directory, os.path.basename(r2file))  # working_directory+'/'+os.path.basename(r2file)

	flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)
	
	parameters['o'] = outfile
	parameters['d'] = working_directory

	for p, val in parameters.iteritems():
		flash_command += ' -{0} {1}'.format(p, str(val))
	
	flash_command += ' -q'  # run in quiet mode
	# os.system(flash_command)
	worked = subprocess.call(flash_command, shell=True)
	if worked > 0:
		raise Exception('Flash failed')
	os.rename(os.path.join(working_directory, outfile + '.extendedFrags.fastq'), os.path.join(working_directory, outfile))
	
	try:
		read_count_r1_file = useful.file_line_count(r1file)
	except Exception as e:
		read_count_r1_file = 1
		print("Could not get number of lines in read file: " + str(e))
	
	try:
		read_count_flashed_file = useful.file_line_count(os.path.join(working_directory, outfile))
	except Exception as e:
		read_count_flashed_file = 1
		print("Could not get number of lines in outfile read file: " + str(e))
	resulting_counts = (
		os.path.join(working_directory, outfile),
		read_count_flashed_file / 4,
		read_count_r1_file / 4,
		float(100) * (read_count_flashed_file / float(read_count_r1_file))
	)
	
	return resulting_counts
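
A usage sketch for run_flash, assuming the module-level flash_location points at a FLASH binary. The -r (average read length), -f (average fragment length), and -M (maximum overlap) keys are standard FLASH options; the paths are placeholders. Note that run_flash moves (os.rename) the input files into working_directory as a side effect.

merged_path, merged_pairs, input_pairs, pct_merged = run_flash(
	'sample_R1.fastq.gz', 'sample_R2.fastq.gz',
	working_directory='/scratch/flash_run',       #placeholder directory
	parameters={'r': 250, 'f': 450, 'M': 300})
print('%s: merged %d of %d read pairs (%.1f%%)' % (merged_path, merged_pairs, input_pairs, pct_merged))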
Code Example #6
def run_pear(r1file, r2file, working_directory, outfile='', parameters={}, suffix='', num_threads=1, memory='1G'):
	r1_path = useful.get_parent_dir(r1file)
	r2_path = useful.get_parent_dir(r2file)

	if r1file.endswith('.gz'):
		print("Unzipping R1 File..")
		r1file = useful.gunzip_python(r1file)
	
	if r2file.endswith('.gz'):		
		print("Unzipping R2 File..")
		r2file = useful.gunzip_python(r2file)
				
	working_directory = os.path.abspath(working_directory)
	if r1_path != working_directory:
		os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))		
	if r2_path != working_directory:	
		os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))		
		
	if outfile == '':		
		outfile = os.path.basename(r1file).split('.')					
		for p, subs in enumerate(outfile):
			if '_R1' in subs:
				r_pos = subs.index("_R1")
				outfile[p] = subs[:r_pos]				
				break
			elif '_R2' in subs:
				r_pos = subs.index("_R2")
				outfile[p] = subs[:r_pos]				
				break
		outfile = '.'.join(outfile)
	else:		
		outfile = os.path.basename(outfile)
	
	outfile = outfile.replace('.fastq', '').replace('.fasta', '')
	
	outfile = os.path.join(working_directory, outfile)
	if os.path.isfile(os.path.join(working_directory, outfile)):  # in resulting_files:
		print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))
							
	r1file = os.path.join(working_directory, os.path.basename(r1file))
	r2file = os.path.join(working_directory, os.path.basename(r2file))

	pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)
	
	parameters['o'] = outfile
	parameters['y'] = memory
	parameters['j'] = num_threads
	
	for p, val in parameters.iteritems():
		pear_command += ' -{0} {1}'.format(p, str(val))
			
	worked = subprocess.call(pear_command, shell=True)
	
	if worked > 0:
		raise Exception('Error in pear program')
	
	try:
		read_count_r1_file = useful.file_line_count(r1file)
	except Exception as e:
		read_count_r1_file = 1
		print("Could not get number of lines in read file: " + str(e))
	
	try:
		read_count_flashed_file = useful.file_line_count(outfile + '.assembled.fastq')
	except Exception as e:
		read_count_flashed_file = 1
		print("Could not get number of lines in outfile read file: " + str(e))

	resulting_counts = (
		outfile + '.assembled.fastq',
		read_count_flashed_file / 4,
		read_count_r1_file / 4,
		float(100) * (read_count_flashed_file / float(read_count_r1_file))
	)
	
	return resulting_counts
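
The analogous sketch for run_pear, assuming pear_location points at a PEAR binary; -j (threads) and -y (maximum memory) are standard PEAR options, and the paths are placeholders. Like run_flash, this function moves the input files into working_directory.

assembled_path, assembled_pairs, input_pairs, pct_assembled = run_pear(
	'sample_R1.fastq', 'sample_R2.fastq',
	working_directory='/scratch/pear_run',        #placeholder directory
	num_threads=4, memory='2G')
print('%s: assembled %d of %d read pairs (%.1f%%)' % (assembled_path, assembled_pairs, input_pairs, pct_assembled))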
Code Example #7 (a reformatted variant of Example #6)
def run_pear(r1file,
             r2file,
             working_directory,
             outfile='',
             parameters={},
             suffix='',
             num_threads=1,
             memory='1G'):
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)

    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)

    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    if r1_path != working_directory:
        os.rename(r1file,
                  os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file,
                  os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                r_pos = subs.index("_R1")
                outfile[p] = subs[:r_pos]
                break
            elif '_R2' in subs:
                r_pos = subs.index("_R2")
                outfile[p] = subs[:r_pos]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)

    outfile = outfile.replace('.fastq', '').replace('.fasta', '')

    outfile = os.path.join(working_directory, outfile)
    if os.path.isfile(os.path.join(working_directory,
                                   outfile)):  # in resulting_files:
        print(
            'WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'
            .format(working_directory + '/' + outfile))

    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))

    pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)

    parameters['o'] = outfile
    parameters['y'] = memory
    parameters['j'] = num_threads

    for p, val in parameters.iteritems():
        pear_command += ' -{0} {1}'.format(p, str(val))

    worked = subprocess.call(pear_command, shell=True)

    if worked > 0:
        raise Exception('Error in pear program')

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))

    try:
        read_count_flashed_file = useful.file_line_count(outfile +
                                                         '.assembled.fastq')
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))

    resulting_counts = (outfile + '.assembled.fastq',
                        read_count_flashed_file / 4, read_count_r1_file / 4,
                        float(100) *
                        (read_count_flashed_file / float(read_count_r1_file)))

    return resulting_counts
Code Example #8 (a reformatted variant of Example #5)
def run_flash(r1file,
              r2file,
              working_directory,
              outfile='',
              parameters={},
              suffix=''):
    r1_path = useful.get_parent_dir(r1file)  # '/'.join(r1file.split('/')[:-1])
    r2_path = useful.get_parent_dir(r2file)  # '/'.join(r2file.split('/')[:-1])

    if not parameters:
        print "PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400"
        parameters = {'r': 300, 'f': 400}

    if r1file.endswith('.gz'):
        print "Unzipping R1 File.."
        r1file = useful.gunzip_python(r1file)

    if r2file.endswith('.gz'):
        print "Unzipping R2 File.."
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    if r1_path != working_directory:
        os.rename(r1file,
                  os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file,
                  os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        outfile = os.path.basename(r1file).split('.')
        for p, subs in enumerate(outfile):
            if '_R1' in subs:
                r_pos = subs.index("_R1")
                outfile[p] = subs[:r_pos]
                break
            elif '_R2' in subs:
                r_pos = subs.index("_R2")
                outfile[p] = subs[:r_pos]
                break
        outfile = '.'.join(outfile)
    else:
        outfile = os.path.basename(outfile)

    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile += '.flashed' + suffix

    if os.path.isfile(os.path.join(working_directory,
                                   outfile)):  # in resulting_files:
        print(
            'WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'
            .format(working_directory + '/' + outfile))

    r1file = os.path.join(working_directory, os.path.basename(
        r1file))  # working_directory+'/'+os.path.basename(r1file)
    r2file = os.path.join(working_directory, os.path.basename(
        r2file))  # working_directory+'/'+os.path.basename(r2file)

    flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)

    parameters['o'] = outfile
    parameters['d'] = working_directory

    for p, val in parameters.iteritems():
        flash_command += ' -{0} {1}'.format(p, str(val))

    flash_command += ' -q'  # run in quiet mode
    # os.system(flash_command)
    worked = subprocess.call(flash_command, shell=True)
    if worked > 0:
        raise Exception('Flash failed')
    os.rename(
        os.path.join(working_directory, outfile + '.extendedFrags.fastq'),
        os.path.join(working_directory, outfile))

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))

    try:
        read_count_flashed_file = useful.file_line_count(
            os.path.join(working_directory, outfile))
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))
    resulting_counts = (os.path.join(working_directory,
                                     outfile), read_count_flashed_file / 4,
                        read_count_r1_file / 4, float(100) *
                        (read_count_flashed_file / float(read_count_r1_file)))

    return resulting_counts