def parseMIXCR(originalfileloc,resultfileloc,inputype,outfile=None,header_var='document_header',sequence_var='sequence',command_val = {}):
    """Merge MiXCR alignment output with the original input sequences.

    The input sequence file and the MiXCR result file are read in parallel.
    Every non-empty input record produces one tab-delimited row in the output
    file, with columns ordered by ``presetlabels``. Records that have no
    matching MiXCR result are written with the note
    'Sequence not found in mixcr file;' and also logged to
    ``<resultfileloc>.notfound.txt``. Exceptions raised while handling a
    single record are logged to ``<resultfileloc>.errorlog.txt`` and the loop
    continues. Either log file is deleted at the end when nothing was
    written to it.

    NOTE(review): a second ``parseMIXCR`` definition appears later in this
    file; if both live in the same module, that later one replaces this one
    at import time -- confirm which is intended to be live.

    Parameters:
        originalfileloc: path of the file that was given to MiXCR as input.
        resultfileloc: path of the MiXCR-generated alignment file (read as
            a 'TAB' file).
        inputype: file type of ``originalfileloc``, passed to immunogrepFile.
        outfile: output path; defaults to
            ``<resultfileloc>-parsed.annotation``.
        header_var: key of the header field in each input record.
        sequence_var: key of the nucleotide sequence in each input record.
        command_val: settings stored (JSON-encoded) in the 'Command' column.
            It is only read, never mutated, so the shared ``{}`` default is
            harmless here.

    Returns:
        The path of the annotation file that was written.
    """
    command_string = json.dumps(command_val) if command_val else json.dumps({'MIXCR V1.3': 'Unknown settings'})
    if not outfile:
        outfile = "%s-parsed.annotation"%resultfileloc
    error_log_path = resultfileloc+'.errorlog.txt'
    notfound_log_path = resultfileloc+'.notfound.txt'

    print('Parsing mixcr file')
    number_of_annotation_lines = useful.file_line_count(resultfileloc)
    # the original file used as an input file for mixcr annotation
    seqfile = immunogrepFile(originalfileloc,inputype)
    # the mixcr generated alignment file
    iffile = immunogrepFile(resultfileloc,'TAB',None)

    notfound = 0
    seq_num = 0
    errors = 0
    # True when the next input sequence must be compared against a fresh
    # MiXCR record; False when the previously read record is still pending.
    needcapture = True
    mixcr_data = None
    looper = useful.LoopStatusGen(number_of_annotation_lines,10)
    t1 = time.time()

    try:
        # Context managers guarantee the log/output handles are released even
        # when an exception escapes the per-sequence handler below.
        with open(error_log_path,'w') as error_file, \
                open(notfound_log_path,'w') as unfound_seqs, \
                open(outfile,"w") as f:
            # write a translator line to this file so that we know how to add
            # results to database
            f.write(descriptor_symbol+json.dumps(DatabaseTranslator())+'\n')
            f.write('\t'.join(presetlabels)+'\n')

            # read each input sequence/file
            for fastseq in seqfile.read():
                try:
                    content = {}
                    if not fastseq:
                        continue

                    seq = fastseq[sequence_var].upper()
                    # extract sequence header and the SEQ_ID field from input file
                    seq_header, seq_id = GetHeaderInfo(fastseq,header_var)

                    if needcapture:
                        # we need to match this sequence to mixcr program output
                        if iffile.IFclass.eof:
                            mixcr_data = None
                        else:
                            mixcr_data = iffile.IFclass.read()
                        # print percent status completed
                        # NOTE(review): `.next()` is the Python 2 iterator
                        # spelling; confirm LoopStatusGen's interface before
                        # porting.
                        looper.next()

                    # check whether mixcr data matches the current sequence
                    if mixcr_data:
                        if 'Read id' in mixcr_data:
                            matched_seqs = int(mixcr_data['Read id']) == seq_num
                        elif 'Description R1' in mixcr_data:
                            matched_seqs = mixcr_data['Description R1'].strip() == fastseq[header_var].strip()
                        else:
                            mixcr_data['Read(s) sequence'] = mixcr_data['Read(s) sequence'].upper()
                            matched_seqs, _ = match_sequence(seq,mixcr_data['Read(s) sequence'])
                        # Defined on every branch so the not-found log below
                        # can never hit an unbound name or a missing column.
                        mixcr_seq = mixcr_data.get('Read(s) sequence','')
                    else:
                        mixcr_seq = ''
                        matched_seqs = False

                    # Kept as an explicit comparison: match_sequence's return
                    # type is not visible here, and `None == False` is False.
                    if matched_seqs == False:
                        # these results did not match mixcr sequence, so this
                        # sequence probably did not yield any results; keep
                        # the current mixcr record for the next input sequence
                        needcapture = False
                        content['Sequence'] = seq
                        content['Seqheader'] = seq_header
                        content['Notes'] = 'Sequence not found in mixcr file;'
                        content[idIdentifier] = seq_id
                        unfound_seqs.write('\t'.join([content['Seqheader'],content['Sequence'],mixcr_seq])+'\n')
                        content['Command'] = command_string
                        content = defaultdict(str,content)
                        output_line = [str(content[lab]) for lab in presetlabels]
                        f.write('\t'.join(output_line)+'\n')
                        notfound += 1
                        seq_num += 1
                        continue

                    seq_num += 1
                    # in the next iteration we will need a fresh mixcr result
                    needcapture = True

                    content = mixcr_data
                    content['Notes'] = ''
                    content[idIdentifier] = seq_id
                    content['Seqheader'] = seq_header
                    r_j = ''
                    r_v = ''
                    chain_v = ''
                    content['Sequence'] = seq
                    content['Strand corrected sequence'] = content['Read(s) sequence']

                    (content['Full NT'],
                     content['5_Prime_Annotation'],
                     content['3_Prime_Annotation'],
                     missing_fields) = return_full_nt(content)

                    if missing_fields:
                        content['Notes'] += 'The sequence is missing features between the 5 prime and 3 prime region;'
                        content['3_Prime_Annotation'] = content['3_Prime_Annotation']+'*'
                        content['Full length'] = 'FALSE'
                    elif content['5_Prime_Annotation'] == 'FR1' and content['3_Prime_Annotation'] == 'FR4':
                        content['Full length'] = 'TRUE'
                    else:
                        content['Full length'] = 'FALSE'

                    content['Full AA'],content['Productivity'] = GetFullAA(content,missing_fields)

                    if content['AA. seq. CDR3'] and content['AA. seq. CDR3'] in content['Full AA']:
                        content['CDR3_Junction_In_Frame'] = 'TRUE'
                    else:
                        content['CDR3_Junction_In_Frame'] = 'FALSE'

                    if content['All V hits']:
                        vgenelist,vscorelist,vlocus,chain_v,r_v = extractScores(content['All V hits'])
                        content['All V hits'] = ','.join(vgenelist)
                        content['All V scores'] = ','.join(vscorelist)
                        content['FirstVgene'] = vgenelist[0]
                        content['Locus'] = vlocus
                    else:
                        content['All V hits'] = ''
                        content['All V scores'] = ''
                        content['FirstVgene'] = ''
                        content['Locus'] = ''

                    if content['All D hits']:
                        dgenelist,dscorelist,_,_,_ = extractScores(content['All D hits'])
                        content['All D hits'] = ','.join(dgenelist)
                        content['All D scores'] = ','.join(dscorelist)
                        content['FirstDgene'] = dgenelist[0]
                    else:
                        content['All D hits'] = ''
                        content['All D scores'] = ''
                        content['FirstDgene'] = ''

                    if content['All J hits']:
                        jgenelist,jscorelist,_,_,r_j = extractScores(content['All J hits'])
                        content['All J hits'] = ','.join(jgenelist)
                        content['All J scores'] = ','.join(jscorelist)
                        content['FirstJgene'] = jgenelist[0]
                    else:
                        # no J call: fall back on the V call's recombination type
                        r_j = r_v
                        content['All J hits'] = ''
                        content['All J scores'] = ''
                        content['FirstJgene'] = ''

                    if content['All C hits']:
                        cgenelist,cscorelist,_,_,_ = extractScores(content['All C hits'])
                        content['All C hits'] = ','.join(cgenelist)
                        content['All C scores'] = ','.join(cscorelist)
                    else:
                        content['All C hits'] = ''
                        content['All C scores'] = ''

                    # only report a recombination type when V and J agree
                    if r_j == r_v:
                        content['Recombination Type'] = r_v
                        content['Chain'] = chain_v
                    else:
                        content['Recombination Type'] = ''
                        content['Chain'] = ''

                    if content['All V alignment']:
                        (query_start,query_end,germ_start,germ_end,algn_len,
                         num_mismatch,num_ins,num_del,shm,
                         alignment_string) = ParseAlignment(content['All V alignment'])
                        content['VGENE: Query start'] = query_start
                        content['VGENE: Query end'] = query_end
                        content['VGENE: Germline start'] = germ_start
                        content['VGENE: Germline end'] = germ_end
                        content['VGENE: Shm.nt'] = num_ins+num_del+num_mismatch
                        content['VGENE: Mismatch'] = num_mismatch
                        content['VGENE: Insertion'] = num_ins
                        content['VGENE: Deletion'] = num_del
                        content['VGENE: Alignment'] = alignment_string
                        content['VGENE: Shm.per'] = round(100*shm,3)
                        content['VGENE: Alignment length'] = algn_len
                        content['AB end'] = query_end
                        content['AB start'] = query_start

                    if content['All J alignment']:
                        (query_start,query_end,germ_start,germ_end,algn_len,
                         num_mismatch,num_ins,num_del,shm,
                         alignment_string) = ParseAlignment(content['All J alignment'])
                        content['JGENE: Query start'] = query_start
                        content['JGENE: Query end'] = query_end
                        content['JGENE: Germline start'] = germ_start
                        content['JGENE: Germline end'] = germ_end
                        content['JGENE: Shm.nt'] = num_ins+num_del+num_mismatch
                        content['JGENE: Mismatch'] = num_mismatch
                        content['JGENE: Insertion'] = num_ins
                        content['JGENE: Deletion'] = num_del
                        content['JGENE: Alignment'] = alignment_string
                        content['JGENE: Shm.per'] = round(100*shm,3)
                        content['JGENE: Alignment length'] = algn_len
                        # the J alignment end overrides the V alignment end
                        content['AB end'] = query_end
                        if 'AB start' not in content:
                            content['AB start'] = query_start

                    content['Orientation'] = guess_strand(content['Full NT'],content['Sequence'])
                    content['Command'] = command_string
                    # missing columns are written as empty strings
                    content = defaultdict(str,content)
                    output_line = [str(content[lab]) for lab in presetlabels]
                    f.write('\t'.join(output_line)+'\n')
                except Exception as e:
                    # best-effort: log the failure and keep processing
                    errors += 1
                    print('There was an error in sequence: '+str(seq_num))
                    print('Error: '+str(e))
                    error_file.write('****ERROR FOUND IN SEQUENCE:{0} ****\n'.format(str(seq_num)))
                    error_file.write(useful.print_error_string(e)+'\n')
                    error_file.write('MIXCR DATA: \n')
                    error_file.write(json.dumps(content,indent=4)+'\n')
                    error_file.write('*************END OF ERROR*********\n')
    finally:
        iffile.IFclass.close()
        seqfile.IFclass.close()

    # remove log files that ended up empty
    if errors==0:
        os.remove(error_log_path)
    if notfound==0:
        os.remove(notfound_log_path)

    t2 = time.time()
    print(str(t2-t1))
    return outfile
def GenerateResultsSummaryFile(output_file,analysis_requests,input_file_paths,exp_names,aa_file_location,cdr3_file_location,v_gene_file_location,j_gene_file_location,vj_gene_file_location,plots_created,num_sequences,num_results,num_vdj,num_vj,num_cdr3,num_stop_codon,cdr3_length_stats,diversity_measurements):
    """Write a plain-text summary report of a descriptive-statistics run.

    The report lists the experiments and their input files, a per-experiment
    table of sequence counts (plus a 'Total' row when there is more than one
    experiment), optional CDR3 length/diversity tables for the 'VDJ' and 'VJ'
    recombination types, and the locations of the result files and figures.

    Parameters:
        output_file: path of the report to write (overwritten).
        analysis_requests: names of the requested analyses; the CDR3 section
            is written only when 'cdr3' is among them.
        input_file_paths: one entry per experiment, either a single path or a
            list of paths.
        exp_names: experiment names, parallel to ``input_file_paths`` and to
            all ``num_*`` lists. This list is not modified.
        aa_file_location, cdr3_file_location, v_gene_file_location,
        j_gene_file_location, vj_gene_file_location: result-file paths;
            falsy values are omitted from the report.
        plots_created: paths of generated figures.
        num_sequences, num_results, num_vdj, num_vj, num_cdr3,
        num_stop_codon: per-experiment counts.
        cdr3_length_stats: ``{rtype: {'mean': {...}, 'std': {...}}}`` keyed by
            experiment name (and 'TOTAL_COUNTS').
        diversity_measurements: ``{rtype: {...}}`` with 'unique_cdr3s',
            'num_above_2', 'shannon_entropy' and 'ginni_simpsons' entries.
            Any missing value is reported as an empty cell.
    """
    def _count_with_percent(count, total):
        # An experiment with zero sequences is reported as 0.0% rather than
        # aborting the whole report with a ZeroDivisionError.
        percent = round(100*float(count)/total,3) if total else 0.0
        return str(count)+' ('+str(percent)+'%)'

    def _lookup(table, *keys):
        # Follow `keys` through nested containers; '' when any level is
        # missing (or is not a container).
        try:
            for key in keys:
                table = table[key]
        except (LookupError, TypeError):
            return ''
        return str(table)

    def _general_row(label, total, results, stops, vdj, vj, cdr3):
        cells = [label,str(total),_count_with_percent(results,total),_count_with_percent(stops,total),str(vdj),str(vj),str(cdr3)]
        return '\t'+'\t'.join(cells)+'\n'

    with open(output_file,'w') as summary:
        summary.write('Data generated on: %s\n' %(datetime.datetime.now()))
        summary.write("******************************************************************************\n")
        summary.write("******************Summary Report for Descriptive Statistics*******************\n")
        summary.write("******************************************************************************\n\n\n")

        summary.write('The following experiments were used in the analysis:\n')
        summary.write('\tExperiment Name\tFile path\n')
        exp_lines = []
        for num,exp in enumerate(exp_names):
            entry = input_file_paths[num]
            paths = entry if isinstance(entry,list) else [entry]
            first_path = paths[0] if paths else ''
            exp_lines.append('\t'+exp+'\t'+first_path+'\n')
            # Remaining paths of the same experiment go on continuation lines.
            # Iterate the list itself, not the characters of its first path.
            for extra_path in paths[1:]:
                exp_lines.append('\t\t'+extra_path+'\n')
        summary.write(''.join(exp_lines)+'\n\n')

        summary.write('The following analyses were requested: {0}'.format(','.join(analysis_requests))+'\n\n')

        total_unique_aa_count = 0
        if aa_file_location:
            # minus one for the header row
            total_unique_aa_count = useful.file_line_count(aa_file_location)-1

        summary.write('General File Info\n')
        summary.write('\t'+'\t'.join(['Experiment name','Number of results found','Number sequences containing antibody sequence','Number sequences with stop codon','Num VDJ','Num VJ','Number sequences with CDR3 amino acid sequence'])+'\n')
        for ind,each_exp in enumerate(exp_names):
            summary.write(_general_row(each_exp,num_sequences[ind],num_results[ind],num_stop_codon[ind],num_vdj[ind],num_vj[ind],num_cdr3[ind]))
        if len(exp_names)>1:
            summary.write(_general_row('Total',sum(num_sequences),sum(num_results),sum(num_stop_codon),sum(num_vdj),sum(num_vj),sum(num_cdr3)))

        if total_unique_aa_count:
            summary.write('A total of {0} unique amino acid antibody sequences were found across all files\n'.format(str(total_unique_aa_count)))

        if 'cdr3' in analysis_requests:
            summary.write('\nCDR3 Results Summary\n')
            summary.write('*The average CDR3 length only considers CDR3 amino acid sequences with more than 2 amino acids\n')
            # Work on a copy so the caller's exp_names list is left untouched.
            use_names = list(exp_names)
            if len(exp_names)>1:
                use_names.append('TOTAL_COUNTS')
            for rtype in ['VDJ','VJ']:
                if rtype in cdr3_length_stats or rtype in diversity_measurements:
                    summary.write('CDR3 - '+rtype+'\n')
                    summary.write('\t'+'\t'.join(['Experiment name','Average CDR3 length*','Standard deviation CDR3 length','Number unique CDR3 AA','Number unique CDR3 AA identified more than two times','Shannon diversity index', 'Ginni-simpsons diversity index','Normalized shannon diversity','Normalized ginni-simpsons diversity'])+'\n')
                    for each_exp in use_names:
                        label = 'All experiments' if each_exp == 'TOTAL_COUNTS' else each_exp
                        results = [
                            label,
                            _lookup(cdr3_length_stats,rtype,'mean',each_exp),
                            _lookup(cdr3_length_stats,rtype,'std',each_exp),
                            _lookup(diversity_measurements,rtype,'unique_cdr3s',each_exp),
                            _lookup(diversity_measurements,rtype,'num_above_2',each_exp),
                            _lookup(diversity_measurements,rtype,'shannon_entropy','index',each_exp),
                            _lookup(diversity_measurements,rtype,'ginni_simpsons','index',each_exp),
                            _lookup(diversity_measurements,rtype,'shannon_entropy','true_diversity',each_exp),
                            _lookup(diversity_measurements,rtype,'ginni_simpsons','true_diversity',each_exp),
                        ]
                        summary.write('\t'+'\t'.join(results)+'\n')
                    summary.write('\n')

        summary.write('Result files created from program\n')
        if aa_file_location:
            summary.write('\tA file containing the unique antibody amino acid sequences and counts can be found in the following location:\n\t\t{0}\n'.format(aa_file_location))
        if cdr3_file_location:
            summary.write('\tA file containing a list of unique cdr3 amino acid sequences and counts can be found in the following location:\n\t\t{0}\n'.format(cdr3_file_location))
        if v_gene_file_location:
            summary.write('\tA file summarizing VGENE usage can be found in the following location:\n\t\t{0}\n'.format(v_gene_file_location))
        if j_gene_file_location:
            summary.write('\tA file summarizing JGENE usage can be found in the following location:\n\t\t{0}\n'.format(j_gene_file_location))
        if vj_gene_file_location:
            summary.write('\tA file summarizing V and JGENE usage can be found in the following location:\n\t\t{0}\n'.format(vj_gene_file_location))
        if plots_created:
            summary.write('\tThe following figures in both PNG and SVG format were generated during the analysis:\n')
            for fig in plots_created:
                summary.write('\t\t'+fig+'\n')
def parseMIXCR(originalfileloc, resultfileloc, inputype, outfile=None, header_var='document_header', sequence_var='sequence', quality_var='phred', command_val={}):
    """Merge MiXCR alignment output with the original input sequences.

    The input sequence file and the MiXCR result file are read in parallel.
    Every non-empty input record produces one tab-delimited row in the output
    file, with columns ordered by ``presetlabels``. Records that have no
    matching MiXCR result are written with the note
    'Sequence not found in mixcr file;' (carrying over the input quality
    string when present) and also logged to ``<resultfileloc>.notfound.txt``.
    Exceptions raised while handling a single record are logged to
    ``<resultfileloc>.errorlog.txt`` and the loop continues. Either log file
    is deleted at the end when nothing was written to it.

    Parameters:
        originalfileloc: path of the file that was given to MiXCR as input.
        resultfileloc: path of the MiXCR-generated alignment file (read as
            a 'TAB' file).
        inputype: file type of ``originalfileloc``, passed to immunogrepFile.
        outfile: output path; defaults to
            ``<resultfileloc>-parsed.annotation``.
        header_var: key of the header field in each input record.
        sequence_var: key of the nucleotide sequence in each input record.
        quality_var: key of the quality string in each input record; used
            only for records that have no MiXCR result.
        command_val: settings stored (JSON-encoded) in the 'Command' column.
            It is only read, never mutated, so the shared ``{}`` default is
            harmless here.

    Returns:
        The path of the annotation file that was written.
    """
    command_string = json.dumps(command_val) if command_val else json.dumps(
        {'MIXCR V1.3': 'Unknown settings'})
    if not outfile:
        outfile = "%s-parsed.annotation" % resultfileloc
    error_log_path = resultfileloc + '.errorlog.txt'
    notfound_log_path = resultfileloc + '.notfound.txt'

    print('Parsing mixcr file')
    number_of_annotation_lines = useful.file_line_count(resultfileloc)
    # the original file used as an input file for mixcr annotation
    seqfile = immunogrepFile(originalfileloc, inputype)
    # the mixcr generated alignment file
    iffile = immunogrepFile(resultfileloc, 'TAB', None)

    notfound = 0
    seq_num = 0
    errors = 0
    # True when the next input sequence must be compared against a fresh
    # MiXCR record; False when the previously read record is still pending.
    needcapture = True
    mixcr_data = None
    looper = useful.LoopStatusGen(number_of_annotation_lines, 10)
    t1 = time.time()

    try:
        # Context managers guarantee the log/output handles are released even
        # when an exception escapes the per-sequence handler below.
        with open(error_log_path, 'w') as error_file, \
                open(notfound_log_path, 'w') as unfound_seqs, \
                open(outfile, "w") as f:
            # write a translator line to this file so that we know how to add
            # results to database
            f.write(descriptor_symbol + json.dumps(DatabaseTranslator()) + '\n')
            f.write('\t'.join(presetlabels) + '\n')

            # read each input sequence/file
            for fastseq in seqfile.read():
                try:
                    content = {}
                    if not fastseq:
                        continue

                    seq = fastseq[sequence_var].upper()
                    # extract sequence header and the SEQ_ID field from input file
                    seq_header, seq_id = GetHeaderInfo(fastseq, header_var)

                    if needcapture:
                        # we need to match this sequence to mixcr program output
                        if iffile.IFclass.eof:
                            mixcr_data = None
                        else:
                            mixcr_data = iffile.IFclass.read()
                        # print percent status completed
                        # NOTE(review): `.next()` is the Python 2 iterator
                        # spelling; confirm LoopStatusGen's interface before
                        # porting.
                        looper.next()

                    # check whether mixcr data matches the current sequence
                    if mixcr_data:
                        if 'Read id' in mixcr_data:
                            matched_seqs = int(mixcr_data['Read id']) == seq_num
                        elif 'Description R1' in mixcr_data:
                            matched_seqs = (mixcr_data['Description R1'].strip() ==
                                            fastseq[header_var].strip())
                        else:
                            mixcr_data['Read(s) sequence'] = mixcr_data[
                                'Read(s) sequence'].upper()
                            matched_seqs, _ = match_sequence(
                                seq, mixcr_data['Read(s) sequence'])
                        # Defined on every branch so the not-found log below
                        # can never hit an unbound name or a missing column.
                        mixcr_seq = mixcr_data.get('Read(s) sequence', '')
                    else:
                        mixcr_seq = ''
                        matched_seqs = False

                    # Kept as an explicit comparison: match_sequence's return
                    # type is not visible here, and `None == False` is False.
                    if matched_seqs == False:
                        # these results did not match mixcr sequence, so this
                        # sequence probably did not yield any results; keep
                        # the current mixcr record for the next input sequence
                        needcapture = False
                        content['Sequence'] = seq
                        content['Seqheader'] = seq_header
                        if quality_var in fastseq:
                            content['Read(s) sequence qualities'] = fastseq[quality_var]
                        content['Notes'] = 'Sequence not found in mixcr file;'
                        content[idIdentifier] = seq_id
                        unfound_seqs.write('\t'.join([
                            content['Seqheader'], content['Sequence'], mixcr_seq
                        ]) + '\n')
                        content['Command'] = command_string
                        content = defaultdict(str, content)
                        output_line = [str(content[lab]) for lab in presetlabels]
                        f.write('\t'.join(output_line) + '\n')
                        notfound += 1
                        seq_num += 1
                        continue

                    seq_num += 1
                    # in the next iteration we will need a fresh mixcr result
                    needcapture = True

                    content = mixcr_data
                    content['Notes'] = ''
                    content[idIdentifier] = seq_id
                    content['Seqheader'] = seq_header
                    r_j = ''
                    r_v = ''
                    chain_v = ''
                    content['Sequence'] = seq
                    content['Strand corrected sequence'] = content['Read(s) sequence']

                    (content['Full NT'],
                     content['5_Prime_Annotation'],
                     content['3_Prime_Annotation'],
                     missing_fields) = return_full_nt(content)

                    if missing_fields:
                        content['Notes'] += 'The sequence is missing features between the 5 prime and 3 prime region;'
                        content['3_Prime_Annotation'] = content['3_Prime_Annotation'] + '*'
                        content['Full length'] = 'FALSE'
                    elif (content['5_Prime_Annotation'] == 'FR1' and
                          content['3_Prime_Annotation'] == 'FR4'):
                        content['Full length'] = 'TRUE'
                    else:
                        content['Full length'] = 'FALSE'

                    content['Full AA'], content['Productivity'] = GetFullAA(
                        content, missing_fields)

                    if content['AA. Seq. CDR3'] and content['AA. Seq. CDR3'] in content['Full AA']:
                        content['CDR3_Junction_In_Frame'] = 'TRUE'
                    else:
                        content['CDR3_Junction_In_Frame'] = 'FALSE'

                    if content['All V hits']:
                        vgenelist, vscorelist, vlocus, chain_v, r_v = extractScores(
                            content['All V hits'])
                        content['All V hits'] = ','.join(vgenelist)
                        content['All V scores'] = ','.join(vscorelist)
                        content['FirstVgene'] = vgenelist[0]
                        content['Locus'] = vlocus
                    else:
                        content['All V hits'] = ''
                        content['All V scores'] = ''
                        content['FirstVgene'] = ''
                        content['Locus'] = ''

                    if content['All D hits']:
                        dgenelist, dscorelist, _, _, _ = extractScores(
                            content['All D hits'])
                        content['All D hits'] = ','.join(dgenelist)
                        content['All D scores'] = ','.join(dscorelist)
                        content['FirstDgene'] = dgenelist[0]
                    else:
                        content['All D hits'] = ''
                        content['All D scores'] = ''
                        content['FirstDgene'] = ''

                    if content['All J hits']:
                        jgenelist, jscorelist, _, _, r_j = extractScores(
                            content['All J hits'])
                        content['All J hits'] = ','.join(jgenelist)
                        content['All J scores'] = ','.join(jscorelist)
                        content['FirstJgene'] = jgenelist[0]
                    else:
                        # no J call: fall back on the V call's recombination type
                        r_j = r_v
                        content['All J hits'] = ''
                        content['All J scores'] = ''
                        content['FirstJgene'] = ''

                    if content['All C hits']:
                        cgenelist, cscorelist, _, _, _ = extractScores(
                            content['All C hits'])
                        content['All C hits'] = ','.join(cgenelist)
                        content['All C scores'] = ','.join(cscorelist)
                    else:
                        content['All C hits'] = ''
                        content['All C scores'] = ''

                    # only report a recombination type when V and J agree
                    if r_j == r_v:
                        content['Recombination Type'] = r_v
                        content['Chain'] = chain_v
                    else:
                        content['Recombination Type'] = ''
                        content['Chain'] = ''

                    if content['All V alignments']:
                        (query_start, query_end, germ_start, germ_end, algn_len,
                         num_mismatch, num_ins, num_del, shm,
                         alignment_string) = ParseAlignment(content['All V alignments'])
                        content['VGENE: Query start'] = query_start
                        content['VGENE: Query end'] = query_end
                        content['VGENE: Germline start'] = germ_start
                        content['VGENE: Germline end'] = germ_end
                        content['VGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                        content['VGENE: Mismatch'] = num_mismatch
                        content['VGENE: Insertion'] = num_ins
                        content['VGENE: Deletion'] = num_del
                        content['VGENE: Alignment'] = alignment_string
                        content['VGENE: Shm.per'] = round(100 * shm, 3)
                        content['VGENE: Alignment length'] = algn_len
                        content['AB end'] = query_end
                        content['AB start'] = query_start

                    if content['All J alignments']:
                        (query_start, query_end, germ_start, germ_end, algn_len,
                         num_mismatch, num_ins, num_del, shm,
                         alignment_string) = ParseAlignment(content['All J alignments'])
                        content['JGENE: Query start'] = query_start
                        content['JGENE: Query end'] = query_end
                        content['JGENE: Germline start'] = germ_start
                        content['JGENE: Germline end'] = germ_end
                        content['JGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                        content['JGENE: Mismatch'] = num_mismatch
                        content['JGENE: Insertion'] = num_ins
                        content['JGENE: Deletion'] = num_del
                        content['JGENE: Alignment'] = alignment_string
                        content['JGENE: Shm.per'] = round(100 * shm, 3)
                        content['JGENE: Alignment length'] = algn_len
                        # the J alignment end overrides the V alignment end
                        content['AB end'] = query_end
                        if 'AB start' not in content:
                            content['AB start'] = query_start

                    content['Orientation'] = guess_strand(content['Full NT'],
                                                          content['Sequence'])
                    content['Command'] = command_string
                    # missing columns are written as empty strings
                    content = defaultdict(str, content)
                    output_line = [str(content[lab]) for lab in presetlabels]
                    f.write('\t'.join(output_line) + '\n')
                except Exception as e:
                    # best-effort: log the failure and keep processing
                    errors += 1
                    print('There was an error in sequence: ' + str(seq_num))
                    print('Error: ' + str(e))
                    error_file.write(
                        '****ERROR FOUND IN SEQUENCE:{0} ****\n'.format(str(seq_num)))
                    error_file.write(useful.print_error_string(e) + '\n')
                    error_file.write('MIXCR DATA: \n')
                    error_file.write(json.dumps(content, indent=4) + '\n')
                    error_file.write('*************END OF ERROR*********\n')
    finally:
        iffile.IFclass.close()
        seqfile.IFclass.close()

    # remove log files that ended up empty
    if errors == 0:
        os.remove(error_log_path)
    if notfound == 0:
        os.remove(notfound_log_path)

    t2 = time.time()
    print(str(t2 - t1))
    return outfile
def Descriptive_Statistics(list_of_files,input_file_type,analysis_name='',exp_names = [],output_file_prefix='',fields={},statistics_to_run=['ab_aa','cdr3','vgene','jgene','vjgene']): analysis_name = analysis_name.upper() if input_file_type=='IMGT' and not isinstance(list_of_files[0],list): list_of_files = [list_of_files] elif not isinstance(list_of_files,list): list_of_files = [list_of_files] if len(exp_names)!=len(list_of_files): exp_names = [] #by default, save results to the same folder as the input file if not output_file_prefix: output_file_prefix = useful.removeFileExtension(list_of_files[0]) analysis_name = analysis_name.upper() supported_analyses = fields_for_analysis.keys() if (not analysis_name or analysis_name=='CUSTOM' or analysis_name not in supported_analyses) and not fields: raise Exception('The required fields for the provided analysis, {0}, is not currently automated. Please explicity provide the fields names'.format(str(analysis_name))) #first we use default fields defined ehere if analysis_name in supported_analyses: fields_to_use = copy.deepcopy(fields_for_analysis[analysis_name]) else: fields_to_use = {} #next we add in user defined fields just in case there are any changes/mistakes for f,name in fields.iteritems(): fields_to_use[f] = name filenames_to_use = [f[0] if isinstance(f,list) else f for f in list_of_files] print('Performing descriptive statistics at {0}.'.format(str(datetime.datetime.now()))) print('Analyzing the following files:\n\t {0}'.format('\n\t'.join(filenames_to_use))) unique_aa_file = None unique_cdr3_file = None v_gene_analysis = None j_gene_analysis = None vj_gene_analysis = None gene_analysis_plot = output_file_prefix plots_created = [] gene_summary_file = output_file_prefix+'.summary_of_stats.txt' output_file_names = {} aa_files = ['AB AA SEQUENCE','RECOMBINATION_TYPE','LOCUS','CDR1','CDR2','CDR3','STOP CODONS','PRODUCTIVE','VGENES','DGENES','JGENES','TOTAL COUNTS'] fields_order = 
['full_len_ab','recomb','locus','cdr1','cdr2','cdr3','stopc','functionality','vgene','dgene','jgene'] num_exp= len(list_of_files) if not exp_names: if input_file_type=='IMGT': pass else: exp_names = [] for file in list_of_files: count = 1 str_file = os.path.basename(file) while True: if str_file in exp_names: str_file = os.path.basename(file)+'_'+str(count) count+=1 else: exp_names.append(str_file) break if 'ab_aa' in statistics_to_run: intermediate_file = output_file_prefix+'.unique_aa_file_temp' #first we will use a temp file/intermeidate file output_file_names['ab_aa'] = open(intermediate_file,'w') #output_file_names['ab_aa'].write('\t'.join(aa_files)+'\n') cdr3analysis = True if 'cdr3' in statistics_to_run else False aaanalysis = True if 'ab_aa' in statistics_to_run else False vjgene_dict=defaultdict(lambda:defaultdictgenes(num_exp)) #cdr3_dict=defaultdict(lambda:defaultdictcdr3(num_exp)) cdr3_dict_vdj = defaultdict(lambda:defaultdictcdr3(num_exp)) cdr3_dict_vj = defaultdict(lambda:defaultdictcdr3(num_exp)) cdr3_dict_unk = defaultdict(lambda:defaultdictcdr3(num_exp)) use_these_fields = fields_to_use.values() fields_to_use['stopc'] = 'stopc' num_results = [0]*(num_exp) num_cdr3 = [0]*(num_exp) num_stop_codon = [0]*(num_exp) num_vdj = [0]*(num_exp) num_vj = [0]*(num_exp) num_sequences = [0]*(num_exp) if not fields_to_use['recomb']: #maybe the user never defined a feild for recombinoation type..that coudl be a problem because we will have to guess it using the variable at the top of the script: recomb_call recomb_not_defined = True fields_to_use['recomb'] = 'recomb' else: recomb_not_defined = False print('Reading through sequences in file(s)') seqnum=1 #go through all of the files and report the relevant fields #if we are creating a unique amino acid file, then report thiese fields to temp file for fnum,each_file in enumerate(list_of_files): annotated_file = readfile.immunogrepFile(each_file,input_file_type,field_names = use_these_fields) #loop through each file 
for seq_lines in annotated_file.read(): if not seq_lines: continue if seqnum%500000==0: print('Read {0} sequences'.format(str(seqnum))) seqnum+=1 num_sequences[fnum]+=1 seq_lines = defaultdict(str,seq_lines) if seq_lines[fields_to_use['full_len_ab']]: #full length antibody sequence not found num_results[fnum]+=1 #only select the first gene in the list. alos remove allelic name ('*') seq_lines[fields_to_use['vgene']] = seq_lines[fields_to_use['vgene']].split(',')[0].split('*')[0] seq_lines[fields_to_use['dgene']] = seq_lines[fields_to_use['dgene']].split(',')[0].split('*')[0] #IF NO RECOMBINATION TYPE IS FOUND or provided, THEN guess it using the vgene or jgene call if recomb_not_defined or not seq_lines[fields_to_use['recomb']]: r = '' #not sure what the recombation type is yet #try to guess the recombination type if seq_lines[fields_to_use['vgene']]: #use vgene if present # look at the first three characters in vgene to predict recombioation type gn = ProcessGene(seq_lines[fields_to_use['vgene']]) if gn[:3] in recomb_call: r = recomb_call[gn[:3]] elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST REPORTS TA RATHER THAN TRA for example r = recomb_call[gn[:2]] if not r and seq_lines[fields_to_use['jgene']]: #still not r found, so use jgene gn = ProcessGene(seq_lines[fields_to_use['jgene']]) if gn[:3] in recomb_call: r = recomb_call[gn[:3]] elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST REPORTS TA RATHER THAN TRA for example r = recomb_call[gn[:2]] #update recomb result seq_lines[fields_to_use['recomb']] = r if not seq_lines[fields_to_use['recomb']]: continue if seq_lines[fields_to_use['recomb']] == 'VDJ': num_vdj[fnum]+=1 elif seq_lines[fields_to_use['recomb']] == 'VJ': num_vj[fnum]+=1 seq_lines[fields_to_use['jgene']] = seq_lines[fields_to_use['jgene']].split(',')[0].split('*')[0] seq_lines['stopc'] = 'YES' if '*' in seq_lines[fields_to_use['full_len_ab']] else 'NO' if seq_lines['stopc'] == 'YES': num_stop_codon[fnum]+=1 
if aaanalysis: exp_str = str(fnum+1) #make an intermediate file where we only put the fields we want in the proper order from any file #we will use this field for sorting afterwards #also output exp_num to account for which sequence came from which experiment output_file_names['ab_aa'].write('\t'.join([seq_lines[fields_to_use[f]] for f in fields_order])+'\t'+str(exp_str)+'\n') if seq_lines[fields_to_use['vgene']] or seq_lines[fields_to_use['jgene']]: key_v =delim.join([seq_lines[fields_to_use['vgene']],seq_lines[fields_to_use['jgene']],seq_lines[fields_to_use['recomb']]]) vjgene_dict[key_v][fnum]+=1 if not seq_lines[fields_to_use['cdr3']]: #no cdr3 found continue #add unique cdr3_recomb and vjgene info to dictionaires num_cdr3[fnum]+=1 if cdr3analysis: key = seq_lines[fields_to_use['cdr3']] #key_cdr3 = delim.join([],seq_lines[fields_to_use['recomb']]]) if seq_lines[fields_to_use['recomb']]=='VDJ': cdr3_dict_vdj[key][fnum]+=1 elif seq_lines[fields_to_use['recomb']]=='VJ': cdr3_dict_vj[key][fnum]+=1 else: print('unknown recombination types: ',seq_lines[fields_to_use['recomb']]) cdr3_dict_unk[key][fnum]+=1 if seqnum>10000: break if aaanalysis: output_file_names['ab_aa'].close() print('Generating a file of unique AB amino acid sequences') unique_aa_file = output_file_prefix+'.unique_aa_file.txt' #Use some bash to make a unique amino acid file using sorting and then some awk GenerateAAFile(intermediate_file,unique_aa_file,aa_files,exp_names) #number of amino acid sequences observed if not os.path.isfile(unique_aa_file): num_unique_aa = 0 else: num_unique_aa = useful.file_line_count(unique_aa_file)-1 #-1 => remove header row count #Now have some fun with pandas if set(['vgene','jgene','vjgene']) & set(statistics_to_run): #vjgene_dict format = { #'key' = 'vgene',_,'jgene',_,'recombtype' #value = [count,count] => a list of counts for presence of that key in EACH providced file/experiment. 
Length of list = number of experiments #} gene_df = pd.DataFrame(vjgene_dict).transpose() if 'VGENE' not in gene_df.columns: gene_df['VGENE'] = '' if 'JGENE' not in gene_df.columns: gene_df['JGENE'] = '' if 'recomb' not in gene_df.columns: gene_df['recomb'] = '' gene_df['TOTAL_COUNTS'] = gene_df.sum(axis=1) gene_df = gene_df.reset_index() gene_df = gene_df.apply(ModifyPDTable,axis=1,args=(['VGENE','JGENE','recomb'],delim)) new_names = {} for f,v in enumerate(exp_names): new_names[f]=v #key = experiment index number #value = new name #rename the columns 0,1,...num experiments to match the experiment names gene_df = gene_df.rename(columns=new_names) #format of gene_df: #index => no index set, just use default numbers #columns => start with column for each experiment, then add the following columns: VGENE, JGENE, recomb, TOTAL_COUNTS if 'vgene' in statistics_to_run: print('Performing V gene analysis') v_gene_analysis = output_file_prefix+'.vgenes.txt' #group elements by VH GENE CALLS and VL gene calls sorted_v_counts = gene_df.groupby(['recomb','VGENE']).sum()#.count()#.sort('VGENE',ascending=1) #find out which level in multilevel index corresponds to 'VGENE' => looking at above code , it should be level 1 (recomb should be level 0) vgene_level = sorted_v_counts.index.names.index('VGENE') #remove results where vGENE is empty if '' in list(sorted_v_counts.index.levels[vgene_level]): sorted_v_counts = sorted_v_counts.drop('',level='VGENE') ignore_counts = ['TOTAL_COUNTS','JGENE'] keep_col = [n for n in sorted_v_counts.columns if n not in ignore_counts] g = sorted_v_counts[keep_col] #NOW PLOT the FREQUENCY for every exeprement if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]): vdj_g = g.xs('VDJ',level='recomb') PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.vgenes','VH Gene Distribution','Frequency','V Gene',max_val=None,min_val=0) plots_created.append(gene_analysis_plot+'.vdj.vgenes.png') #.png extension is added in the function plotgenedist if 
'VJ' in list(g.index.levels[g.index.names.index('recomb')]): vj_g = g.xs('VJ',level='recomb') PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.vgenes','VL Gene Distribution','Frequency','V Gene',max_val=None,min_val=0) plots_created.append(gene_analysis_plot+'.vj.vgenes.png') #.png extension is added in the function plotgenedist sorted_v_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(v_gene_analysis,sep='\t',index=False) #do the same as above, except for J genes this time if 'jgene' in statistics_to_run: print('Performing J gene analysis') j_gene_analysis = output_file_prefix+'.jgenes.txt' sorted_j_counts = gene_df.groupby(['recomb','JGENE']).sum()#.sort('VGENE',ascending=1) jgene_level = sorted_j_counts.index.names.index('JGENE') if '' in list(sorted_j_counts.index.levels[jgene_level]): sorted_j_counts.drop('',level='JGENE',inplace=True) ignore_counts = ['TOTAL_COUNTS','VGENE'] keep_col = [n for n in sorted_j_counts.columns if n not in ignore_counts] g = sorted_j_counts[keep_col] sorted_j_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(j_gene_analysis,sep='\t',index=False) #NOW CALCULATE FREQUENCY for every exeprement if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]): vdj_g = g.xs('VDJ',level='recomb') PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.jgenes','JH Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5) plots_created.append(gene_analysis_plot+'.vdj.jgenes.png') #.png extension is added in the function plotgenedist if 'VJ' in list(g.index.levels[g.index.names.index('recomb')]): vj_g = g.xs('VJ',level='recomb') PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.jgenes','JL Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5) plots_created.append(gene_analysis_plot+'.vj.jgenes.png') #.png extension is added in the function plotgenedist #now perform a V-J gene analysis (heat map) for each experiment if 'vjgene' in 
statistics_to_run: print('Performing V-J gene analysis') vj_gene_analysis = output_file_prefix+'.v_and_jgene_analysis.txt' #group datafraom by recombination, vgene, and jgene #first rename all V and J gnees that are empyt as No call #Then Group H / L results by by v and j gnees and take the sum of each column in the group vj_df = gene_df.replace([''],[' No call']).groupby(['recomb','VGENE','JGENE']).sum() vj_df.to_csv(vj_gene_analysis,sep='\t') #remove TOTAL_COUNTS vj_df.drop('TOTAL_COUNTS', axis=1, inplace=True) #calculate frequency for each recomb type if 'VDJ' in list(vj_df.index.levels[g.index.names.index('recomb')]): v1 = vj_df.loc['VDJ',:]/vj_df.loc['VDJ',:].sum() PlotVJGeneHeatMap(v1,gene_analysis_plot+'.vdj.v_and_jgene_analysis',max_val=None,min_val=None) plots_created.append(gene_analysis_plot+'.vdj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist if 'VJ' in list(vj_df.index.levels[g.index.names.index('recomb')]): v2 = vj_df.loc['VJ',:]/vj_df.loc['VJ',:].sum() PlotVJGeneHeatMap(v2,gene_analysis_plot+'.vj.v_and_jgene_analysis',max_val=None,min_val=None) plots_created.append(gene_analysis_plot+'.vj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist del vj_df del gene_df #lets do some cdr3 analysis cdr3_length_stats = {} diversity_measurements = {} if cdr3analysis: unique_cdr3_file = output_file_prefix+'.unique_cdr3_counts.txt' print('Performing CDR3 analyisis') if sum(num_cdr3)>0: #again create a pandas dataframe but this time using the unique cdr3 calls print('Loading CDR3s into a dataframe') cdr3_df_list = [pd.DataFrame.from_dict(c,orient='index') for c in [cdr3_dict_vdj,cdr3_dict_vj,cdr3_dict_unk]] #merge all dftogether keys=['VDJ','VJ','UNK'] cdr3_df = pd.concat(cdr3_df_list,keys=keys) #cdr3_df = pd.DataFrame(cdr3_dict).transpose() cdr3_df['TOTAL_COUNTS'] = cdr3_df.sum(axis=1) print('Dataframe created') cdr3_df.index.names = ['recomb','CDR3'] cdr3_df = cdr3_df.reset_index() #cdr3_df['CDR3'] = 
'' #cdr3_df['recomb'] = '' #cdr3_df = cdr3_df.apply(ModifyPDTable,axis=1,raw=True,reduce=True,args=(['CDR3','recomb'],delim)) new_names = {} #performm cdr3_df['CDR3_LENGTH'] = cdr3_df.CDR3.map(len) for f,v in enumerate(exp_names): new_names[f]=v #rename the columns to match the experiment names cdr3_df = cdr3_df.rename(columns=new_names) cdr3_df.sort(['recomb','TOTAL_COUNTS'],ascending=[1,0],inplace=True) cdr3_df.set_index(['recomb','CDR3'],inplace=True) #save dataframe as tab dleim file cdr3_df.to_csv(unique_cdr3_file,sep='\t') cdr3_length_stats = PlotCDR3Histogram(cdr3_df,gene_analysis_plot+'.cdr3_length_histogram') plots_created.append(gene_analysis_plot+'.cdr3_length_histogram.png') diversity_measurements = CalculateDiversities(cdr3_df,gene_analysis_plot+'.cdr3_diversity_plots') plots_created.append(gene_analysis_plot+'.cdr3_diversity_plots.png') del cdr3_df print('Writing summary to file') #finally make a results text file that summarizes all the information GenerateResultsSummaryFile(gene_summary_file,statistics_to_run,list_of_files,exp_names,unique_aa_file,unique_cdr3_file,v_gene_analysis,j_gene_analysis,vj_gene_analysis,plots_created,num_sequences,num_results,num_vdj,num_vj,num_cdr3,num_stop_codon,cdr3_length_stats,diversity_measurements) files_generated = [gene_summary_file] if unique_aa_file: files_generated.append(unique_aa_file) if unique_cdr3_file: files_generated.append(unique_cdr3_file) if v_gene_analysis: files_generated.append(v_gene_analysis) if j_gene_analysis: files_generated.append(j_gene_analysis) if vj_gene_analysis: files_generated.append(vj_gene_analysis) print('Descriptive statistics completed at {0}.'.format(str(datetime.datetime.now()))) gc.collect() return {'files':files_generated,'figures':plots_created}
def run_flash(r1file, r2file, working_directory, outfile='', parameters=None, suffix=''):
    """Merge a pair of read files with the FLASH program and count the merged reads.

    Gzipped inputs are first unzipped with ``useful.gunzip_python``. Inputs
    whose parent directory differs from ``working_directory`` are moved
    (``os.rename``) into it. FLASH is then invoked through the shell, and the
    ``<outfile>.extendedFrags.fastq`` file found under the output prefix is
    renamed to ``<outfile>``.

    Args:
        r1file: Path to the R1 read file (optionally ending in ``.gz``).
        r2file: Path to the R2 read file (optionally ending in ``.gz``).
        working_directory: Directory in which FLASH runs and writes output.
        outfile: Optional output name. Only its basename is used. When empty,
            the name is derived from ``r1file`` by cutting the first
            dot-separated part containing ``_R1``/``_R2`` at that marker.
            ``.fastq``/``.fasta`` are stripped and ``'.flashed' + suffix`` is
            appended.
        parameters: Mapping of FLASH option letter -> value; each entry is
            passed as ``-<key> <value>``. When empty or ``None`` the defaults
            ``{'r': 300, 'f': 400}`` are used. The mapping is copied, never
            modified. The ``o`` and ``d`` options are always overridden.
        suffix: Text appended after ``.flashed`` in the output file name.

    Returns:
        Tuple ``(merged_file_path, merged_read_count, r1_read_count,
        percent_merged)``. Read counts are line counts floor-divided by 4
        (four lines per FASTQ record). If a line count cannot be obtained it
        falls back to 1.

    Raises:
        Exception: If FLASH exits with a non-zero return code.
    """
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)

    if not parameters:
        print("PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400")
        parameters = {'r': 300, 'f': 400}
    else:
        # Work on a copy so the caller's dict is not polluted with 'o'/'d'.
        parameters = dict(parameters)

    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    # Move the inputs next to the output when they live somewhere else.
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        # Derive the output name from the R1 file name, dropping everything
        # from the read marker onwards in the first part that contains one.
        name_parts = os.path.basename(r1file).split('.')
        for pos, part in enumerate(name_parts):
            if '_R1' in part:
                name_parts[pos] = part[:part.index('_R1')]
                break
            elif '_R2' in part:
                name_parts[pos] = part[:part.index('_R2')]
                break
        outfile = '.'.join(name_parts)
    else:
        outfile = os.path.basename(outfile)
    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile += '.flashed' + suffix

    merged_path = os.path.join(working_directory, outfile)
    if os.path.isfile(merged_path):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))

    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))

    flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)
    parameters['o'] = outfile
    parameters['d'] = working_directory
    for option, value in parameters.items():
        flash_command += ' -{0} {1}'.format(option, str(value))
    flash_command += ' -q'  # run on quiet command

    # NOTE: the command is a plain shell string; paths or values containing
    # spaces or shell metacharacters are not escaped.
    worked = subprocess.call(flash_command, shell=True)
    # A negative return code means the process was killed by a signal, so
    # any non-zero value is a failure.
    if worked != 0:
        raise Exception('Flash failed')

    os.rename(os.path.join(working_directory, outfile + '.extendedFrags.fastq'), merged_path)

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(merged_path)
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))

    # Guard against an empty R1 file, which would otherwise divide by zero.
    if read_count_r1_file:
        percent_merged = float(100) * (read_count_flashed_file / float(read_count_r1_file))
    else:
        percent_merged = 0.0

    resulting_counts = (
        merged_path,
        read_count_flashed_file // 4,
        read_count_r1_file // 4,
        percent_merged,
    )
    return resulting_counts
def run_pear(r1file, r2file, working_directory, outfile='', parameters=None, suffix='', num_threads=1, memory='1G'):
    """Merge a pair of read files with the PEAR program and count the merged reads.

    Gzipped inputs are first unzipped with ``useful.gunzip_python``. Inputs
    whose parent directory differs from ``working_directory`` are moved
    (``os.rename``) into it. PEAR is then invoked through the shell with
    ``-f <r1> -r <r2>`` plus the supplied options, and the merged reads are
    read back from ``<output prefix>.assembled.fastq``.

    Args:
        r1file: Path to the R1 read file (optionally ending in ``.gz``).
        r2file: Path to the R2 read file (optionally ending in ``.gz``).
        working_directory: Directory that receives the inputs and the output.
        outfile: Optional output name. Only its basename is used. When empty,
            the name is derived from ``r1file`` by cutting the first
            dot-separated part containing ``_R1``/``_R2`` at that marker.
            ``.fastq``/``.fasta`` are stripped and the result is joined onto
            ``working_directory`` to form the output prefix.
        parameters: Mapping of PEAR option letter -> value; each entry is
            passed as ``-<key> <value>``. May be ``None`` or empty. The
            mapping is copied, never modified. The ``o``, ``y`` and ``j``
            options are always overridden.
        suffix: Currently unused by this function.
        num_threads: Value passed as the ``-j`` option.
        memory: Value passed as the ``-y`` option.

    Returns:
        Tuple ``(assembled_file_path, merged_read_count, r1_read_count,
        percent_merged)``. Read counts are line counts floor-divided by 4
        (four lines per FASTQ record). If a line count cannot be obtained it
        falls back to 1.

    Raises:
        Exception: If PEAR exits with a non-zero return code.
    """
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)

    # Copy so neither a shared default nor the caller's dict is modified.
    parameters = dict(parameters) if parameters else {}

    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    # Move the inputs next to the output when they live somewhere else.
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        # Derive the output name from the R1 file name, dropping everything
        # from the read marker onwards in the first part that contains one.
        name_parts = os.path.basename(r1file).split('.')
        for pos, part in enumerate(name_parts):
            if '_R1' in part:
                name_parts[pos] = part[:part.index('_R1')]
                break
            elif '_R2' in part:
                name_parts[pos] = part[:part.index('_R2')]
                break
        outfile = '.'.join(name_parts)
    else:
        outfile = os.path.basename(outfile)
    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile = os.path.join(working_directory, outfile)

    # This is the file read back below, so it is the one that matters for
    # the overwrite warning.
    assembled_file = outfile + '.assembled.fastq'
    if os.path.isfile(assembled_file):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(assembled_file))

    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))

    pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)
    parameters['o'] = outfile
    parameters['y'] = memory
    parameters['j'] = num_threads
    for option, value in parameters.items():
        pear_command += ' -{0} {1}'.format(option, str(value))

    # NOTE: the command is a plain shell string; paths or values containing
    # spaces or shell metacharacters are not escaped.
    worked = subprocess.call(pear_command, shell=True)
    # A negative return code means the process was killed by a signal, so
    # any non-zero value is a failure.
    if worked != 0:
        raise Exception('Error in pear program')

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(assembled_file)
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))

    # Guard against an empty R1 file, which would otherwise divide by zero.
    if read_count_r1_file:
        percent_merged = float(100) * (read_count_flashed_file / float(read_count_r1_file))
    else:
        percent_merged = 0.0

    resulting_counts = (
        assembled_file,
        read_count_flashed_file // 4,
        read_count_r1_file // 4,
        percent_merged,
    )
    return resulting_counts
def run_pear(r1file, r2file, working_directory, outfile='', parameters=None, suffix='', num_threads=1, memory='1G'):
    """Merge a pair of read files with the PEAR program and count the merged reads.

    Gzipped inputs are first unzipped with ``useful.gunzip_python``. Inputs
    whose parent directory differs from ``working_directory`` are moved
    (``os.rename``) into it. PEAR is then invoked through the shell with
    ``-f <r1> -r <r2>`` plus the supplied options, and the merged reads are
    read back from ``<output prefix>.assembled.fastq``.

    Args:
        r1file: Path to the R1 read file (optionally ending in ``.gz``).
        r2file: Path to the R2 read file (optionally ending in ``.gz``).
        working_directory: Directory that receives the inputs and the output.
        outfile: Optional output name. Only its basename is used. When empty,
            the name is derived from ``r1file`` by cutting the first
            dot-separated part containing ``_R1``/``_R2`` at that marker.
            ``.fastq``/``.fasta`` are stripped and the result is joined onto
            ``working_directory`` to form the output prefix.
        parameters: Mapping of PEAR option letter -> value; each entry is
            passed as ``-<key> <value>``. May be ``None`` or empty. The
            mapping is copied, never modified. The ``o``, ``y`` and ``j``
            options are always overridden.
        suffix: Currently unused by this function.
        num_threads: Value passed as the ``-j`` option.
        memory: Value passed as the ``-y`` option.

    Returns:
        Tuple ``(assembled_file_path, merged_read_count, r1_read_count,
        percent_merged)``. Read counts are line counts floor-divided by 4
        (four lines per FASTQ record). If a line count cannot be obtained it
        falls back to 1.

    Raises:
        Exception: If PEAR exits with a non-zero return code.
    """
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)

    # Copy so neither a shared default nor the caller's dict is modified.
    parameters = dict(parameters) if parameters else {}

    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    # Move the inputs next to the output when they live somewhere else.
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        # Derive the output name from the R1 file name, dropping everything
        # from the read marker onwards in the first part that contains one.
        name_parts = os.path.basename(r1file).split('.')
        for pos, part in enumerate(name_parts):
            if '_R1' in part:
                name_parts[pos] = part[:part.index('_R1')]
                break
            elif '_R2' in part:
                name_parts[pos] = part[:part.index('_R2')]
                break
        outfile = '.'.join(name_parts)
    else:
        outfile = os.path.basename(outfile)
    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile = os.path.join(working_directory, outfile)

    # This is the file read back below, so it is the one that matters for
    # the overwrite warning.
    assembled_file = outfile + '.assembled.fastq'
    if os.path.isfile(assembled_file):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(assembled_file))

    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))

    pear_command = "{2} -f {0} -r {1}".format(r1file, r2file, pear_location)
    parameters['o'] = outfile
    parameters['y'] = memory
    parameters['j'] = num_threads
    for option, value in parameters.items():
        pear_command += ' -{0} {1}'.format(option, str(value))

    # NOTE: the command is a plain shell string; paths or values containing
    # spaces or shell metacharacters are not escaped.
    worked = subprocess.call(pear_command, shell=True)
    # A negative return code means the process was killed by a signal, so
    # any non-zero value is a failure.
    if worked != 0:
        raise Exception('Error in pear program')

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(assembled_file)
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))

    # Guard against an empty R1 file, which would otherwise divide by zero.
    if read_count_r1_file:
        percent_merged = float(100) * (read_count_flashed_file / float(read_count_r1_file))
    else:
        percent_merged = 0.0

    resulting_counts = (
        assembled_file,
        read_count_flashed_file // 4,
        read_count_r1_file // 4,
        percent_merged,
    )
    return resulting_counts
def run_flash(r1file, r2file, working_directory, outfile='', parameters=None, suffix=''):
    """Merge a pair of read files with the FLASH program and count the merged reads.

    Gzipped inputs are first unzipped with ``useful.gunzip_python``. Inputs
    whose parent directory differs from ``working_directory`` are moved
    (``os.rename``) into it. FLASH is then invoked through the shell, and the
    ``<outfile>.extendedFrags.fastq`` file found under the output prefix is
    renamed to ``<outfile>``.

    Args:
        r1file: Path to the R1 read file (optionally ending in ``.gz``).
        r2file: Path to the R2 read file (optionally ending in ``.gz``).
        working_directory: Directory in which FLASH runs and writes output.
        outfile: Optional output name. Only its basename is used. When empty,
            the name is derived from ``r1file`` by cutting the first
            dot-separated part containing ``_R1``/``_R2`` at that marker.
            ``.fastq``/``.fasta`` are stripped and ``'.flashed' + suffix`` is
            appended.
        parameters: Mapping of FLASH option letter -> value; each entry is
            passed as ``-<key> <value>``. When empty or ``None`` the defaults
            ``{'r': 300, 'f': 400}`` are used. The mapping is copied, never
            modified. The ``o`` and ``d`` options are always overridden.
        suffix: Text appended after ``.flashed`` in the output file name.

    Returns:
        Tuple ``(merged_file_path, merged_read_count, r1_read_count,
        percent_merged)``. Read counts are line counts floor-divided by 4
        (four lines per FASTQ record). If a line count cannot be obtained it
        falls back to 1.

    Raises:
        Exception: If FLASH exits with a non-zero return code.
    """
    r1_path = useful.get_parent_dir(r1file)
    r2_path = useful.get_parent_dir(r2file)

    if not parameters:
        print("PARAMETERS NOT PASSED INTO FLASH PROGRAM. USING DEFAULT IGSEQ PARAMETERS: R = 300, F = 400")
        parameters = {'r': 300, 'f': 400}
    else:
        # Work on a copy so the caller's dict is not polluted with 'o'/'d'.
        parameters = dict(parameters)

    if r1file.endswith('.gz'):
        print("Unzipping R1 File..")
        r1file = useful.gunzip_python(r1file)
    if r2file.endswith('.gz'):
        print("Unzipping R2 File..")
        r2file = useful.gunzip_python(r2file)

    working_directory = os.path.abspath(working_directory)
    # Move the inputs next to the output when they live somewhere else.
    if r1_path != working_directory:
        os.rename(r1file, os.path.join(working_directory, os.path.basename(r1file)))
    if r2_path != working_directory:
        os.rename(r2file, os.path.join(working_directory, os.path.basename(r2file)))

    if outfile == '':
        # Derive the output name from the R1 file name, dropping everything
        # from the read marker onwards in the first part that contains one.
        name_parts = os.path.basename(r1file).split('.')
        for pos, part in enumerate(name_parts):
            if '_R1' in part:
                name_parts[pos] = part[:part.index('_R1')]
                break
            elif '_R2' in part:
                name_parts[pos] = part[:part.index('_R2')]
                break
        outfile = '.'.join(name_parts)
    else:
        outfile = os.path.basename(outfile)
    outfile = outfile.replace('.fastq', '').replace('.fasta', '')
    outfile += '.flashed' + suffix

    merged_path = os.path.join(working_directory, outfile)
    if os.path.isfile(merged_path):
        print('WARNING: FILE {0} ALREADY PRESENT IN FOLDER. FILE WILL BE OVERWRITTEN'.format(working_directory + '/' + outfile))

    r1file = os.path.join(working_directory, os.path.basename(r1file))
    r2file = os.path.join(working_directory, os.path.basename(r2file))

    flash_command = "{2} {0} {1}".format(r1file, r2file, flash_location)
    parameters['o'] = outfile
    parameters['d'] = working_directory
    for option, value in parameters.items():
        flash_command += ' -{0} {1}'.format(option, str(value))
    flash_command += ' -q'  # run on quiet command

    # NOTE: the command is a plain shell string; paths or values containing
    # spaces or shell metacharacters are not escaped.
    worked = subprocess.call(flash_command, shell=True)
    # A negative return code means the process was killed by a signal, so
    # any non-zero value is a failure.
    if worked != 0:
        raise Exception('Flash failed')

    os.rename(os.path.join(working_directory, outfile + '.extendedFrags.fastq'), merged_path)

    try:
        read_count_r1_file = useful.file_line_count(r1file)
    except Exception as e:
        read_count_r1_file = 1
        print("Could not get number of lines in read file: " + str(e))
    try:
        read_count_flashed_file = useful.file_line_count(merged_path)
    except Exception as e:
        read_count_flashed_file = 1
        print("Could not get number of lines in outfile read file: " + str(e))

    # Guard against an empty R1 file, which would otherwise divide by zero.
    if read_count_r1_file:
        percent_merged = float(100) * (read_count_flashed_file / float(read_count_r1_file))
    else:
        percent_merged = 0.0

    resulting_counts = (
        merged_path,
        read_count_flashed_file // 4,
        read_count_r1_file // 4,
        percent_merged,
    )
    return resulting_counts