def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Process paired-end FASTQ files and annotate the stitched reads with MiXCR.

    Each entry of ``input_files`` is indexed as ``[R1, R2]``.  For every pair
    the function gunzips members ending in ``.gz`` (the unzipped path is
    written back into the pair), trims with trimmomatic when the module-level
    ``trim_seqs`` flag is set, stitches R1/R2 with PEAR, quality filters the
    stitched file and removes the intermediate PEAR output.  Every filtered
    file is then aligned and parsed with MiXCR.

    All tuning values (trimming window, PEAR limits, thread count, ...) are
    read from module-level names.  ``species``, ``loci`` and ``group_name``
    are accepted but not read by this body: MiXCR is invoked with ``loci=[]``
    and ``species=''``.  Nothing is returned.
    """
    print('Processing raw fastq files')
    filtered_fastqs = []
    for read_pair in input_files:
        work_dir = os.path.dirname(read_pair[0])

        # Unzip files; the caller's pair is updated in place.
        for idx, path in enumerate(read_pair):
            if not path.endswith('.gz'):
                continue
            print('Unzipping: ', path)
            read_pair[idx] = useful.gunzip_python(path)

        # Run trimmomatic (optional)
        if trim_seqs:
            print('Trimming low quality bases')
            trim_settings = {
                'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
                'MINLEN': min_read_len_post_trim
            }
            trim_mode = 'PE'
            reads_to_stitch = processing.run_trimmomatic(
                read_pair, work_dir, trim_mode, phred_encode, trim_settings)
        else:
            reads_to_stitch = read_pair

        # Stitch R1-R2 files
        pear_settings = {
            'v': min_overlap_length,
            'm': max_assembly_length,
            'n': min_assembly_length,
            'u': max_fraction_uncalled,
        }
        print('Stitching R1-R2 reads')
        pear_outputs = processing.run_pear(
            reads_to_stitch[0],
            reads_to_stitch[1],
            working_directory=work_dir,
            parameters=pear_settings,
            num_threads=number_threads,
            memory=pear_memory)
        stitched_fastq = pear_outputs[0]

        # Run quality filtering, then discard the stitched intermediate
        filtered_fastq = fastx.Run_Quality_Filter(
            stitched_fastq,
            output_dir=work_dir,
            quality=quality_cutoff,
            percent=percent_bases)
        os.remove(stitched_fastq)
        filtered_fastqs.append(filtered_fastq)

    print('Annotating processed fastq files')
    annotated_files = []
    for fastq in filtered_fastqs:
        alignment_path = useful.removeFileExtension(fastq) + '.mixcr.alignment'
        annotation_path = useful.removeFileExtension(fastq) + '.mixcr.annotation'

        # Run MIXCR file (only the command value of the result pair is needed)
        print('Running MIXCR')
        _, command_val = mixcr.RunMixcr(
            fastq,
            alignment_path,
            filetype='FASTQ',
            loci=[],
            species='',
            exportPrettyAlignment=False,
            num_threads=number_threads)

        # Parse MIXCR file; the first element of the parse result is kept
        print('Parsing MIXCR')
        parsed = mixcr.parseMIXCR(
            fastq, alignment_path, 'FASTQ', annotation_path,
            command_val=command_val)
        annotated_files.append(parsed[0])
    print('Pipeline complete')
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Process single-end FASTQ files, annotate them with MiXCR and pair them.

    Every path in ``input_files`` is gunzipped when it ends in ``.gz``,
    trimmed with trimmomatic in ``'SE'`` mode, quality filtered (the trimmed
    intermediate is removed afterwards), then aligned and parsed with MiXCR.
    The parsed annotation results are finally handed to
    ``pairing.RunPairing``, which writes into the directory of the first
    annotation file using ``group_name`` as the output prefix.

    Tuning values are read from module-level names.  ``species`` and ``loci``
    are accepted but not read by this body: MiXCR is invoked with ``loci=[]``
    and ``species=''``.  Nothing is returned.
    """
    print('Processing raw fastq files')
    filtered_fastqs = []
    for raw_fastq in input_files:
        # The working folder is taken from the path as given (before unzip).
        work_dir = os.path.dirname(raw_fastq)
        fastq = raw_fastq
        if fastq.endswith('.gz'):
            print('Unzipping: ', fastq)
            fastq = useful.gunzip_python(fastq)

        # Run trimmomatic
        trim_settings = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        trim_mode = 'SE'
        trim_outputs = processing.run_trimmomatic(
            fastq, work_dir, trim_mode, phred_encode, trim_settings)
        trimmed_fastq = trim_outputs[0]

        # Run quality filtering, then discard the trimmed intermediate
        filtered_fastq = fastx.Run_Quality_Filter(
            trimmed_fastq,
            output_dir=work_dir,
            quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmed_fastq)
        filtered_fastqs.append(filtered_fastq)

    print('Annotating processed fastq files')
    annotated_files = []
    for fastq in filtered_fastqs:
        alignment_path = useful.removeFileExtension(fastq) + '.mixcr.alignment'
        annotation_path = useful.removeFileExtension(fastq) + '.mixcr.annotation'

        # Run MIXCR file (only the command value of the result pair is needed)
        print('Running MIXCR')
        _, command_val = mixcr.RunMixcr(
            fastq,
            alignment_path,
            filetype='FASTQ',
            loci=[],
            species='',
            exportPrettyAlignment=False,
            num_threads=number_threads)

        # Parse MIXCR file
        print('Parsing MIXCR')
        parsed = mixcr.parseMIXCR(
            fastq, alignment_path, 'FASTQ', annotation_path,
            command_val=command_val)
        # NOTE(review): the parse result is stored whole here and later passed
        # to os.path.dirname(); confirm parseMIXCR returns a path string.
        annotated_files.append(parsed)

    print('Pairing sequences')
    pairing_output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(
        annotated_files,
        annotated_file_formats='TAB',
        analysis_method='MIXCR',
        output_folder_path=pairing_output_dir,
        prefix_output_files=group_name,
        cluster_cutoff=cluster_setting,
        annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')
def isotype_sequences(input_file,
                      input_file_type,
                      barcode_file='',
                      output_file=None,
                      output_format='TAB',
                      seq_var='sequence',
                      header_var='header',
                      helper_fields={},
                      alignment_settings={},
                      analysis_name=None):
    """Align isotype barcodes against every sequence of a file and write the calls.

    Parameters:
        input_file: path of the sequence file, read through
            ``readwrite.immunogrepFile``.
        input_file_type: forwarded as ``filetype`` to the reader.
        barcode_file: path of a barcode file.  When empty, or when the path
            is not an existing file, ``defaultBarcodes()`` is used instead.
        output_file: destination path.  When ``None`` or equal to
            ``input_file`` it becomes the input path without its extension
            plus ``'.isotype.annotation'``.
        output_format: forwarded to ``writeSeqResult``; for ``'TAB'`` or
            ``'CSV'`` a tab-joined header row of ``FileDelimFields`` is
            written first.
        seq_var: key of the nucleotide sequence in each input row.
        header_var: key of the sequence header in each input row.
        helper_fields: optional mapping; the keys ``'recombination_var'`` and
            ``'end_of_ab_field'`` name input-row fields that hold the
            recombination type and the end position of the antibody region.
            Missing keys default to ``''``.  The argument is deep-copied and
            never modified.
        alignment_settings: optional mapping with the keys
            ``'penalize_truncations'`` (default True),
            ``'minimum_alignment_length'`` (15), ``'search_rc'`` (2),
            ``'allowed_mismatches_in_alignment'`` (2) and
            ``'strand_corrected'`` (False).  The argument is deep-copied and
            never modified.
        analysis_name: when truthy, stored upper-cased under
            ``'ANALYSIS_NAME'`` in the translator line.

    Returns:
        The path of the output file that was written.
    """
    helper = defaultdict(str, copy.deepcopy(helper_fields))
    recombination_var = helper['recombination_var']
    end_of_ab_field = helper['end_of_ab_field']

    settings = copy.deepcopy(alignment_settings)
    penalize_truncations = settings.get('penalize_truncations', True)
    minimum_alignment_length = settings.get('minimum_alignment_length', 15)
    # search_rc:
    #   0 => only consider barcodes as provided
    #   1 => only consider the reverse complement of barcodes provided
    #   2 => consider both strands
    search_rc = settings.get('search_rc', 2)
    allowed_mismatches_in_alignment = settings.get(
        'allowed_mismatches_in_alignment', 2)
    # True  => the sequence field already holds the SENSE antibody strand.
    # False => both the sequence and its reverse complement are tried.
    strand_corrected = settings.get('strand_corrected', False)

    translator_field = copy.deepcopy(translator)
    if analysis_name:
        translator_field['ANALYSIS_NAME'] = analysis_name.upper()
    translator_field = {translation_var: translator_field}

    if output_file is None or output_file == input_file:
        output_file = useful.removeFileExtension(
            input_file) + '.isotype.annotation'

    # Rows are only sliced when (end position - overlap_len) lies in the read.
    overlap_len = 10
    # Forwarded as ``nmin`` to the barcode aligner.
    guessed_num_bases_after_jgene = 60

    found = 0
    summary_data = {
        'found': 0,
        'top_isotype': defaultdict(int),
        'average_mismatch': 0,
        'average_num_isotype_found': 0
    }

    # The context manager guarantees the handle is closed even when reading
    # or aligning raises part-way through the file.
    with open(output_file, 'w') as outHandle:
        # Translator line first, so that downstream tools know how to map the
        # results into the database.
        outHandle.write(
            descriptor_symbol + json.dumps(translator_field) + '\n')
        if output_format == 'TAB' or output_format == 'CSV':
            outHandle.write('\t'.join(FileDelimFields) + '\n')

        if not barcode_file:
            barcodeSeqList = defaultBarcodes()
        elif not os.path.isfile(barcode_file):
            print('Barcode file not found! Using default barcodes')
            barcodeSeqList = defaultBarcodes()
        else:
            barcodeSeqList = readBarcodeFile(barcode_file)

        command_string = json.dumps({
            'Barcodes': barcodeSeqList,
            'mismatch_cutoff': allowed_mismatches_in_alignment,
            'penalize_truncations': penalize_truncations,
            'minimum_length_cutoff': minimum_alignment_length
        })

        # First pass over the file: longest sequence and sequence count.
        iffile = readwrite.immunogrepFile(filelocation=input_file,
                                          filetype=input_file_type)
        [maxLen, numSeq] = maxSeqLen(iffile, seq_var)

        isotype_predictor = fft_tools.BarcodeAligner(
            barcodeSeqList,
            penalize_truncations,
            search_rc,
            allowed_mismatches_in_alignment,
            minimum_alignment_length,
            nmax=maxLen,
            nmin=guessed_num_bases_after_jgene)

        counter = 0
        startPer = 0
        print("Starting isotyping analysis for {0} sequences".format(numSeq))
        start_time = int(round(time.time()))

        # Second pass: align every sequence.
        iffile = readwrite.immunogrepFile(filelocation=input_file,
                                          filetype=input_file_type)
        for line_row in iffile.read():
            if not line_row:
                continue

            jsonVar = {}
            if header_var in line_row:
                if idIdentifier in line_row:
                    jsonVar[idIdentifier] = line_row[idIdentifier]
                    jsonVar['Header'] = line_row[header_var]
                else:
                    header, seq_id = GrabAdditionalHeaderInfo(
                        line_row[header_var])
                    jsonVar[idIdentifier] = seq_id
                    jsonVar['Header'] = header

            if seq_var not in line_row or line_row[seq_var] == '':
                jsonVar['Sequence'] = ''
                jsonVar['Notes'] = 'No sequence found'
                writeSeqResult(outHandle, jsonVar, output_format)
                continue

            # Let the user monitor what percent of sequences were processed.
            startPer = useful.LoopStatus(counter, numSeq, 10, startPer)
            counter += 1

            seqFwd = line_row[seq_var]
            jsonVar['Sequence'] = seqFwd
            jsonVar['Command'] = command_string

            if strand_corrected:
                all_seqs = [seqFwd]
            else:
                all_seqs = [seqFwd, str(Seq(seqFwd).reverse_complement())]

            # The end-of-antibody position is a property of the row, so it is
            # parsed once instead of once per strand.
            end_of_ab = None
            if (end_of_ab_field in line_row
                    and line_row[end_of_ab_field] != ''):
                try:
                    end_of_ab = int(line_row[end_of_ab_field])
                except (TypeError, ValueError, OverflowError):
                    # Unparseable position: fall back to 0.
                    end_of_ab = 0

            found_strand = ''
            isotypes_results = None
            for pos, each_seq in enumerate(all_seqs):
                # Only consider nucleotides AFTER the end of the ab field.
                # NOTE(review): the guard subtracts overlap_len but the slice
                # starts at end_of_ab itself, and the same offset is applied
                # to the reverse complement -- confirm both are intended.
                if (end_of_ab is not None
                        and 0 <= end_of_ab - overlap_len < len(each_seq)):
                    each_seq = each_seq[end_of_ab:]
                isotypes_results = isotype_predictor.AlignToSeq(each_seq)
                if isotypes_results:
                    found_strand = strand_orientation_list[pos]
                    break

            has_recomb = (recombination_var in line_row
                          and line_row[recombination_var])
            if isotypes_results:
                found += 1
                # Alignment fields override same-named keys already present.
                jsonVar.update(isotypes_results.items())
                jsonVar['Sequence strand'] = found_strand
                if has_recomb:
                    # Always trust the recombination type from the input file.
                    jsonVar['Recombination type'] = line_row[recombination_var]
                else:
                    # Not provided: guess it from the top isotype call.
                    jsonVar['Recombination type'] = GuessRecombType(
                        jsonVar['Isotype'][0])
                summary_data['top_isotype'][jsonVar['Isotype'][0]] += 1
                summary_data['average_num_isotype_found'] += len(
                    jsonVar['Isotype'])
                summary_data['average_mismatch'] += jsonVar['Mismatches'][0]
            else:
                if has_recomb:
                    jsonVar['Recombination type'] = line_row[recombination_var]
                jsonVar['Isotype'] = ''
                jsonVar['Notes'] = (
                    'Could not identify isotype with alignment score above '
                    'threshold')
                summary_data['top_isotype']['NotFound'] += 1
            writeSeqResult(outHandle, jsonVar, output_format)

        end_time = int(round(time.time()))

    summary_data['found'] = found
    if found:
        summary_data['average_mismatch'] = (
            summary_data['average_mismatch'] / float(found))
        summary_data['average_num_isotype_found'] = (
            summary_data['average_num_isotype_found'] / float(found))

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("time: ")
    print(end_time - start_time)
    print("Summary of identified isotypes:")
    print(summary_data)
    return output_file
def run_gglab_pipeline(input_files, species, loci, group_name=""):
    """Run the paired-end pre-processing and MiXCR annotation steps.

    ``input_files`` is iterated as a sequence of ``[R1, R2]`` path pairs.
    Per pair: members ending in ``.gz`` are gunzipped and replaced in the
    pair, trimmomatic is run when the module-level ``trim_seqs`` is truthy,
    PEAR stitches the two reads, the stitched file is quality filtered and
    the PEAR output is removed.  Each filtered file is then aligned and
    parsed with MiXCR.

    Settings come from module-level names.  ``species``, ``loci`` and
    ``group_name`` are not read by this body (MiXCR receives ``loci=[]`` and
    ``species=""``).  The function returns nothing.
    """
    print("Processing raw fastq files")
    processed = []
    for pair in input_files:
        pair_dir = os.path.dirname(pair[0])

        # Unzip files; the unzipped path replaces the original entry.
        for position, member in enumerate(pair):
            if member.endswith(".gz"):
                print("Unzipping: ", member)
                pair[position] = useful.gunzip_python(member)

        # Run trimmomatic
        stitch_inputs = pair
        if trim_seqs:
            print("Trimming low quality bases")
            trimming_parameters = {
                "SLIDINGWINDOW": str(window_trim) + ":" + str(quality_cutoff_trim),
                "MINLEN": min_read_len_post_trim,
            }
            stitch_inputs = processing.run_trimmomatic(
                pair, pair_dir, "PE", phred_encode, trimming_parameters
            )

        # Stitch R1-R2 files
        pairing_parameters = {
            "v": min_overlap_length,
            "m": max_assembly_length,
            "n": min_assembly_length,
            "u": max_fraction_uncalled,
        }
        print("Stitching R1-R2 reads")
        stitched = processing.run_pear(
            stitch_inputs[0],
            stitch_inputs[1],
            working_directory=pair_dir,
            parameters=pairing_parameters,
            num_threads=number_threads,
            memory=pear_memory,
        )[0]

        # Run quality filtering and drop the stitched intermediate
        processed.append(
            fastx.Run_Quality_Filter(
                stitched,
                output_dir=pair_dir,
                quality=quality_cutoff,
                percent=percent_bases,
            )
        )
        os.remove(stitched)

    print("Annotating processed fastq files")
    annotated_files = []
    for filtered in processed:
        mixcr_alignment = useful.removeFileExtension(filtered) + ".mixcr.alignment"
        mixcr_annotation = useful.removeFileExtension(filtered) + ".mixcr.annotation"

        # Run MIXCR file
        print("Running MIXCR")
        mixcr_result = mixcr.RunMixcr(
            filtered,
            mixcr_alignment,
            filetype="FASTQ",
            loci=[],
            species="",
            exportPrettyAlignment=False,
            num_threads=number_threads,
        )
        # Two-element result; only the second (command value) is used below.
        [_unused_output, command_val] = mixcr_result

        # Parse MIXCR file
        print("Parsing MIXCR")
        parse_result = mixcr.parseMIXCR(
            filtered,
            mixcr_alignment,
            "FASTQ",
            mixcr_annotation,
            command_val=command_val,
        )
        annotated_files.append(parse_result[0])
    print("Pipeline complete")
def Descriptive_Statistics(list_of_files,
                           input_file_type,
                           analysis_name='',
                           exp_names=[],
                           output_file_prefix='',
                           fields={},
                           statistics_to_run=['ab_aa', 'cdr3', 'vgene', 'jgene', 'vjgene']):
    """Summarise annotated antibody sequences from one or more experiments.

    Depending on ``statistics_to_run`` the function writes a file of unique
    antibody amino-acid sequences (``'ab_aa'``), unique CDR3 counts plus
    length/diversity plots (``'cdr3'``), V gene usage (``'vgene'``), J gene
    usage (``'jgene'``) and combined V-J usage (``'vjgene'``), followed by a
    summary text file.

    Parameters:
        list_of_files: one path or a list of paths; each entry is treated as
            one experiment.  For ``input_file_type == 'IMGT'`` an entry may
            itself be a list of files.
        input_file_type: forwarded to ``readfile.immunogrepFile``.
        analysis_name: upper-cased and used to look up default field names in
            ``fields_for_analysis``.
        exp_names: experiment names; ignored unless there is exactly one name
            per experiment.  When absent (and the type is not ``'IMGT'``),
            de-duplicated file base names are used.
        output_file_prefix: prefix of every generated file; defaults to the
            first input without its extension.
        fields: mapping from logical field (``'vgene'``, ``'cdr3'``, ...) to
            the column name in the input; overrides the analysis defaults.
        statistics_to_run: names of the statistics to compute.

    The list/dict defaults are never mutated.

    Returns:
        ``{'files': [...], 'figures': [...]}`` with the paths generated.

    Raises:
        Exception: when the analysis is empty, ``'CUSTOM'`` or unsupported
            and no ``fields`` were supplied.
    """
    analysis_name = analysis_name.upper()
    if input_file_type == 'IMGT' and not isinstance(list_of_files[0], list):
        list_of_files = [list_of_files]
    elif not isinstance(list_of_files, list):
        list_of_files = [list_of_files]

    if len(exp_names) != len(list_of_files):
        exp_names = []

    # By default, save results to the same folder as the input file.
    # NOTE(review): for IMGT input the first entry is a list of paths;
    # confirm removeFileExtension accepts that.
    if not output_file_prefix:
        output_file_prefix = useful.removeFileExtension(list_of_files[0])

    supported_analyses = fields_for_analysis.keys()
    if (not analysis_name or analysis_name == 'CUSTOM' or analysis_name not in supported_analyses) and not fields:
        raise Exception('The required fields for the provided analysis, {0}, is not currently automated. Please explicity provide the fields names'.format(str(analysis_name)))

    # Start from the default fields of the analysis ...
    if analysis_name in supported_analyses:
        fields_to_use = copy.deepcopy(fields_for_analysis[analysis_name])
    else:
        fields_to_use = {}
    # ... then apply user-defined fields on top of them.
    for f, name in fields.items():
        fields_to_use[f] = name

    filenames_to_use = [f[0] if isinstance(f, list) else f for f in list_of_files]
    print('Performing descriptive statistics at {0}.'.format(str(datetime.datetime.now())))
    print('Analyzing the following files:\n\t {0}'.format('\n\t'.join(filenames_to_use)))

    unique_aa_file = None
    unique_cdr3_file = None
    v_gene_analysis = None
    j_gene_analysis = None
    vj_gene_analysis = None
    gene_analysis_plot = output_file_prefix
    plots_created = []
    gene_summary_file = output_file_prefix + '.summary_of_stats.txt'

    aa_files = ['AB AA SEQUENCE', 'RECOMBINATION_TYPE', 'LOCUS', 'CDR1', 'CDR2', 'CDR3', 'STOP CODONS', 'PRODUCTIVE', 'VGENES', 'DGENES', 'JGENES', 'TOTAL COUNTS']
    fields_order = ['full_len_ab', 'recomb', 'locus', 'cdr1', 'cdr2', 'cdr3', 'stopc', 'functionality', 'vgene', 'dgene', 'jgene']
    num_exp = len(list_of_files)

    # Default experiment names: file base names, suffixed with _1, _2, ...
    # when the same base name occurs more than once.
    if not exp_names and input_file_type != 'IMGT':
        exp_names = []
        for file_path in list_of_files:
            count = 1
            str_file = os.path.basename(file_path)
            while str_file in exp_names:
                str_file = os.path.basename(file_path) + '_' + str(count)
                count += 1
            exp_names.append(str_file)

    cdr3analysis = 'cdr3' in statistics_to_run
    aaanalysis = 'ab_aa' in statistics_to_run

    vjgene_dict = defaultdict(lambda: defaultdictgenes(num_exp))
    cdr3_dict_vdj = defaultdict(lambda: defaultdictcdr3(num_exp))
    cdr3_dict_vj = defaultdict(lambda: defaultdictcdr3(num_exp))
    cdr3_dict_unk = defaultdict(lambda: defaultdictcdr3(num_exp))

    # Snapshot the column names BEFORE the helper keys below are added, so
    # only fields that come from the input are requested from the reader
    # (a plain .values() would be a live view on Python 3).
    use_these_fields = list(fields_to_use.values())
    fields_to_use['stopc'] = 'stopc'

    num_results = [0] * num_exp
    num_cdr3 = [0] * num_exp
    num_stop_codon = [0] * num_exp
    num_vdj = [0] * num_exp
    num_vj = [0] * num_exp
    num_sequences = [0] * num_exp

    # The recombination field may be missing or empty; .get() covers both, in
    # which case the type is guessed per row from the gene calls.
    if not fields_to_use.get('recomb'):
        recomb_not_defined = True
        fields_to_use['recomb'] = 'recomb'
    else:
        recomb_not_defined = False

    col = fields_to_use  # short alias used inside the row loop

    aa_handle = None
    if aaanalysis:
        # Rows are first dumped to a temp file that is sorted/collapsed later.
        intermediate_file = output_file_prefix + '.unique_aa_file_temp'
        aa_handle = open(intermediate_file, 'w')

    print('Reading through sequences in file(s)')
    seqnum = 1
    try:
        for fnum, each_file in enumerate(list_of_files):
            annotated_file = readfile.immunogrepFile(each_file, input_file_type, field_names=use_these_fields)
            for seq_lines in annotated_file.read():
                if not seq_lines:
                    continue
                if seqnum % 500000 == 0:
                    print('Read {0} sequences'.format(str(seqnum)))
                seqnum += 1
                num_sequences[fnum] += 1
                seq_lines = defaultdict(str, seq_lines)

                # Count rows that carry a full-length antibody sequence.
                if seq_lines[col['full_len_ab']]:
                    num_results[fnum] += 1

                # Only keep the first gene in the list and drop the allele
                # name (the part after '*').
                seq_lines[col['vgene']] = seq_lines[col['vgene']].split(',')[0].split('*')[0]
                seq_lines[col['dgene']] = seq_lines[col['dgene']].split(',')[0].split('*')[0]

                # If no recombination type is provided, guess it from the
                # V gene call and, failing that, from the J gene call.
                if recomb_not_defined or not seq_lines[col['recomb']]:
                    r = ''
                    for gene_field in ('vgene', 'jgene'):
                        if r:
                            break
                        gene_call = seq_lines[col[gene_field]]
                        if not gene_call:
                            continue
                        gn = ProcessGene(gene_call)
                        # Check the first three characters, then the first
                        # two (IGBLAST reports TA rather than TRA).
                        if gn[:3] in recomb_call:
                            r = recomb_call[gn[:3]]
                        elif gn[:2] in recomb_call:
                            r = recomb_call[gn[:2]]
                    seq_lines[col['recomb']] = r

                if not seq_lines[col['recomb']]:
                    continue

                if seq_lines[col['recomb']] == 'VDJ':
                    num_vdj[fnum] += 1
                elif seq_lines[col['recomb']] == 'VJ':
                    num_vj[fnum] += 1

                seq_lines[col['jgene']] = seq_lines[col['jgene']].split(',')[0].split('*')[0]
                seq_lines['stopc'] = 'YES' if '*' in seq_lines[col['full_len_ab']] else 'NO'
                if seq_lines['stopc'] == 'YES':
                    num_stop_codon[fnum] += 1

                if aaanalysis:
                    # Selected fields in a fixed order, followed by the
                    # 1-based experiment number the row came from.
                    exp_str = str(fnum + 1)
                    aa_handle.write('\t'.join([seq_lines[col[f]] for f in fields_order]) + '\t' + str(exp_str) + '\n')

                if seq_lines[col['vgene']] or seq_lines[col['jgene']]:
                    key_v = delim.join([seq_lines[col['vgene']], seq_lines[col['jgene']], seq_lines[col['recomb']]])
                    vjgene_dict[key_v][fnum] += 1

                if not seq_lines[col['cdr3']]:
                    # no cdr3 found
                    continue

                num_cdr3[fnum] += 1
                if cdr3analysis:
                    key = seq_lines[col['cdr3']]
                    if seq_lines[col['recomb']] == 'VDJ':
                        cdr3_dict_vdj[key][fnum] += 1
                    elif seq_lines[col['recomb']] == 'VJ':
                        cdr3_dict_vj[key][fnum] += 1
                    else:
                        print('unknown recombination types: ', seq_lines[col['recomb']])
                        cdr3_dict_unk[key][fnum] += 1

                # NOTE(review): this cap looks like a debugging leftover (the
                # progress message above fires every 500000 rows).  It is
                # kept as found; its loop level was reconstructed from
                # unindented source -- confirm or remove.
                if seqnum > 10000:
                    break
    finally:
        # Close the temp file even when reading fails part-way through.
        if aa_handle is not None:
            aa_handle.close()

    if aaanalysis:
        print('Generating a file of unique AB amino acid sequences')
        unique_aa_file = output_file_prefix + '.unique_aa_file.txt'
        # Collapse the temp file into unique amino-acid sequences.
        GenerateAAFile(intermediate_file, unique_aa_file, aa_files, exp_names)

    # NOTE(review): DataFrame.sort (used below) was removed from newer pandas
    # releases in favour of sort_values; left as is for the pandas version
    # this file targets.
    if set(['vgene', 'jgene', 'vjgene']) & set(statistics_to_run):
        # vjgene_dict: key = vgene/jgene/recomb joined by `delim`,
        # value = list of counts, one per experiment.
        gene_df = pd.DataFrame(vjgene_dict).transpose()
        if 'VGENE' not in gene_df.columns:
            gene_df['VGENE'] = ''
        if 'JGENE' not in gene_df.columns:
            gene_df['JGENE'] = ''
        if 'recomb' not in gene_df.columns:
            gene_df['recomb'] = ''
        gene_df['TOTAL_COUNTS'] = gene_df.sum(axis=1)
        gene_df = gene_df.reset_index()
        gene_df = gene_df.apply(ModifyPDTable, axis=1, args=(['VGENE', 'JGENE', 'recomb'], delim))

        # Rename the columns 0..num_exp-1 to the experiment names.
        new_names = dict(enumerate(exp_names))
        gene_df = gene_df.rename(columns=new_names)
        # gene_df columns: one per experiment, then VGENE, JGENE, recomb,
        # TOTAL_COUNTS.

        if 'vgene' in statistics_to_run:
            print('Performing V gene analysis')
            v_gene_analysis = output_file_prefix + '.vgenes.txt'
            sorted_v_counts = gene_df.groupby(['recomb', 'VGENE']).sum()
            vgene_level = sorted_v_counts.index.names.index('VGENE')
            # Remove results where the V gene is empty.
            if '' in list(sorted_v_counts.index.levels[vgene_level]):
                sorted_v_counts = sorted_v_counts.drop('', level='VGENE')
            ignore_counts = ['TOTAL_COUNTS', 'JGENE']
            keep_col = [n for n in sorted_v_counts.columns if n not in ignore_counts]
            g = sorted_v_counts[keep_col]

            # Plot the frequency for every experiment (PlotGeneDist adds the
            # .png extension itself).
            recomb_values = list(g.index.levels[g.index.names.index('recomb')])
            if 'VDJ' in recomb_values:
                vdj_g = g.xs('VDJ', level='recomb')
                PlotGeneDist(vdj_g / vdj_g.sum(), gene_analysis_plot + '.vdj.vgenes', 'VH Gene Distribution', 'Frequency', 'V Gene', max_val=None, min_val=0)
                plots_created.append(gene_analysis_plot + '.vdj.vgenes.png')
            if 'VJ' in recomb_values:
                vj_g = g.xs('VJ', level='recomb')
                PlotGeneDist(vj_g / vj_g.sum(), gene_analysis_plot + '.vj.vgenes', 'VL Gene Distribution', 'Frequency', 'V Gene', max_val=None, min_val=0)
                plots_created.append(gene_analysis_plot + '.vj.vgenes.png')
            sorted_v_counts.reset_index().sort(['recomb', 'TOTAL_COUNTS'], ascending=[1, 0]).iloc[:, :-1].to_csv(v_gene_analysis, sep='\t', index=False)

        # Same as above, for J genes.
        if 'jgene' in statistics_to_run:
            print('Performing J gene analysis')
            j_gene_analysis = output_file_prefix + '.jgenes.txt'
            sorted_j_counts = gene_df.groupby(['recomb', 'JGENE']).sum()
            jgene_level = sorted_j_counts.index.names.index('JGENE')
            if '' in list(sorted_j_counts.index.levels[jgene_level]):
                sorted_j_counts.drop('', level='JGENE', inplace=True)
            ignore_counts = ['TOTAL_COUNTS', 'VGENE']
            keep_col = [n for n in sorted_j_counts.columns if n not in ignore_counts]
            g = sorted_j_counts[keep_col]
            sorted_j_counts.reset_index().sort(['recomb', 'TOTAL_COUNTS'], ascending=[1, 0]).iloc[:, :-1].to_csv(j_gene_analysis, sep='\t', index=False)

            recomb_values = list(g.index.levels[g.index.names.index('recomb')])
            if 'VDJ' in recomb_values:
                vdj_g = g.xs('VDJ', level='recomb')
                PlotGeneDist(vdj_g / vdj_g.sum(), gene_analysis_plot + '.vdj.jgenes', 'JH Gene Distribution', 'Frequency', 'J Gene', max_val=None, min_val=0, step=5)
                plots_created.append(gene_analysis_plot + '.vdj.jgenes.png')
            if 'VJ' in recomb_values:
                vj_g = g.xs('VJ', level='recomb')
                PlotGeneDist(vj_g / vj_g.sum(), gene_analysis_plot + '.vj.jgenes', 'JL Gene Distribution', 'Frequency', 'J Gene', max_val=None, min_val=0, step=5)
                plots_created.append(gene_analysis_plot + '.vj.jgenes.png')

        # V-J gene analysis (heat map) for each experiment.
        if 'vjgene' in statistics_to_run:
            print('Performing V-J gene analysis')
            vj_gene_analysis = output_file_prefix + '.v_and_jgene_analysis.txt'
            # Empty V/J calls are renamed to ' No call', then counts are
            # summed per (recomb, VGENE, JGENE) group.
            vj_df = gene_df.replace([''], [' No call']).groupby(['recomb', 'VGENE', 'JGENE']).sum()
            vj_df.to_csv(vj_gene_analysis, sep='\t')
            vj_df.drop('TOTAL_COUNTS', axis=1, inplace=True)

            # Look the level up on vj_df itself: the frame `g` from the V/J
            # sections does not exist when only 'vjgene' was requested.
            recomb_level = vj_df.index.names.index('recomb')
            recomb_values = list(vj_df.index.levels[recomb_level])
            if 'VDJ' in recomb_values:
                v1 = vj_df.loc['VDJ', :] / vj_df.loc['VDJ', :].sum()
                PlotVJGeneHeatMap(v1, gene_analysis_plot + '.vdj.v_and_jgene_analysis', max_val=None, min_val=None)
                plots_created.append(gene_analysis_plot + '.vdj.v_and_jgene_analysis.png')
            if 'VJ' in recomb_values:
                v2 = vj_df.loc['VJ', :] / vj_df.loc['VJ', :].sum()
                PlotVJGeneHeatMap(v2, gene_analysis_plot + '.vj.v_and_jgene_analysis', max_val=None, min_val=None)
                plots_created.append(gene_analysis_plot + '.vj.v_and_jgene_analysis.png')
            del vj_df
        del gene_df

    # CDR3 analysis
    cdr3_length_stats = {}
    diversity_measurements = {}
    if cdr3analysis:
        unique_cdr3_file = output_file_prefix + '.unique_cdr3_counts.txt'
        print('Performing CDR3 analyisis')
        if sum(num_cdr3) > 0:
            # One dataframe per recombination type, stacked under a
            # (recomb, CDR3) index.
            print('Loading CDR3s into a dataframe')
            cdr3_df_list = [pd.DataFrame.from_dict(c, orient='index') for c in [cdr3_dict_vdj, cdr3_dict_vj, cdr3_dict_unk]]
            keys = ['VDJ', 'VJ', 'UNK']
            cdr3_df = pd.concat(cdr3_df_list, keys=keys)
            cdr3_df['TOTAL_COUNTS'] = cdr3_df.sum(axis=1)
            print('Dataframe created')
            cdr3_df.index.names = ['recomb', 'CDR3']
            cdr3_df = cdr3_df.reset_index()
            cdr3_df['CDR3_LENGTH'] = cdr3_df.CDR3.map(len)

            # Rename the count columns to match the experiment names.
            new_names = dict(enumerate(exp_names))
            cdr3_df = cdr3_df.rename(columns=new_names)
            cdr3_df.sort(['recomb', 'TOTAL_COUNTS'], ascending=[1, 0], inplace=True)
            cdr3_df.set_index(['recomb', 'CDR3'], inplace=True)
            # Save the dataframe as a tab-delimited file.
            cdr3_df.to_csv(unique_cdr3_file, sep='\t')

            cdr3_length_stats = PlotCDR3Histogram(cdr3_df, gene_analysis_plot + '.cdr3_length_histogram')
            plots_created.append(gene_analysis_plot + '.cdr3_length_histogram.png')
            diversity_measurements = CalculateDiversities(cdr3_df, gene_analysis_plot + '.cdr3_diversity_plots')
            plots_created.append(gene_analysis_plot + '.cdr3_diversity_plots.png')
            del cdr3_df

    print('Writing summary to file')
    # Results text file that summarizes all of the information above.
    GenerateResultsSummaryFile(gene_summary_file, statistics_to_run, list_of_files, exp_names, unique_aa_file, unique_cdr3_file, v_gene_analysis, j_gene_analysis, vj_gene_analysis, plots_created, num_sequences, num_results, num_vdj, num_vj, num_cdr3, num_stop_codon, cdr3_length_stats, diversity_measurements)

    files_generated = [gene_summary_file]
    for optional_file in (unique_aa_file, unique_cdr3_file, v_gene_analysis, j_gene_analysis, vj_gene_analysis):
        if optional_file:
            files_generated.append(optional_file)

    print('Descriptive statistics completed at {0}.'.format(str(datetime.datetime.now())))
    gc.collect()
    return {'files': files_generated, 'figures': plots_created}
def isotype_sequences(input_file,input_file_type,barcode_file='',output_file=None,output_format='TAB',seq_var='sequence',header_var='header',helper_fields = {},alignment_settings = {},analysis_name = None):
    """Align every sequence in ``input_file`` to isotype barcodes and write one result row per sequence.

    Parameters:
        input_file: path of the sequence file to read.
        input_file_type: file type handed to ``readwrite.immunogrepFile``.
        barcode_file: path of a barcode file. When empty, or when the path is
            not an existing file, ``defaultBarcodes()`` is used instead.
        output_file: path of the result file. When ``None`` or equal to
            ``input_file`` the input path with its extension replaced by
            ``.isotype.annotation`` is used.
        output_format: format handed to ``writeSeqResult``; for ``'TAB'`` and
            ``'CSV'`` a header row built from ``FileDelimFields`` is written.
        seq_var: key of the sequence in each input row.
        header_var: key of the sequence header in each input row.
        helper_fields: optional dict naming extra input columns. The keys read
            here are ``'recombination_var'`` and ``'end_of_ab_field'``; missing
            keys default to ``''``.
        alignment_settings: optional dict overriding alignment defaults. Keys
            read here are ``'penalize_truncations'`` (True),
            ``'minimum_alignment_length'`` (15), ``'search_rc'`` (2),
            ``'allowed_mismatches_in_alignment'`` (2) and
            ``'strand_corrected'`` (False).
        analysis_name: when truthy, stored upper-cased under
            ``'ANALYSIS_NAME'`` in the translator line.

    Returns:
        The path of the output file that was written.
    """
    # Both dict arguments are deep-copied before use, so the mutable defaults
    # in the signature are never modified by this function.
    help_1 = defaultdict(str, copy.deepcopy(helper_fields))
    recombination_var = help_1['recombination_var']
    end_of_ab_field = help_1['end_of_ab_field']

    al_1 = copy.deepcopy(alignment_settings)
    penalize_truncations = al_1.get('penalize_truncations', True)
    minimum_alignment_length = al_1.get('minimum_alignment_length', 15)
    # 0 => only consider barcodes as provided
    # 1 => only consider the reverse complement of the barcodes provided
    # 2 => consider both strands
    search_rc = al_1.get('search_rc', 2)
    allowed_mismatches_in_alignment = al_1.get('allowed_mismatches_in_alignment', 2)
    # True: the sequence field already holds the sense strand of the antibody
    # gene. False: both the sequence and its reverse complement are tried.
    strand_corrected = al_1.get('strand_corrected', False)

    translator_field = copy.deepcopy(translator)
    if analysis_name:
        translator_field['ANALYSIS_NAME'] = analysis_name.upper()
    translator_field = {translation_var: translator_field}

    if output_file is None or output_file == input_file:
        output_file = useful.removeFileExtension(input_file) + '.isotype.annotation'

    # The context manager guarantees the output handle is closed even when
    # reading, aligning or writing raises part-way through the file.
    with open(output_file, 'w') as outHandle:
        # Translator line: describes how results map onto the database.
        outHandle.write(descriptor_symbol + json.dumps(translator_field) + '\n')
        if output_format == 'TAB' or output_format == 'CSV':
            # NOTE(review): the header is tab-joined for 'CSV' as well; confirm
            # this matches the delimiter writeSeqResult uses for CSV rows.
            outHandle.write('\t'.join(FileDelimFields) + '\n')

        if not barcode_file:
            barcodeSeqList = defaultBarcodes()
        elif not os.path.isfile(barcode_file):
            print('Barcode file not found! Using default barcodes')
            barcodeSeqList = defaultBarcodes()
        else:
            barcodeSeqList = readBarcodeFile(barcode_file)

        # Serialized settings stored with every result row under 'Command'.
        command_string = json.dumps({'Barcodes':barcodeSeqList,'mismatch_cutoff':allowed_mismatches_in_alignment,'penalize_truncations':penalize_truncations,'minimum_length_cutoff':minimum_alignment_length})

        # First pass over the input: longest sequence and sequence count.
        iffile = readwrite.immunogrepFile(filelocation=input_file, filetype=input_file_type)
        [maxLen, numSeq] = maxSeqLen(iffile, seq_var)

        guessed_num_bases_after_jgene = 60
        isotype_predictor = fft_tools.BarcodeAligner(
            barcodeSeqList, penalize_truncations, search_rc,
            allowed_mismatches_in_alignment, minimum_alignment_length,
            nmax=maxLen, nmin=guessed_num_bases_after_jgene)

        overlap_len = 10
        counter = 0
        startPer = 0
        found = 0
        print("Starting isotyping analysis for {0} sequences".format(numSeq))
        a = int(round(time.time()))

        # Second pass: a fresh reader is created for the alignment loop
        # (presumably because the first one was consumed by maxSeqLen).
        iffile = readwrite.immunogrepFile(filelocation=input_file, filetype=input_file_type)
        summary_data = {'found':0,'top_isotype':defaultdict(int),'average_mismatch':0,'average_num_isotype_found':0}

        for line_row in iffile.read():
            jsonVar = {}
            if not line_row:
                continue

            if header_var in line_row:
                if idIdentifier in line_row:
                    jsonVar[idIdentifier] = line_row[idIdentifier]
                    jsonVar['Header'] = line_row[header_var]
                else:
                    # No id column: recover the id from the header text.
                    [header, seq_id] = GrabAdditionalHeaderInfo(line_row[header_var])
                    jsonVar[idIdentifier] = seq_id
                    jsonVar['Header'] = header

            if seq_var not in line_row or line_row[seq_var] == '':
                jsonVar['Sequence'] = ''
                jsonVar['Notes'] = 'No sequence found'
                writeSeqResult(outHandle, jsonVar, output_format)
                continue

            # Lets the user monitor what percent of sequences were processed.
            startPer = useful.LoopStatus(counter, numSeq, 10, startPer)

            jsonVar['Sequence'] = line_row[seq_var]
            jsonVar['Command'] = command_string
            counter += 1

            seqFwd = jsonVar['Sequence']
            if strand_corrected:
                all_seqs = [seqFwd]
            else:
                all_seqs = [seqFwd, str(Seq(seqFwd).reverse_complement())]

            found_strand = ''
            for pos, each_seq in enumerate(all_seqs):
                # Only consider nucleotides after the end of the antibody
                # region when the input row provides that position.
                if end_of_ab_field in line_row and line_row[end_of_ab_field] != '':
                    try:
                        end_of_ab = int(line_row[end_of_ab_field])
                    except (TypeError, ValueError):
                        # Unparseable position: fall back to the full sequence.
                        end_of_ab = 0
                    # NOTE(review): overlap_len is used in the bounds check but
                    # not in the slice itself; confirm whether the slice was
                    # meant to start at end_of_ab - overlap_len.
                    if end_of_ab - overlap_len < len(each_seq) and end_of_ab - overlap_len >= 0:
                        each_seq = each_seq[end_of_ab:]
                isotypes_results = isotype_predictor.AlignToSeq(each_seq)
                if isotypes_results:
                    found_strand = strand_orientation_list[pos]
                    break

            if isotypes_results:
                found += 1
                # Alignment fields override any same-named keys already set.
                jsonVar.update(isotypes_results)
                jsonVar['Sequence strand'] = found_strand
                if recombination_var in line_row and line_row[recombination_var]:
                    # Always trust the recombination type from the input file.
                    jsonVar['Recombination type'] = line_row[recombination_var]
                else:
                    # Not provided: guess it from the top isotype hit.
                    jsonVar['Recombination type'] = GuessRecombType(jsonVar['Isotype'][0])
                summary_data['top_isotype'][jsonVar['Isotype'][0]] += 1
                summary_data['average_num_isotype_found'] += len(jsonVar['Isotype'])
                summary_data['average_mismatch'] += jsonVar['Mismatches'][0]
            else:
                if recombination_var in line_row and line_row[recombination_var]:
                    # Always trust the recombination type from the input file.
                    jsonVar['Recombination type'] = line_row[recombination_var]
                jsonVar['Isotype'] = ''
                jsonVar['Notes'] = 'Could not identify isotype with alignment score above threshold'
                summary_data['top_isotype']['NotFound'] += 1

            writeSeqResult(outHandle, jsonVar, output_format)

    b = int(round(time.time()))
    summary_data['found'] = found
    if found:
        # Turn the running totals into per-hit averages.
        summary_data['average_mismatch'] = summary_data['average_mismatch'] / float(found)
        summary_data['average_num_isotype_found'] = summary_data['average_num_isotype_found'] / float(found)

    totaltime = b - a
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("time: ")
    print(totaltime)
    print("Summary of identified isotypes:")
    print(summary_data)

    return output_file
def run_gglab_pipeline(input_files, species, loci, group_name=''):
    """Run trimming, quality filtering, MIXCR annotation and pairing on single-end fastq files.

    Parameters:
        input_files: iterable of fastq file paths; paths ending in ``.gz``
            are unzipped first.
        species: accepted for interface compatibility; not used by this
            function (see the review note at the MIXCR call).
        loci: accepted for interface compatibility; not used by this
            function (see the review note at the MIXCR call).
        group_name: prefix passed to ``pairing.RunPairing`` for its output
            files.

    Trimming, filtering, threading and clustering settings are read from
    module-level names (``window_trim``, ``quality_cutoff_trim``,
    ``min_read_len_post_trim``, ``phred_encode``, ``quality_cutoff``,
    ``percent_bases``, ``number_threads``, ``cluster_setting``,
    ``annotation_cluster_cutoff``).
    """
    print('Processing raw fastq files')
    processed_files = []
    for f in input_files:
        # Intermediate files are written next to the original input file.
        folder_path = os.path.dirname(f)
        if f.endswith('.gz'):
            # Format into one string: print('a', b) would print a tuple repr
            # under the Python 2 print statement used elsewhere in this file.
            print('Unzipping: {0}'.format(f))
            f = useful.gunzip_python(f)

        # Run trimmomatic in single-end mode
        trimming_parameters = {
            'SLIDINGWINDOW': str(window_trim) + ':' + str(quality_cutoff_trim),
            'MINLEN': min_read_len_post_trim
        }
        method = 'SE'
        trimmedf = processing.run_trimmomatic(
            f, folder_path, method, phred_encode, trimming_parameters)[0]

        # Run quality filtering, then discard the intermediate trimmed file
        filtered_trimmed_file = fastx.Run_Quality_Filter(
            trimmedf, output_dir=folder_path, quality=quality_cutoff,
            percent=percent_bases)
        os.remove(trimmedf)
        processed_files.append(filtered_trimmed_file)

    print('Annotating processed fastq files')
    annotated_files = []
    for f in processed_files:
        base_name = useful.removeFileExtension(f)
        output_file = base_name + '.mixcr.alignment'
        output_file_annotation = base_name + '.mixcr.annotation'

        print('Running MIXCR')
        # NOTE(review): the species and loci arguments of this function are
        # not forwarded; MIXCR is always called with loci=[] and species=''.
        # Confirm whether they should be passed through.
        _, command_val = mixcr.RunMixcr(
            f, output_file, filetype='FASTQ', loci=[], species='',
            exportPrettyAlignment=False, num_threads=number_threads)

        print('Parsing MIXCR')
        # The returned path is expected to equal output_file_annotation.
        annotated_file = mixcr.parseMIXCR(
            f, output_file, 'FASTQ', output_file_annotation,
            command_val=command_val)
        annotated_files.append(annotated_file)

    print('Pairing sequences')
    # Pairing output goes next to the first annotated file.
    output_dir = os.path.dirname(annotated_files[0])
    pairing.RunPairing(
        annotated_files, annotated_file_formats='TAB',
        analysis_method='MIXCR', output_folder_path=output_dir,
        prefix_output_files=group_name, cluster_cutoff=cluster_setting,
        annotation_cluster_setting=annotation_cluster_cutoff)
    print('Pipeline complete')