def trim_ORF(core_dna, end_j_seq):
    """Cut trailing nucleotides off ``core_dna`` so it ends with ``end_j_seq``.

    One, two and then three nucleotides are dropped from the 3' end. For each
    candidate, the last ``3 * len(end_j_seq)`` remaining nucleotides are
    translated with ``Bio.Seq.translate`` and compared with ``end_j_seq``.

    Returns:
        ``core_dna`` without the dropped tail for the first candidate whose
        translation equals ``end_j_seq``. If no candidate matches, an error is
        logged and ``None`` is returned.
    """
    window_len = 3 * len(end_j_seq)
    for dropped in (1, 2, 3):
        window_end = len(core_dna) - dropped
        window_start = window_end - window_len
        translated_tail = Bio.Seq.translate(core_dna[window_start:window_end])
        if end_j_seq == translated_tail:
            return core_dna[:window_end]

    # None of the three trimmings reproduced end_j_seq.
    logger.error(
        f'\n{"#"*50}\nSomething went wrong with the ORF!!\n{core_dna}\n{Bio.Seq.transcribe(core_dna)}\n{end_j_seq}\n{"#"*50}'
    )
    return None
def get_meta_data(line_tokens, chain, isotype, core_dna, core_aa, cdr3,
                  best_v_family_col, best_d_family_col, best_j_family_col):
    """Build the annotation record of one read.

    The V/D/J family names are the ``<chain>V<digits>`` / ``<chain>D<digits>`` /
    ``<chain>J<digits>`` prefixes of the tokens found in the given columns.

    Args:
        line_tokens: the tab-split fields of one alignment line.
        chain: chain prefix used to build the family regexes (e.g. ``'IGH'``).
        isotype, core_dna, core_aa, cdr3: copied into the record unchanged.
        best_v_family_col, best_d_family_col, best_j_family_col: indices into
            ``line_tokens`` of the V, D and J assignment tokens.

    Returns:
        ``[chain, isotype, core_dna, core_aa, cdr3, v_type, d_type, j_type]``.
        A family is ``'unknown'`` when its token is empty (D only) or does not
        start with the expected prefix; the latter case is logged as an error
        instead of failing on ``None.group()``.
    """

    def family_of(segment, col, may_be_missing=False):
        token = line_tokens[col]
        if may_be_missing and not token:
            return 'unknown'
        # raw string: '\d' in a plain literal is an invalid escape sequence
        found = re.match(chain + segment + r'\d+', token)
        if found:
            return found.group()
        logger.error(line_tokens)
        logger.error(token)
        return 'unknown'

    v_type = family_of('V', best_v_family_col)
    # d assignment is sometimes missing
    d_type = family_of('D', best_d_family_col, may_be_missing=True)
    j_type = family_of('J', best_j_family_col)
    return [chain, isotype, core_dna, core_aa, cdr3, v_type, d_type, j_type]
def remove_colliding_entries_from_default_dict(alternative_taxon_dict, default_taxon_dict):
    """Drop from ``default_taxon_dict`` every gene that the alternative dict also names.

    For each gene name returned by ``get_gene_names(alternative_taxon_dict)``
    that is also among the default dict's gene names, the first entry of
    ``default_taxon_dict['genes']`` whose ``'name'`` equals it is removed,
    together with the first entry of ``default_taxon_dict['sequenceFragments']``
    whose ``'uri'`` ends with that name. ``default_taxon_dict`` is modified in
    place; nothing is returned.

    If the gene is found but no sequence fragment is, an error is logged and
    only the gene entry is removed. (Previously ``pop(-1)`` was executed in
    that case, which deleted the last, unrelated sequence fragment.)
    """
    alternative_gene_names = get_gene_names(alternative_taxon_dict)
    default_gene_names = set(get_gene_names(default_taxon_dict))

    for gene_name in alternative_gene_names:
        if gene_name not in default_gene_names:
            continue

        genes = default_taxon_dict['genes']
        gene_to_remove = next(
            (i for i, gene_dict in enumerate(genes)
             if gene_dict['name'] == gene_name),
            None)

        sequence_fragments = default_taxon_dict['sequenceFragments']
        sequence_to_remove = next(
            (i for i, sequence_dict in enumerate(sequence_fragments)
             if sequence_dict['uri'].endswith(gene_name)),
            None)

        if gene_to_remove is None:
            continue

        genes.pop(gene_to_remove)
        if sequence_to_remove is None:
            logger.error(f'Did not find sequence for {gene_name}')
        else:
            sequence_fragments.pop(sequence_to_remove)
def _count_non_blank_lines(path):
    """Return the number of lines in ``path`` that are not whitespace-only."""
    with open(path) as handle:  # closed deterministically, unlike a bare open()
        return sum(1 for line in handle if line.rstrip() != '')


def _report_fastq_format_error(error_path, err_msg):
    """Log ``err_msg``, write it to ``error_path`` and raise ``ValueError``."""
    logger.error(err_msg)
    with open(error_path, 'w') as error_path_f:
        error_path_f.write(err_msg)
    raise ValueError(err_msg)


def verify_fastq_files_format(error_path, fastq1, fastq2):
    """Check that two paired fastq files have a plausible line structure.

    Blank lines are ignored. Each file must contain a multiple of 4 lines and
    both files must contain the same number of lines.

    Args:
        error_path: file that receives the error message when a check fails.
        fastq1, fastq2: paths of the two fastq files.

    Raises:
        ValueError: on the first failed check, after the message was logged
            and written to ``error_path``. ``fastq2`` is only read once
            ``fastq1`` passed its own check.
    """
    # Last character of the directory holding fastq1, e.g. '1' for
    # /bioseq/data/results/asap/154832296135203243128690777655/reads/run1/R1.fastq
    # Slicing (instead of indexing) yields '' when fastq1 has no directory part.
    rep_num = os.path.basename(os.path.dirname(fastq1))[-1:]
    name1 = os.path.split(fastq1)[-1]
    name2 = os.path.split(fastq2)[-1]

    num_lines_fastq1 = _count_non_blank_lines(fastq1)
    logger.info(f'{num_lines_fastq1} lines in {fastq1}')
    if num_lines_fastq1 % 4 != 0:
        _report_fastq_format_error(
            error_path,
            f'Illegal fastq file format: number of lines in {name1} of rep {rep_num} is not a multiple of 4. One or more records are faulty.')

    num_lines_fastq2 = _count_non_blank_lines(fastq2)
    logger.info(f'{num_lines_fastq2} lines in {fastq2}')
    if num_lines_fastq2 % 4 != 0:
        _report_fastq_format_error(
            error_path,
            f'Illegal fastq file format: number of lines in {name2} of rep {rep_num} is not a multiple of 4. One or more records are faulty.')

    if num_lines_fastq1 != num_lines_fastq2:
        _report_fastq_format_error(
            error_path,
            f'Illegal fastq files format: {name1} and {name2} of rep {rep_num} contain different number of lines ({num_lines_fastq1} and {num_lines_fastq2}, respectively).')
def get_mixcr_cmds(lib_path, fastq_path, outpath, MMU, remote_run, error_path):
    """Assemble the MiXCR command lines for the fastq pair found in ``fastq_path``.

    ``outpath`` is created if it does not exist. The R1/R2 files are the
    entries of ``fastq_path`` whose names contain ``'fastq'`` and ``'R1'`` /
    ``'R2'`` respectively (the last such entry listed wins); they are validated
    with ``verify_fastq_files_format`` before any command is built.

    Args:
        lib_path: library path; everything from the first ``".json"`` on is cut off.
        fastq_path: directory that holds the two fastq files.
        outpath: directory that the commands write their outputs to.
        MMU: truthy selects species ``mouse``, otherwise ``human``.
        remote_run: truthy prefixes the preset file with ``CONSTS.ASAP_EXEC``.
        error_path: forwarded to ``verify_fastq_files_format``.

    Returns:
        ``(align_cmd, assemble_cmd, exportAlignments_cmd, exportClones_cmds)``;
        the first three are command strings, the last is always an empty list
        (no exportClones commands are generated).

    Raises:
        OSError: if the R1 or the R2 file was not found.
    """
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    logger.debug(f'fastq path: {fastq_path}')
    logger.debug(f'os.path.join(fastq_path, "R1.fastq"): {os.path.join(fastq_path, "R1.fastq")}')

    fastq1 = fastq2 = ''
    for file_name in os.listdir(fastq_path):
        if 'fastq' not in file_name:
            continue
        candidate = os.path.join(fastq_path, file_name)
        if 'R1' in file_name:
            fastq1 = candidate
        elif 'R2' in file_name:
            fastq2 = candidate
    logger.info(f'fastq files paths are:\n{fastq1}\n{fastq2}')

    if not os.path.exists(fastq1):
        logger.error('R1.fastq is missing...')
        raise OSError('R1.fastq does not exist...')
    if not os.path.exists(fastq2):
        logger.error('R2.fastq is missing...')
        raise OSError('R2.fastq does not exist...')

    verify_fastq_files_format(error_path, fastq1, fastq2)

    vdjca_path = os.path.join(outpath, 'alignments.vdjca')
    clones_clns_path = os.path.join(outpath, 'clones.clns')

    species = "mouse" if MMU else "human"
    # mixcr requires lib name without json suffix!!
    library_name = lib_path.split(".json")[0]

    align_cmd = ' '.join([
        'mixcr align',
        '-f',                          # overwrite the output file if it exists
        f'-s {species}',               # species (mouse/human)
        '-c IGH,IGL,IGK',              # immunological chain gene(s) to align
        f'--library {library_name}',
        '-a',                          # save reads' ids from the fastq files
        '--threads 4',
        f'{fastq1} {fastq2}',          # input: the two fastq files
        vdjca_path,                    # output
    ])

    assemble_cmd = ' '.join([
        'mixcr assemble',
        '-r ' + outpath + '/assemble_report.txt',  # report file
        '-f',                          # overwrite the output file if it exists
        f'-i {outpath}/index_file',    # mapping between initial reads and final clones
        '-OseparateByC=true',          # separate by isotypes
        '--threads 4',
        vdjca_path,                    # input: VDJCA from the align step
        clones_clns_path,              # output
    ])

    if remote_run:
        preset_prefix = CONSTS.ASAP_EXEC + "/"
    else:
        preset_prefix = ""
    exportAlignments_cmd = ' '.join([
        'mixcr exportAlignments',
        '-f',                                          # overwrite the output file if it exists
        f'--preset-file {preset_prefix}aln_fields.txt',  # fields to export
        f'-cloneIdWithMappingType {outpath}/index_file',
        vdjca_path,                                    # input: VDJCA from the align step
        f'{outpath}/alignments.txt',                   # output
    ])

    # exportClones commands are currently not generated.
    exportClones_cmds = []

    return align_cmd, assemble_cmd, exportAlignments_cmd, exportClones_cmds
def parse_alignment_file(mixcr_output_path, parsed_mixcr_output_path,
                         sequence_annotation_file_suffix,
                         mutations_file_suffix, len_threshold, qlty_threshold):
    '''Parse MiXCR's ``alignments.txt`` and write the per-chain result files.

    Every line after the header is filtered in this order: no overlap between
    the paired reads, core DNA shorter than ``len_threshold``, average quality
    below ``qlty_threshold``, empty CDR3, stop codon in the core amino-acid
    sequence, and (IGH only) no acceptable end-of-J sequence. Surviving reads
    are counted per chain / isotype / amino-acid sequence and annotated.

    Args:
        mixcr_output_path: directory that contains ``alignments.txt``.
        parsed_mixcr_output_path: directory that receives all output files.
        sequence_annotation_file_suffix: appended to ``<out>/<chain>`` to name
            the annotation file of each chain.
        mutations_file_suffix: appended to ``<out>/<chain>`` to name the
            mutations file of each chain.
        len_threshold: minimal length of the core DNA.
        qlty_threshold: minimal average quality of a read.

    Returns:
        None. Written under ``parsed_mixcr_output_path``, for every chain that
        has data: the mutations file, the annotation file and
        ``<chain>_AA_to_DNA_reads.fasta``; plus ``alignment_report.log`` and
        ``alignment_report.png``.
    '''
    # column indices of the relevant data from mixcr's output (for more details see 'alignments.txt' file)
    overlapped_reads = 0
    quality = 1
    accession_number = 2
    DNA_FR1 = 4
    DNA_FR4 = 10
    AA_FR1 = 11
    AA_CDR3 = 16
    AA_FR4 = 17
    best_v_family = 23
    best_d_family = 24
    best_j_family = 25
    best_v_alignment = 31

    # quality character -> quality value, offset 33: '!' -> 0, ..., 'K' -> 42.
    # Characters outside this range are not in the table (KeyError).
    ascii_to_quality_dict = {chr(33 + q): q for q in range(43)}

    # a tab is sometimes used instead of a space inside the read id; the
    # pattern is compiled once instead of on every line
    tab_in_read_id = re.compile('\t([012]:N:[012]:[012])')

    t1 = time.time()

    allowed_chain_types = ['IGH', 'IGK', 'IGL', 'unknown']  # do not use += or append here!

    total_lines = 1  # Shifted by one because of the header. More convenient when looking in notepad++...

    sequences_frequency_counter = {}  # core_aa -> number of reads

    # One independent dict per chain. Don't use dict.fromkeys here: it would
    # make all chains share a single dict.
    chain_to_aa_read_to_meta_data_dict = {chain: {} for chain in allowed_chain_types}
    chain_to_core_dna_to_mutations_info_dict = {chain: {} for chain in allowed_chain_types}
    chain_to_core_aa_to_dna_reads_and_accession_numbers = {chain: {} for chain in allowed_chain_types}

    pseudo_count = 1

    chain_to_count_dict = dict.fromkeys(allowed_chain_types, 0)
    isotypes_count_dict = dict.fromkeys(
        ['A', 'A1', 'A2', 'D', 'E', 'G', 'M', 'unknown'], 0)
    errors_count_dict = dict.fromkeys([
        'no_overlap', 'too_short_length', 'too_low_quality', 'missing_cdr3',
        'nonsense_stop_codon', 'inappropriate_end_j_seq'
    ], 0)

    alignments_txt_path = os.path.join(mixcr_output_path, 'alignments.txt')

    logger.info('Start parsing {}'.format(alignments_txt_path))
    with open(alignments_txt_path) as f:
        logger.info('File was opened succssefully.')
        # the first line is the header; it is only logged
        header = f.readline()
        logger.info('First line of file is:\n{}'.format(header))

        # iterate over alignments file line by line
        for line in f:
            line = tab_in_read_id.sub(r' \1', line)
            line_tokens = line.split('\t')

            # count total number of entries provided by mixcr alignment
            total_lines += 1
            logger.debug(total_lines)
            if total_lines % 100000 == 0:
                logger.info('total_lines: {}'.format(total_lines))

            # If the first token contains two sequences (separated by a comma) it means that
            # MiXCR was unable to find an overlap between the two paired-end reads.
            if ',' in line_tokens[overlapped_reads]:
                errors_count_dict['no_overlap'] += 1
                continue

            chain = line_tokens[best_v_family][:3]

            dna_read = line_tokens[overlapped_reads]

            # a combination that should generate the relevant part of the antibody dna
            # (from the end of the 5' primer until the end of the end_j_seq)
            core_dna = ''.join(line_tokens[DNA_FR1:DNA_FR4 + 1])

            # discard too short core dna's
            read_len = len(core_dna)
            if read_len < len_threshold:
                errors_count_dict['too_short_length'] += 1
                continue

            # discard low quality reads
            sequencing_quality = line_tokens[quality]
            # NOTE(review): the qualities of the whole quality token are summed
            # but divided by the length of core_dna, not by the number of
            # quality characters - confirm that this is intended.
            average_quality = sum(
                ascii_to_quality_dict[k] for k in sequencing_quality) / read_len
            if average_quality < qlty_threshold:
                errors_count_dict['too_low_quality'] += 1
                continue

            # verify CDR3 is present
            cdr3 = line_tokens[AA_CDR3]
            if cdr3 == '':
                errors_count_dict['missing_cdr3'] += 1
                continue

            # this should be the translation of the core_dna
            core_aa = ''.join(line_tokens[AA_FR1:AA_FR4 + 1])

            if logger.level <= 10:  # debug mode
                # sanity checks
                if core_aa != Bio.Seq.translate(
                        core_dna[:len(core_dna) // 3 * 3]):
                    logger.debug(
                        'core_aa is NOT identical to the translated core_dna')
                    logger.debug('core_aa:\n{}'.format(core_aa))
                    logger.debug('translated core_dna:\n{}'.format(
                        Bio.Seq.translate(core_dna)))
                if (core_dna not in dna_read) and (not core_aa.endswith('VTVS_')):
                    logger.debug('dna_read:\n{}'.format(dna_read))
                    logger.debug('core dna:\n{}'.format(core_dna))
                if not core_aa.endswith(aa.end_j_seq):
                    logger.debug('end_j_seq after fixation is: {}'.format(
                        core_aa[-len(aa.end_j_seq[0]):]))

            # verify that core_aa is not non-sense
            if '*' in core_aa:
                logger.debug(
                    'line {} in alignment.txt file: STOP codon in core_aa!!!\n{}'
                    .format(total_lines, core_aa))
                errors_count_dict['nonsense_stop_codon'] += 1
                continue

            # verify that there is a proper end_j_seq.
            # MUST be after making sure that '*' is NOT in core_aa (otherwise it makes problems with the regex).
            if chain == 'IGH':
                has_end_j_seq = False
                for end_j_seq in aa.end_j_seq:  # aa.end_j_seq is a tuple with at least one string
                    if match_with_up_to_k_mismatches(
                            core_aa[-len(end_j_seq):], end_j_seq):
                        has_end_j_seq = True
                        break
                    if match_with_up_to_k_mismatches(
                            core_aa[-len(end_j_seq) - 1:-1], end_j_seq):
                        # in case of 'VTVSS_', remove last (full/partial) "codon"
                        core_aa = core_aa[:-1]
                        # update current end_j_seq. Maybe it's with one mismatch
                        end_j_seq = core_aa[-len(end_j_seq):]
                        # sometimes it's a full codon, sometimes partial
                        # NOTE(review): trim_ORF logs an error and yields None
                        # when no trimming fits; core_dna is then None below.
                        core_dna = trim_ORF(core_dna, end_j_seq)
                        has_end_j_seq = True
                        break

                if not has_end_j_seq:
                    logger.debug(
                        'IGH with no end_j_seq in core_aa:\n{}'.format(core_aa))
                    errors_count_dict['inappropriate_end_j_seq'] += 1
                    continue

            # no more filtrations after this point!!

            if chain not in allowed_chain_types:
                logger.error('chain_type {} not in {}'.format(
                    chain, allowed_chain_types))
                logger.error(line_tokens)
                chain = 'unknown'

            # update chain counts
            chain_to_count_dict[chain] += 1

            if chain == 'IGH':
                isotype = get_isotype(dna_read, core_dna, end_j_seq)
                # update isotype counts
                isotypes_count_dict[isotype] += 1
            else:
                # No need to count these
                isotype = 'NONE'

            # update aa_sequence counts
            sequences_frequency_counter[core_aa] = \
                sequences_frequency_counter.get(core_aa, 0) + 1

            # set annotation for the (unique) aa_sequence (only for the first time)
            if core_aa not in chain_to_aa_read_to_meta_data_dict[chain]:
                chain_to_aa_read_to_meta_data_dict[chain][core_aa] = get_meta_data(
                    line_tokens, chain, isotype, core_dna, core_aa, cdr3,
                    best_v_family, best_d_family, best_j_family)

            # update mutation counts and Ka_Ks for the (unique) dna_sequence (only for the first time)
            if core_dna not in chain_to_core_dna_to_mutations_info_dict[chain]:
                # extract mutations field from column number best_v_alignment that looks like this:
                # 1|292|312|21|313|SG5CI8ASG15CSA36CSG90ASA91GDC95I98GSC143TSC148ASC218TSC259A|1288.0
                mutations_field = line_tokens[best_v_alignment].split("|")[5]
                update_mutation_count(
                    core_dna, mutations_field,
                    chain_to_core_dna_to_mutations_info_dict[chain],
                    pseudo_count)

            # track mapping between each aa sequence and the reads behind it
            # (append in place instead of rebuilding the list for every read)
            chain_to_core_aa_to_dna_reads_and_accession_numbers[chain].setdefault(
                core_aa, []).append((core_dna, line_tokens[accession_number]))

    for chain in allowed_chain_types:
        core_dna_to_mutations_info_dict = chain_to_core_dna_to_mutations_info_dict[chain]
        if core_dna_to_mutations_info_dict != {}:
            mutations_info_file = parsed_mixcr_output_path + '/' + chain + mutations_file_suffix
            write_dict_to_file(mutations_info_file,
                               core_dna_to_mutations_info_dict,
                               value_type=list,
                               header='dna' + '\t' + ';'.join([
                                   'Ka_per_codon', 'Ks_per_codon',
                                   'number_of_baspair_mutations'
                               ]))

        aa_read_to_meta_data_dict = chain_to_aa_read_to_meta_data_dict[chain]
        if aa_read_to_meta_data_dict != {}:
            annotation_path = parsed_mixcr_output_path + '/' + chain + sequence_annotation_file_suffix
            with open(annotation_path, 'w') as annotation_f:
                annotation_f.write('\t'.join([
                    'chain', 'isotype', 'dna', 'aa', 'missing_cdr3', 'v_type',
                    'd_type', 'j_type', 'counts'
                ]) + '\n')
                for core_aa in aa_read_to_meta_data_dict:
                    annotation_f.write('\t'.join(
                        aa_read_to_meta_data_dict[core_aa] +
                        [str(sequences_frequency_counter[core_aa])]) + '\n')

        core_aa_to_dna_reads_and_accession_numbers = \
            chain_to_core_aa_to_dna_reads_and_accession_numbers[chain]
        if core_aa_to_dna_reads_and_accession_numbers != {}:
            aa_to_read_and_accession_path = os.path.join(
                parsed_mixcr_output_path, chain + '_AA_to_DNA_reads.fasta')
            write_mapping_file(core_aa_to_dna_reads_and_accession_numbers,
                               aa_to_read_and_accession_path)

    t2 = time.time()

    logger.debug('sum(isotypes_count_dict.values():' +
                 str(sum(isotypes_count_dict.values())))

    outfile_report = parsed_mixcr_output_path + '/alignment_report.log'
    write_reports(outfile_report, t1, t2, errors_count_dict, total_lines,
                  chain_to_count_dict, isotypes_count_dict)

    # Swap only the extension: str.replace('log', 'png') would also rewrite
    # any "log" that happens to appear in the directory part of the path.
    outfile_pie_chart = os.path.splitext(outfile_report)[0] + '.png'
    # isotypes_count_dict always holds its eight keys, so this is always true
    if isotypes_count_dict:
        generate_alignment_report_pie_chart(outfile_pie_chart,
                                            isotypes_count_dict)