def filterSeq(seq_file, filter_func, filter_args={}, out_args=default_out_args, nproc=None, queue_size=None): """ Filters sequences by fraction of ambiguous nucleotides Arguments: seq_file = the sequence file to filter filter_func = the function to use for filtering sequences filter_args = a dictionary of arguments to pass to filter_func out_args = common output argument dictionary from parseCommonArgs nproc = the number of processQueue processes; if None defaults to the number of CPUs queue_size = maximum size of the argument queue; if None defaults to 2*nproc Returns: a list of successful output file names """ # Define output file label dictionary cmd_dict = {filterLength:'length', filterMissing:'missing', filterRepeats:'repeats', filterQuality:'quality', maskQuality:'maskqual', trimQuality:'trimqual'} # Print parameter info log = OrderedDict() log['START'] = 'FilterSeq' log['COMMAND'] = cmd_dict.get(filter_func, filter_func.__name__) log['FILE'] = os.path.basename(seq_file) for k in sorted(filter_args): log[k.upper()] = filter_args[k] log['NPROC'] = nproc printLog(log) # Check input type in_type = getFileType(seq_file) if in_type != 'fastq' and filter_func in (filterQuality, maskQuality, trimQuality): sys.exit('ERROR: Input file must be FASTQ for %s mode' % cmd_dict[filter_func]) # Define feeder function and arguments feed_func = feedSeqQueue feed_args = {'seq_file': seq_file} # Define worker function and arguments work_func = processSeqQueue work_args = {'process_func': filter_func, 'process_args': filter_args} # Define collector function and arguments collect_func = collectSeqQueue collect_args = {'seq_file': seq_file, 'task_label': cmd_dict[filter_func], 'out_args': out_args} # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log result['log']['END'] = 'FilterSeq' printLog(result['log']) return result['out_files']
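# Hedged usage sketch (not part of the original tool): a minimal example of
# dispatching filterSeq with the length filter defined in this module. It
# assumes filterLength and default_out_args are in scope, the 'min_length'
# keyword is accepted by filterLength, and the FASTQ path is a placeholder.
def _example_filter_length():
    """Run filterSeq with a minimum-length filter on a hypothetical FASTQ file."""
    out_files = filterSeq('reads.fastq',                    # hypothetical input file
                          filter_func=filterLength,         # filter function from this module
                          filter_args={'min_length': 250},  # assumed keyword of filterLength
                          nproc=2)
    return out_files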
def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, parse_scores=False, parse_regions=False, parse_junction=False, out_args=default_out_args): """ Main for IMGT aligned sample sequences. Arguments: aligner_output : zipped file or unzipped folder output by IMGT. seq_file : FASTA file input to IMGT (from which to get seqID). no_parse : if ID is to be parsed for pRESTO output with default delimiters. partial : If True put incomplete alignments in the pass file. parse_scores : if True add alignment score fields to output file. parse_regions : if True add FWR and CDR region fields to output file. out_args : common output argument dictionary from parseCommonArgs. Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDb' log['ALIGNER'] = 'IMGT' log['ALIGNER_OUTPUT'] = aligner_output log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' log['NO_PARSE'] = no_parse log['PARTIAL'] = partial log['SCORES'] = parse_scores log['REGIONS'] = parse_regions log['JUNCTION'] = parse_junction printLog(log) start_time = time() printMessage('Loading sequence files', start_time=start_time, width=25) # Extract IMGT files temp_dir, imgt_files = extractIMGT(aligner_output) # Count records in IMGT files total_count = countDbFile(imgt_files['summary']) # Get (parsed) IDs from fasta file submitted to IMGT id_dict = getIDforIMGT(seq_file) if seq_file else {} printMessage('Done', start_time=start_time, end=True, width=25) # Parse IMGT output and write db with open(imgt_files['summary'], 'r') as summary_handle, \ open(imgt_files['gapped'], 'r') as gapped_handle, \ open(imgt_files['ntseq'], 'r') as ntseq_handle, \ open(imgt_files['junction'], 'r') as junction_handle: parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, parse_scores=parse_scores, parse_regions=parse_regions, parse_junction=parse_junction) file_prefix = getFilePrefix(aligner_output, out_args) writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, partial=partial, out_args=out_args) # Cleanup temp directory temp_dir.cleanup() return None
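# Hedged usage sketch (assumption, not from the original source): calling the
# IMGT parser defined immediately above on a hypothetical IMGT/HighV-QUEST
# archive together with the FASTA file that was submitted, so the original
# sequence IDs can be recovered. All paths are placeholders.
def _example_parse_imgt():
    """Parse a hypothetical IMGT output archive into a tab-delimited database."""
    parseIMGT('imgt_output.zip',        # hypothetical IMGT archive
              seq_file='input.fasta',   # hypothetical FASTA submitted to IMGT
              no_parse=False,           # parse pRESTO-style annotations from the ID
              parse_scores=True,
              parse_regions=True)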
def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, region_fields=False, out_args=default_out_args): """ Main for IMGT aligned sample sequences Arguments: imgt_output = zipped file or unzipped folder output by IMGT seq_file = FASTA file input to IMGT (from which to get seqID) no_parse = if ID is to be parsed for pRESTO output with default delimiters score_fields = if True add alignment score fields to output file region_fields = if True add FWR and CDR region fields to output file out_args = common output argument dictionary from parseCommonArgs Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDb' log['ALIGNER'] = 'IMGT' log['ALIGN_RESULTS'] = imgt_output log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' log['NO_PARSE'] = no_parse log['SCORE_FIELDS'] = score_fields log['REGION_FIELDS'] = region_fields printLog(log) # Get individual IMGT result files temp_dir, imgt_files = extractIMGT(imgt_output) # Formalize out_dir and file-prefix if not out_args['out_dir']: out_dir = os.path.dirname(os.path.abspath(imgt_output)) else: out_dir = os.path.abspath(out_args['out_dir']) if not os.path.exists(out_dir): os.mkdir(out_dir) if out_args['out_name']: file_prefix = out_args['out_name'] else: file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] file_prefix = os.path.join(out_dir, file_prefix) total_count = countDbFile(imgt_files[0]) # Get (parsed) IDs from fasta file submitted to IMGT id_dict = getIDforIMGT(seq_file) if seq_file else {} # Create imgt_dict = readIMGT(imgt_files, score_fields=score_fields, region_fields=region_fields) writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, score_fields=score_fields, region_fields=region_fields, out_args=out_args) # Delete temp directory rmtree(temp_dir)
def addDbFile(db_file, fields, values, out_args=default_out_args): """ Adds field and value pairs to a database file Arguments: db_file = the database file name fields = a list of fields to add values = a list of values to assign to all rows of each field out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'add' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['VALUES'] = ','.join(values) printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields) # Count records result_count = countDbFile(db_file) # Define fields and values to append add_dict = {k:v for k,v in zip(fields, values) if k not in db_iter.fieldnames} # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Write updated row rec.update(add_dict) pass_writer.writerow(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args): """ Adds an index column to a database file Arguments: db_file = the database file name field = the name of the index field to add out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'index' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file, add_fields=field) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Add count and write updated row rec.update({field: rec_count}) pass_writer.writerow(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
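# Hedged usage sketch: adding a 1-based index column to a hypothetical
# tab-delimited database file. The input path and field name are placeholders.
def _example_index_db():
    """Append an index column to a hypothetical database file."""
    return indexDbFile('db.tab', field='INDEX')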
def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, out_args=default_out_args): """ Main for IgBLAST aligned sample sequences. Arguments: aligner_output : IgBLAST output file to process. seq_file : fasta file input to IgBlast (from which to get sequence). repo : folder with germline repertoire files. no_parse : if ID is to be parsed for pRESTO output with default delimiters. partial : If True put incomplete alignments in the pass file. parse_regions : if True add FWR and CDR fields to output file. parse_scores : if True add alignment score fields to output file. parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST out_args : common output argument dictionary from parseCommonArgs. Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['ALIGNER'] = 'IgBlast' log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse log['PARTIAL'] = partial log['SCORES'] = parse_scores log['REGIONS'] = parse_regions printLog(log) start_time = time() printMessage('Loading sequence files', start_time=start_time, width=25) # Count records in sequence file total_count = countSeqFile(seq_file) # Get input sequence dictionary seq_dict = getSeqDict(seq_file) # Create germline repo dictionary repo_dict = readRepo(repo) printMessage('Done', start_time=start_time, end=True, width=25) # Parse and write output with open(aligner_output, 'r') as f: parse_iter = IgBLASTReader(f, seq_dict, repo_dict, parse_scores=parse_scores, parse_regions=parse_regions, parse_igblast_cdr3=parse_igblast_cdr3) file_prefix = getFilePrefix(aligner_output, out_args) writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, no_parse=no_parse, partial=partial, out_args=out_args) return None
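# Hedged usage sketch: parsing hypothetical IgBLAST output. It assumes the
# germline repertoire directory contains the FASTA files readRepo expects;
# the output file name, FASTA path, and repertoire folder are placeholders.
def _example_parse_igblast():
    """Build a database from hypothetical IgBLAST results."""
    parseIgBLAST('igblast_output.fmt7',   # hypothetical IgBLAST output file
                 seq_file='input.fasta',  # hypothetical FASTA passed to IgBLAST
                 repo='germlines/',       # hypothetical germline repertoire folder
                 parse_scores=True,
                 parse_regions=True)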
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to drop
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, region_fields=False, out_args=default_out_args): """ Main for IgBlast aligned sample sequences Arguments: igblast_output = IgBlast output file to process seq_file = fasta file input to IgBlast (from which to get sequence) repo = folder with germline repertoire files no_parse = if ID is to be parsed for pRESTO output with default delimiters score_fields = if True add alignment score fields to output file region_fields = if True add FWR and CDR region fields to output file out_args = common output argument dictionary from parseCommonArgs Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['ALIGNER'] = 'IgBlast' log['ALIGN_RESULTS'] = os.path.basename(igblast_output) log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse log['SCORE_FIELDS'] = score_fields log['REGION_FIELDS'] = region_fields printLog(log) # Get input sequence dictionary seq_dict = getSeqforIgBlast(seq_file) # Formalize out_dir and file-prefix if not out_args['out_dir']: out_dir = os.path.split(igblast_output)[0] else: out_dir = os.path.abspath(out_args['out_dir']) if not os.path.exists(out_dir): os.mkdir(out_dir) if out_args['out_name']: file_prefix = out_args['out_name'] else: file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) file_prefix = os.path.join(out_dir, file_prefix) total_count = countSeqFile(seq_file) # Create repo_dict = getRepo(repo) igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, score_fields=score_fields, region_fields=region_fields) writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, score_fields=score_fields, region_fields=region_fields, out_args=out_args)
def writeOffsetFile(primer_file, align_func=runMuscle, align_args={}, reverse=False, out_args=default_out_args): """ Generates an offset table from a sequence file Arguments: primer_file = name of file containing primer sequences align_func = the function to use to align sequence sets align_args = a dictionary of arguments to pass to align_func reverse = if True count tail gaps; if False count head gaps out_args = common output argument dictionary from parseCommonArgs Returns: the name of the offset output file """ log = OrderedDict() log['START'] = 'AlignSets' log['COMMAND'] = 'table' log['FILE'] = os.path.basename(primer_file) log['REVERSE'] = reverse printLog(log) # Read primer file primers = readPrimerFile(primer_file) # Get offset dictionary seq_list = [SeqRecord(Seq(v, IUPAC.ambiguous_dna), id=k) for k, v in primers.items()] offset_dict = getOffsets(seq_list, align_func, align_args, reverse) # Print log and write offsets to file log = OrderedDict() for s in seq_list: log[s.id] = '%s %i' % (s.seq, offset_dict[s.id]) printLog(log) # Write offset table out_tag = 'reverse' if reverse else 'forward' with getOutputHandle(primer_file, 'offsets-%s' % out_tag, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') as out_handle: for k, v in offset_dict.items(): out_handle.write('%s\t%i\n' % (k, v)) # Print final log log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['END'] = 'AlignSets' printLog(log) return out_handle.name
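# Illustrative sketch only (not the module's getOffsets): given gapped, aligned
# primer strings, the offset recorded in the table written above is the number
# of leading gap characters per primer, or trailing gaps when reverse=True
# (matching the head-gap/tail-gap behavior described in the docstring).
def _example_count_offsets(aligned, reverse=False):
    """Count head (or tail) gaps for each aligned primer string in a dict."""
    offsets = {}
    for name, seq in aligned.items():
        probe = seq[::-1] if reverse else seq
        offsets[name] = len(probe) - len(probe.lstrip('-'))
    return offsets

# Example: _example_count_offsets({'P1': '---ACGT', 'P2': 'ACGTACG'}) -> {'P1': 3, 'P2': 0}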
def dropDbFile(db_file, fields, out_file=None, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to drop.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Exclude dropped fields from output
    out_fields = [f for f in db_iter.fields if f not in fields]

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write row
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def renameDbFile(db_file, fields, names, out_file=None, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to rename.
      names : a list of new names for the fields.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Get header and rename fields
    out_fields = list(db_iter.fields)
    for f, n in zip(fields, names):
        i = out_fields.index(f)
        out_fields[i] = n

    # Open writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # TODO: renaming every record inside the loop is unnecessary.
        # Rename fields
        for f, n in zip(fields, names):
            rec[n] = rec.pop(f)
        # Write
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None, copy_fields=None, copy_actions=None, max_field=None, min_field=None, inner=False, keep_missing=False, out_args=default_out_args): """ Removes duplicate sequences from a file Arguments: seq_file = filename of the sequence file to sample from max_missing = number of ambiguous characters to allow in a unique sequence uniq_fields = a list of annotations that define a sequence as unique if they differ copy_fields = a list of annotations to copy into unique sequence annotations copy_actions = the list of collapseAnnotation actions to take on copy_fields max_field = a numeric field whose maximum value determines the retained sequence min_field = a numeric field whose minimum value determines the retained sequence inner = if True exclude consecutive outer ambiguous characters from iterations and matching keep_missing = if True retain sequences with more ambiguous characters than max_missing as unique out_args = common output argument dictionary from parseCommonArgs Returns: the collapsed output file name """ log = OrderedDict() log['START'] = 'CollapseSeq' log['FILE'] = os.path.basename(seq_file) log['MAX_MISSING'] = max_missing log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \ if uniq_fields is not None else None log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \ if copy_fields is not None else None log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \ if copy_actions is not None else None log['MAX_FIELD'] = max_field log['MIN_FIELD'] = min_field log['INNER'] = inner log['KEEP_MISSING'] = keep_missing printLog(log) # TODO: storing all sequences in memory is faster # Read input file in_type = getFileType(seq_file) #seq_dict = readSeqFile(seq_file, index=True) seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False)) if out_args['out_type'] is None: out_args['out_type'] = in_type # Count total sequences rec_count = len(seq_dict) # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') # Find sequences with duplicates uniq_dict = {} # Added list typing for compatibility issue with Python 2.7.5 on OS X # TypeError: object of type 'dictionary-keyiterator' has no len() search_keys = list(seq_dict.keys()) dup_keys = [] for n in range(0, max_missing + 1): # Find unique sequences uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, uniq_fields, copy_fields, max_field, min_field, inner, out_args['delimiter']) # Update list of duplicates dup_keys.extend(dup_list) # Update log log = OrderedDict() log['ITERATION'] = n + 1 log['MISSING'] = n log['UNIQUE'] = len(uniq_dict) log['DUPLICATE'] = len(dup_keys) log['UNDETERMINED'] = len(search_keys) printLog(log, handle=log_handle) # Break if no keys to search remain if len(search_keys) == 0: break # Write unique sequences with getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as uniq_handle: for val in uniq_dict.values(): # Define output sequence out_seq = val[0] out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter']) out_app = OrderedDict() if copy_fields is not None and copy_actions is not None: for f, a, s in zip(copy_fields, copy_actions, val[3:]): out_app[f] = s out_app = collapseAnnotation(out_app, a, f, delimiter=out_args['delimiter']) out_ann.pop(f, None) out_app['DUPCOUNT'] = val[1] out_ann = mergeAnnotation(out_ann, out_app, 
delimiter=out_args['delimiter']) out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter']) out_seq.description = '' # Write unique sequence SeqIO.write(out_seq, uniq_handle, out_args['out_type']) # Write sequence with high missing character counts if keep_missing: for k in search_keys: out_seq = seq_dict[k] out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter']) out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter']) out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter']) out_seq.description = '' SeqIO.write(out_seq, uniq_handle, out_args['out_type']) # Write sequence with high missing character counts if out_args['failed'] and not keep_missing: with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as missing_handle: for k in search_keys: SeqIO.write(seq_dict[k], missing_handle, out_args['out_type']) if out_args['failed']: # Write duplicate sequences with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as dup_handle: for k in dup_keys: SeqIO.write(seq_dict[k], dup_handle, out_args['out_type']) # Print log log = OrderedDict() log['OUTPUT'] = os.path.basename(uniq_handle.name) log['SEQUENCES'] = rec_count log['UNIQUE'] = len(uniq_dict) log['DUPLICATE'] = len(dup_keys) log['UNDETERMINED'] = len(search_keys) log['END'] = 'CollapseSeq' printLog(log) # Close file handles if log_handle is not None: log_handle.close() return uniq_handle.name
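# Hedged usage sketch: collapsing duplicate sequences while tolerating up to 5
# ambiguous characters and summing a per-read count annotation. The input path
# and annotation field names are placeholders, and 'sum' is assumed to be a
# valid collapseAnnotation action.
def _example_collapse():
    """Collapse a hypothetical FASTQ file into unique sequences."""
    return collapseSeq('reads.fastq',
                       max_missing=5,
                       uniq_fields=['SAMPLE'],   # hypothetical annotation field
                       copy_fields=['COUNT'],    # hypothetical annotation field
                       copy_actions=['sum'],     # assumed collapseAnnotation action
                       inner=True)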
def addDbFile(db_file, fields, values, out_file=None, out_args=default_out_args): """ Adds field and value pairs to a database file Arguments: db_file : the database file name. fields : a list of fields to add. values : a list of values to assign to all rows of each field. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str : output file name. """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'add' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['VALUES'] = ','.join(values) printLog(log) # Open inut db_handle = open(db_file, 'rt') db_iter = TSVReader(db_handle) __, __, out_args['out_type'] = splitName(db_file) # Add fields out_fields = list(db_iter.fields) out_fields.extend(fields) # Open output if out_file is not None: pass_handle = open(out_file, 'w') else: pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = TSVWriter(pass_handle, out_fields) # Count records result_count = countDbFile(db_file) # Define fields and values to append add_dict = { k: v for k, v in zip(fields, values) if k not in db_iter.fields } # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += 1 # Write updated row rec.update(add_dict) pass_writer.writeDict(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() db_handle.close() return pass_handle.name
def convertDbClip(db_file, id_field=default_id_field, seq_field=default_seq_field, germ_field=default_germ_field, cluster_field=None, meta_fields=None, out_args=default_out_args): """ Builds fasta files from database records Arguments: db_file = the database file name id_field = the field containing identifiers seq_field = the field containing sample sequences germ_field = the field containing germline sequences cluster_field = the field containing clonal groupings if None write the germline for each record meta_fields = a list of fields to add to sequence annotations out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'fasta' log['FILE'] = os.path.basename(db_file) log['ID_FIELD'] = id_field log['SEQ_FIELD'] = seq_field log['GERM_FIELD'] = germ_field log['CLUSTER_FIELD'] = cluster_field if meta_fields is not None: log['META_FIELDS'] = ','.join(meta_fields) printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='clip') # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = germ_count = pass_count = fail_count = 0 cluster_last = None for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Update cluster ID cluster = rec.get(cluster_field, None) # Get germline SeqRecord when needed if cluster_field is None: germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields, delimiter=out_args['delimiter']) germ.id = '>' + germ.id elif cluster != cluster_last: germ = getDbSeqRecord(rec, cluster_field, germ_field, delimiter=out_args['delimiter']) germ.id = '>' + germ.id else: germ = None # Get read SeqRecord seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields, delimiter=out_args['delimiter']) # Write germline if germ is not None: germ_count += 1 SeqIO.write(germ, pass_handle, 'fasta') # Write sequences if seq is not None: pass_count += 1 SeqIO.write(seq, pass_handle, 'fasta') else: fail_count += 1 # Set last cluster ID cluster_last = cluster # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['GERMLINES'] = germ_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
def selectSeqFile(seq_file, field, value_list=None, value_file=None, negate=False, out_file=None, out_args=default_out_args): """ Select from a sequence file Arguments: seq_file : filename of the sequence file to sample from. field : the annotation field to check for required values. value_list : a list of annotation values that a sample must contain one of. value_file : a tab delimited file containing values to select. negate : if True select entires that do not contain the specific values. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str: output file name. """ # Reads value_file def _read_file(value_file, field): field_list = [] try: with open(value_file, 'rt') as handle: reader_dict = csv.DictReader(handle, dialect='excel-tab') for row in reader_dict: field_list.append(row[field]) except IOError: printError('File %s cannot be read.' % value_file) except: printError('File %s is invalid.' % value_file) return field_list # Print console log log = OrderedDict() log['START'] = 'SplitSeq' log['COMMAND'] = 'select' log['FILE'] = os.path.basename(seq_file) log['FIELD'] = field if value_list is not None: log['VALUE_LIST'] = ','.join([str(x) for x in value_list]) if value_file is not None: log['VALUE_FILE'] = os.path.basename(value_file) log['NOT'] = negate printLog(log) # Read value_file if value_list is not None and value_file is not None: printError('Specify only one of value_list and value_file.') elif value_file is not None: value_list = _read_file(value_file, field) # Read sequence file in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type # Output output handle if out_file is not None: out_handle = open(out_file, 'w') else: out_handle = getOutputHandle(seq_file, 'selected', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Generate subset of records start_time = time() pass_count, fail_count, rec_count = 0, 0, 0 value_set = set(value_list) for rec in seq_iter: printCount(rec_count, 1e5, start_time=start_time) rec_count += 1 # Parse annotations into a list of values ann = parseAnnotation(rec.description, delimiter=out_args['delimiter'])[field] ann = ann.split(out_args['delimiter'][2]) # Write if xor(negate, not value_set.isdisjoint(ann)): # Write SeqIO.write(rec, out_handle, out_args['out_type']) pass_count += 1 else: fail_count += 1 printCount(rec_count, 1e5, start_time=start_time, end=True) # Print log log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'SplitSeq' printLog(log) return out_handle.name
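# Hedged usage sketch: selecting records whose SAMPLE annotation matches one of
# the given values. Alternatively, value_file may point at a tab-delimited file
# whose header contains a column named after `field`, as _read_file above
# expects. The field name, values, and input path are placeholders.
def _example_select():
    """Select a hypothetical subset of sequences by annotation value."""
    return selectSeqFile('reads.fastq',
                         field='SAMPLE',          # hypothetical annotation field
                         value_list=['S1', 'S2'],
                         negate=False)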
def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, score_fields=False, region_fields=False, out_args=default_out_args): """ Writes tab-delimited database file in output directory Arguments: db_gen = a generator of IgRecord objects containing alignment data file_prefix = directory and prefix for CLIP tab-delim file total_count = number of records (for progress bar) id_dict = a dictionary of {IMGT ID: full seq description} no_parse = if ID is to be parsed for pRESTO output with default delimiters score_fields = if True add alignment score fields to output file region_fields = if True add FWR and CDR region fields to output file out_args = common output argument dictionary from parseCommonArgs Returns: None """ pass_file = "%s_db-pass.tab" % file_prefix fail_file = "%s_db-fail.tab" % file_prefix ordered_fields = ['SEQUENCE_ID', 'SEQUENCE_INPUT', 'FUNCTIONAL', 'IN_FRAME', 'STOP', 'MUTATED_INVARIANT', 'INDELS', 'V_CALL', 'D_CALL', 'J_CALL', 'SEQUENCE_VDJ', 'SEQUENCE_IMGT', 'V_SEQ_START', 'V_SEQ_LENGTH', 'V_GERM_START_VDJ', 'V_GERM_LENGTH_VDJ', 'V_GERM_START_IMGT', 'V_GERM_LENGTH_IMGT', 'N1_LENGTH', 'D_SEQ_START', 'D_SEQ_LENGTH', 'D_GERM_START', 'D_GERM_LENGTH', 'N2_LENGTH', 'J_SEQ_START', 'J_SEQ_LENGTH', 'J_GERM_START', 'J_GERM_LENGTH', 'JUNCTION_LENGTH', 'JUNCTION'] if score_fields: ordered_fields.extend(['V_SCORE', 'V_IDENTITY', 'V_EVALUE', 'V_BTOP', 'J_SCORE', 'J_IDENTITY', 'J_EVALUE', 'J_BTOP']) if region_fields: ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) # TODO: This is not the best approach. should pass in output fields. # Initiate passed handle pass_handle = None # Open failed file if out_args['failed']: fail_handle = open(fail_file, 'wt') fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) else: fail_handle = None fail_writer = None # Initialize counters and file pass_writer = None start_time = time() rec_count = pass_count = fail_count = 0 for record in db_gen: #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) printProgress(rec_count, total_count, 0.05, start_time) rec_count += 1 # Count pass or fail if (record.v_call == 'None' and record.j_call == 'None') or \ record.functional is None or \ not record.seq_vdj or \ not record.junction: # print(record.v_call, record.j_call, record.functional, record.junction) fail_count += 1 if fail_writer is not None: fail_writer.writerow(record.toDict()) continue else: pass_count += 1 # Build sample sequence description if record.id in id_dict: record.id = id_dict[record.id] # Parse sequence description into new columns if not no_parse: record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) record.id = record.annotations['ID'] del record.annotations['ID'] # TODO: This is not the best approach. should pass in output fields. 
# If first sequence, use parsed description to create new columns and initialize writer if pass_writer is None: if not no_parse: ordered_fields.extend(list(record.annotations.keys())) pass_handle = open(pass_file, 'wt') pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) # Write row to tab-delim CLIP file pass_writer.writerow(record.toDict()) # Print log #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) printProgress(rec_count, total_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = pass_file log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'MakeDb' printLog(log) if pass_handle is not None: pass_handle.close() if fail_handle is not None: fail_handle.close()
def alignRecords(db_file, seq_fields, group_func, align_func, group_args={}, align_args={}, format='changeo', out_file=None, out_args=default_out_args, nproc=None, queue_size=None): """ Performs a multiple alignment on sets of sequences Arguments: db_file : filename of the input database. seq_fields : the sequence fields to multiple align. group_func : function to use to group records. align_func : function to use to multiple align sequence groups. group_args : dictionary of arguments to pass to group_func. align_args : dictionary of arguments to pass to align_func. format : output format. One of 'changeo' or 'airr'. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. nproc : the number of processQueue processes. if None defaults to the number of CPUs. queue_size : maximum size of the argument queue. if None defaults to 2*nproc. Returns: dict : names of the 'pass' and 'fail' output files. """ # Define subcommand label dictionary cmd_dict = { alignAcross: 'across', alignWithin: 'within', alignBlocks: 'block' } # Print parameter info log = OrderedDict() log['START'] = 'AlignRecords' log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__) log['FILE'] = os.path.basename(db_file) log['SEQ_FIELDS'] = ','.join(seq_fields) if 'group_fields' in group_args: log['GROUP_FIELDS'] = ','.join(group_args['group_fields']) if 'mode' in group_args: log['MODE'] = group_args['mode'] if 'action' in group_args: log['ACTION'] = group_args['action'] log['NPROC'] = nproc printLog(log) # Define format operators try: reader, writer, schema = getFormatOperators(format) except ValueError: printError('Invalid format %s.' % format) # Define feeder function and arguments if 'group_fields' in group_args and group_args['group_fields'] is not None: group_args['group_fields'] = [ schema.toReceptor(f) for f in group_args['group_fields'] ] feed_func = feedDbQueue feed_args = { 'db_file': db_file, 'reader': reader, 'group_func': group_func, 'group_args': group_args } # Define worker function and arguments field_map = OrderedDict([(schema.toReceptor(f), '%s_align' % f) for f in seq_fields]) align_args['field_map'] = field_map work_func = processDbQueue work_args = {'process_func': align_func, 'process_args': align_args} # Define collector function and arguments out_fields = getDbFields(db_file, add=list(field_map.values()), reader=reader) out_args['out_type'] = schema.out_type collect_func = collectDbQueue collect_args = { 'db_file': db_file, 'label': 'align', 'fields': out_fields, 'writer': writer, 'out_file': out_file, 'out_args': out_args } # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log result['log']['END'] = 'AlignRecords' printLog(result['log']) output = {k: v for k, v in result.items() if k in ('pass', 'fail')} return output
def assembleCloneGermline(db_file, repo, seq_field=default_seq_field, v_field=default_v_field, germ_types=default_germ_types, out_args=default_out_args): """ Assemble one germline sequence for each clone in a tab-delimited database file Arguments: db_file = input tab-delimited database file repo = folder with germline repertoire files germ_types = types of germline sequences to be output (full germline, D-region masked, only V-region germline) v_field = field in which to look for V call seq_field = field in which to look for sequence out_args = arguments for output preferences Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'CreateGermlines' log['DB_FILE'] = os.path.basename(db_file) log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types) log['CLONED'] = 'True' log['V_FIELD'] = v_field log['SEQ_FIELD'] = seq_field printLog(log) # Get repertoire and open Db reader references = readRepo(repo) reader = readDbFile(db_file, ig=False) # Exit if V call field does not exist in reader if v_field not in reader.fieldnames: sys.exit('Error: V field does not exist in input database file.') # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') add_fields = [] seq_type = seq_field.split('_')[-1] if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type] if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK'] if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION'] if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS'] add_fields += ['GERMLINE_V_CALL'] add_fields += ['GERMLINE_D_CALL'] add_fields += ['GERMLINE_J_CALL'] # Create output file handle and Db writer writers = {} pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields) if out_args['failed']: fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields) else: fail_handle = None writers['fail'] = None # Initialize time and total count for progress bar start_time = time() rec_count = countDbFile(db_file) counts = {} clone_count = counts['pass'] = counts['fail'] = 0 # Iterate over rows clone = 'initial' clone_dict = OrderedDict() for i, row in enumerate(reader): # Print progress printProgress(i, rec_count, 0.05, start_time) # Clone isn't over yet if row.get('CLONE', '') == clone: clone_dict[i] = row # Clone just finished elif clone_dict: clone_count += 1 result_log = makeCloneGermline(clone, clone_dict, references, germ_types, v_field, seq_field, counts, writers, out_args) printLog(result_log, handle=log_handle) # Now deal with current row (first of next clone) clone = row['CLONE'] clone_dict = OrderedDict([(i, row)]) # Last case is only for first row of file else: clone = row['CLONE'] clone_dict = OrderedDict([(i, row)]) clone_count += 1 result_log = makeCloneGermline(clone, clone_dict, references, germ_types, v_field, seq_field, counts, writers, out_args) printLog(result_log, handle=log_handle) # Print log printProgress(i + 1, rec_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['CLONES'] = clone_count log['RECORDS'] = rec_count log['PASS'] = counts['pass'] log['FAIL'] = counts['fail'] log['END'] = 
'CreateGermlines' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close()
def clusterSets(seq_file, barcode_field=default_barcode_field, cluster_field=default_cluster_field, ident=default_ident, seq_start=None, seq_end=None, usearch_exec=default_usearch_exec, out_args=default_out_args, nproc=None, queue_size=None): """ Performs clustering on sets of sequences Arguments: seq_file = the sample sequence file name barcode_field = the annotation containing set IDs ident = the identity threshold for clustering sequences seq_start = the start position to trim sequences at before clustering seq_end = the end position to trim sequences at before clustering usearch_exec = the path to the executable for usearch nproc = the number of processQueue processes; if None defaults to the number of CPUs queue_size = maximum size of the argument queue; if None defaults to 2*nproc Returns: the clustered output file name """ # Print parameter info log = OrderedDict() log['START'] = 'ClusterSets' log['FILE'] = os.path.basename(seq_file) log['BARCODE_FIELD'] = barcode_field log['CLUSTER_FIELD'] = cluster_field log['IDENTITY'] = ident log['SEQUENCE_START'] = seq_start log['SEQUENCE_END'] = seq_end log['NPROC'] = nproc printLog(log) # Define cluster function parameters cluster_args = {'usearch_exec':usearch_exec, 'ident':ident, 'seq_start':seq_start, 'seq_end':seq_end} # Define feeder function and arguments index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']} feed_func = feedSeqQueue feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args} # Define worker function and arguments work_func = processCSQueue work_args = {'cluster_field': cluster_field, 'cluster_args': cluster_args, 'delimiter': out_args['delimiter']} # Define collector function and arguments collect_func = collectSeqQueue collect_args = {'seq_file': seq_file, 'task_label': 'cluster', 'out_args': out_args, 'index_field': barcode_field} # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log log = OrderedDict() log['OUTPUT'] = result['log'].pop('OUTPUT') for k, v in result['log'].items(): log[k] = v log['END'] = 'ClusterSets' printLog(log) return result['out_files']
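# Hedged usage sketch: clustering sequences within barcode groups at 90%
# identity. It assumes a usearch binary exists at the given path; the input
# file, annotation fields, and executable path are placeholders.
def _example_cluster_sets():
    """Cluster a hypothetical barcoded FASTQ file within BARCODE groups."""
    return clusterSets('reads.fastq',
                       barcode_field='BARCODE',               # hypothetical set annotation
                       cluster_field='CLUSTER',               # annotation to receive cluster IDs
                       ident=0.90,
                       usearch_exec='/usr/local/bin/usearch',  # placeholder path
                       nproc=2)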
def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args): """ Write germline sequences to tab-delimited database file Arguments: db_file = input tab-delimited database file repo = folder with germline repertoire files germ_types = types of germline sequences to be output (full germline, D-region masked, only V-region germline) v_field = field in which to look for V call seq_field = field in which to look for sequence out_args = arguments for output preferences Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'CreateGermlines' log['DB_FILE'] = os.path.basename(db_file) log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types) log['CLONED'] = 'False' log['V_FIELD'] = v_field log['SEQ_FIELD'] = seq_field printLog(log) # Get repertoire and open Db reader references = readRepo(repo) reader = readDbFile(db_file, ig=False) # Exit if V call field does not exist in reader if v_field not in reader.fieldnames: sys.exit('Error: V field does not exist in input database file.') # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') add_fields = [] seq_type = seq_field.split('_')[-1] if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type] if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK'] if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION'] if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS'] # Create output file handle and Db writer pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields) if out_args['failed']: fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields) else: fail_handle = None fail_writer = None # Initialize time and total count for progress bar start_time = time() rec_count = countDbFile(db_file) pass_count = fail_count = 0 # Iterate over rows for i, row in enumerate(reader): # Print progress printProgress(i, rec_count, 0.05, start_time) result_log, germlines = joinGermline(row, references, seq_field=seq_field, v_field=v_field, germ_types=germ_types) # Add germline field(s) to dictionary if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full'] if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask'] if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly'] if 'regions' in germ_types: row['GERMLINE_REGIONS'] = germlines['regions'] # Write row to pass or fail file if 'ERROR' in result_log: fail_count += 1 if fail_writer is not None: fail_writer.writerow(row) else: result_log['SEQUENCE'] = row[seq_field] result_log['GERMLINE'] = germlines['full'] result_log['REGIONS'] = germlines['regions'] pass_count += 1 pass_writer.writerow(row) printLog(result_log, handle=log_handle) # Print log printProgress(i + 1, rec_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'CreateGermlines' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close()
def tableHeaders(seq_file, fields, out_args=default_out_args): """ Builds a table of sequence header annotations Arguments: seq_file = the sequence file name fields = the list of fields to output out_args = common output argument dictionary from parseCommonArgs Returns: the output table file name """ log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = 'table' log['FILE'] = os.path.basename(seq_file) printLog(log) # Open file handles seq_iter = readSeqFile(seq_file) out_handle = getOutputHandle(seq_file, out_label='headers', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') # Count records result_count = countSeqFile(seq_file) # Open csv writer and write header out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', delimiter='\t', fieldnames=fields) out_writer.writeheader() # Iterate over sequences start_time = time() seq_count = pass_count = fail_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time) # Get annotations seq_count += 1 ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter']) # Write records if ann: pass_count += 1 out_writer.writerow(ann) else: fail_count += 1 # Print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
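# Hedged usage sketch: tabulating selected header annotations into a
# tab-delimited file. The field names and input path are placeholders.
def _example_table_headers():
    """Write a hypothetical annotation table for the ID, BARCODE, and PRIMER fields."""
    return tableHeaders('reads.fastq', fields=['ID', 'BARCODE', 'PRIMER'])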
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args): """ Modifies sequence headers Arguments: seq_file = the sequence file name modify_func = the function defining the modification operation modify_args = a dictionary of arguments to pass to modify_func out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ # Define subcommand label dictionary cmd_dict = {addHeader: 'add', copyHeader: 'copy', collapseHeader: 'collapse', deleteHeader: 'delete', expandHeader: 'expand', renameHeader: 'rename'} # Print parameter info log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__) log['FILE'] = os.path.basename(seq_file) for k in sorted(modify_args): v = modify_args[k] log[k.upper()] = ','.join(v) if isinstance(v, list) else v printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Count records result_count = countSeqFile(seq_file) # Iterate over sequences start_time = time() seq_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time) #Update counts seq_count += 1 # Modify header header = parseAnnotation(seq.description, delimiter=out_args['delimiter']) header = modify_func(header, delimiter=out_args['delimiter'], **modify_args) # Write new sequence seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter']) seq.description = '' SeqIO.write(seq, out_handle, out_args['out_type']) # print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
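# Hedged usage sketch: renaming an annotation field in every header. The
# modify_args keywords shown here are assumptions about renameHeader's
# signature, which is not part of this excerpt; the input path and field
# names are placeholders.
def _example_rename_header():
    """Rename a hypothetical BARCODE annotation to INDEX in all headers."""
    return modifyHeaders('reads.fastq',
                         modify_func=renameHeader,
                         modify_args={'fields': ['BARCODE'],  # assumed keyword
                                      'names': ['INDEX']})    # assumed keyword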
def sortDbFile(db_file, field, numeric=False, descend=False, out_file=None, out_args=default_out_args): """ Sorts records by values in an annotation field Arguments: db_file : the database filename field : the field name to sort by numeric : if True sort field numerically; if False sort field alphabetically descend : if True sort in descending order; if False sort in ascending order out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs Returns: str : output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'sort' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field log['NUMERIC'] = numeric printLog(log) # Open input db_handle = open(db_file, 'rt') db_iter = TSVReader(db_handle) out_fields = db_iter.fields __, __, out_args['out_type'] = splitName(db_file) # Open output if out_file is not None: pass_handle = open(out_file, 'w') else: pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = TSVWriter(pass_handle, out_fields) # Store all records in a dictionary start_time = time() printMessage("Indexing: Running", start_time=start_time) db_dict = {i: r for i, r in enumerate(db_iter)} result_count = len(db_dict) # Sort db_dict by field values tag_dict = {k: v[field] for k, v in db_dict.items()} if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()} sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend) printMessage("Indexing: Done", start_time=start_time, end=True) # Iterate over records start_time = time() rec_count = 0 for key in sorted_keys: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += 1 # Write records pass_writer.writeDict(db_dict[key]) # Print counts printProgress(rec_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() db_handle.close() return pass_handle.name
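# Hedged usage sketch: sorting a hypothetical database numerically by DUPCOUNT
# in descending order. Note that empty values sort as 0 when numeric=True,
# per the float(v or 0) conversion above. The path and field are placeholders.
def _example_sort_db():
    """Sort a hypothetical database file by a numeric field."""
    return sortDbFile('db.tab', field='DUPCOUNT', numeric=True, descend=True)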
def samplePairSeqFile(seq_file_1, seq_file_2, max_count, field=None, values=None, coord_type=default_coord, out_args=default_out_args): """ Samples from paired-end sequence files Arguments: seq_file_1 : filename of the first paired-end sequence file seq_file_2 : filename of the second paired-end sequence file max_count : a list of the maximum number of sequences to sample field : the annotation field to check for required values values : a list of annotation values that a sample must contain one of coord_type : the sequence header format out_args : common output argument dictionary from parseCommonArgs Returns: list: seq_file_1 and seq_file_2 output file names """ # Sequence index key function def _key_func(x): return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter']) # Function to sample from two lists of sequence indices def _sample_list(n, index_1, index_2): key_set = set(index_1).intersection(index_2) max_n = len(key_set) return random.sample(key_set, min(n, max_n)) # Function to sample from two dictionaries of grouped sequence indices def _sample_dict(n, index_1, index_2): group_set = set(index_1.keys()).intersection(index_2.keys()) sample_list = [] for k in group_set: key_set = set(index_1[k]).intersection(index_2[k]) max_n = len(key_set) sample_list.extend(random.sample(key_set, min(n, max_n))) return sample_list # Print console log log = OrderedDict() log['START'] = 'SplitSeq' log['COMMAND'] = 'samplepair' log['FILE1'] = os.path.basename(seq_file_1) log['FILE2'] = os.path.basename(seq_file_2) log['MAX_COUNTS'] = ','.join([str(x) for x in max_count]) log['FIELD'] = field log['VALUES'] = ','.join(values) if values else None printLog(log) # Define output type in_type_1 = getFileType(seq_file_1) in_type_2 = getFileType(seq_file_2) if out_args['out_type'] is None: out_type_1 = in_type_1 out_type_2 = in_type_2 else: out_type_1 = out_type_2 = out_args['out_type'] # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Index input files start_time = time() printMessage('Reading files', start_time=start_time, width=25) seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func) seq_dict_2 = readSeqFile(seq_file_2, index=True, key_func=_key_func) # Subset keys to those meeting field/value criteria if field is not None and values is not None: _sample = _sample_list printMessage('Subsetting by annotation', start_time=start_time, width=25) seq_index_1 = subsetSeqIndex(seq_dict_1, field, values, delimiter=out_args['delimiter']) seq_index_2 = subsetSeqIndex(seq_dict_2, field, values, delimiter=out_args['delimiter']) elif field is not None and values is None: _sample = _sample_dict printMessage('Indexing by annotation', start_time=start_time, width=25) seq_index_1 = indexSeqSets(seq_dict_1, field, delimiter=out_args['delimiter']) seq_index_2 = indexSeqSets(seq_dict_2, field, delimiter=out_args['delimiter']) else: _sample = _sample_list seq_index_1 = list(seq_dict_1.keys()) seq_index_2 = list(seq_dict_2.keys()) printMessage('Done', start_time=start_time, end=True, width=25) # Generate sample set for each value in max_count out_files = [] for i, n in enumerate(max_count): start_time = time() printMessage('Sampling n=%i' % n, start_time=start_time, width=25) # Sample sample_keys = _sample(n, seq_index_1, seq_index_2) sample_count = len(sample_keys) # Open file handles out_handle_1 = getOutputHandle(seq_file_1, 'sample%i-n%i' % (i + 1, sample_count), 
out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) out_handle_2 = getOutputHandle(seq_file_2, 'sample%i-n%i' % (i + 1, sample_count), out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) out_files.append((out_handle_1.name, out_handle_2.name)) for k in sample_keys: SeqIO.write(seq_dict_1[k], out_handle_1, out_type_1) SeqIO.write(seq_dict_2[k], out_handle_2, out_type_2) printMessage('Done', start_time=start_time, end=True, width=25) # Print log for iteration log = OrderedDict() log['MAX_COUNT'] = n log['SAMPLED'] = sample_count log['OUTPUT1'] = os.path.basename(out_files[i][0]) log['OUTPUT2'] = os.path.basename(out_files[i][1]) printLog(log) # Close file handles out_handle_1.close() out_handle_2.close() # Print log log = OrderedDict() log['END'] = 'SplitSeq' printLog(log) return out_files
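# Illustrative sketch of the paired sampling step above: only read IDs present in both
# files are eligible, and at most n of them are drawn. Plain dicts stand in for the
# indexed sequence files; all names and sequences here are hypothetical.
import random

def sample_pairs(index_1, index_2, n):
    shared = list(set(index_1).intersection(index_2))
    return random.sample(shared, min(n, len(shared)))

reads_1 = {'read1': 'ACGT', 'read2': 'GGTA', 'read3': 'TTAA'}
reads_2 = {'read2': 'TACC', 'read3': 'TTAA', 'read4': 'CCGG'}
keys = sample_pairs(reads_1, reads_2, n=2)
pairs = [(reads_1[k], reads_2[k]) for k in keys]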
def mergeDbFiles(db_files, drop=False, out_file=None, out_args=default_out_args): """ Merges multiple database files into a single output file Arguments: db_files : list of database file names. drop : if True drop columns not present in all files. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str : output file name. """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'merge' log['FILES'] = ','.join([os.path.basename(f) for f in db_files]) log['DROP'] = drop printLog(log) # Open input db_handles = [open(f, 'rt') for f in db_files] db_iters = [TSVReader(x) for x in db_handles] result_count = sum([countDbFile(f) for f in db_files]) # Define output fields field_list = [x.fields for x in db_iters] if drop: field_set = set.intersection(*map(set, field_list)) else: field_set = set.union(*map(set, field_list)) field_order = OrderedDict([(f, None) for f in chain(*field_list)]) out_fields = [f for f in field_order if f in field_set] # Open output file if out_file is not None: pass_handle = open(out_file, 'w') else: __, __, out_args['out_type'] = splitName(db_files[0]) pass_handle = getOutputHandle(db_files[0], out_label='parse-merge', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = TSVWriter(pass_handle, out_fields) # Iterate over records start_time = time() rec_count = 0 for db in db_iters: for rec in db: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += 1 # Write records pass_writer.writeDict(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() for x in db_handles: x.close() return pass_handle.name
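# Sketch of the field-merging logic only (not the full merge above): take the union of
# columns by default, or the intersection when dropping columns absent from some files,
# while preserving first-seen column order. csv.DictWriter(restval='') then fills columns
# a given file lacks. File paths are hypothetical.
import csv
from itertools import chain

def merged_fields(field_lists, drop=False):
    keep = set.intersection(*map(set, field_lists)) if drop \
           else set.union(*map(set, field_lists))
    seen = dict.fromkeys(chain(*field_lists))   # preserves first-seen order
    return [f for f in seen if f in keep]

def merge_tsv(paths, out_path, drop=False):
    tables = []
    for p in paths:
        with open(p, 'rt', newline='') as h:
            r = csv.DictReader(h, delimiter='\t')
            tables.append((r.fieldnames, list(r)))
    out_fields = merged_fields([f for f, _ in tables], drop=drop)
    with open(out_path, 'w', newline='') as h:
        writer = csv.DictWriter(h, fieldnames=out_fields, delimiter='\t',
                                restval='', extrasaction='ignore')
        writer.writeheader()
        for _, rows in tables:
            writer.writerows(rows)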
def collectDbQueue(alive, result_queue, collect_queue, db_file, task_label, out_args, add_fields=None): """ Pulls from results queue, assembles results and manages log and file IO Arguments: alive : multiprocessing.Value boolean controlling whether processing continues; when False function returns result_queue : multiprocessing.Queue holding worker results collect_queue : multiprocessing.Queue to store collector return values db_file : Database file name task_label : Task label used to tag the output files out_args : Common output argument dictionary from parseCommonArgs add_fields : List of fields added to the writer not present in the in_file; if None do not add fields Returns: None : Adds a dictionary with key value pairs to collect_queue containing 'log' defining a log object, 'out_files' defining the output file names """ try: result_count = countDbFile(db_file) # Define output format out_type = getFileType(db_file) if out_args['out_type'] is None \ else out_args['out_type'] # Define passed alignment output handle pass_handle = getOutputHandle(db_file, '%s-pass' % task_label, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_type) pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields) # Define failed alignment output handle if out_args['failed']: fail_handle = getOutputHandle(db_file, '%s-fail' % task_label, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_type) fail_writer = getDbWriter(fail_handle, db_file) else: fail_handle = None fail_writer = None # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') except: alive.value = False raise try: # Iterate over results queue until sentinel object reached start_time = time() set_count = rec_count = pass_count = fail_count = 0 while alive.value: # Get result from queue if result_queue.empty(): continue else: result = result_queue.get() # Exit upon reaching sentinel if result is None: break # Print progress for previous iteration printProgress(pass_count, result_count, 0.05, start_time) # Update counts for current iteration set_count += 1 rec_count += result.data_count # Write log printLog(result.log, handle=log_handle) # Write alignments if result: pass_count += result.data_count for rec in result.results: pass_writer.writerow(rec.toDict()) else: fail_count += result.data_count if fail_writer is not None: for rec in result.data: fail_writer.writerow(rec.toDict()) else: sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \ % os.getpid()) return None # Print total counts printProgress(pass_count, result_count, 0.05, start_time) # Update return values log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['GROUPS'] = set_count log['PASS'] = pass_count log['FAIL'] = fail_count collect_dict = {'log':log, 'out_files': [pass_handle.name]} collect_queue.put(collect_dict) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close() except: alive.value = False raise return None
def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None, group_func=None, group_args={}, clone_args={}, cluster_args={}, out_args=default_out_args, nproc=None, queue_size=None): """ Define clonally related sequences Arguments: db_file = filename of input database feed_func = the function that feeds the queue work_func = the worker function that will run on each CPU collect_func = the function that collects results from the workers group_func = the function to use for assigning preclones clone_func = the function to use for determining clones within preclonal groups group_args = a dictionary of arguments to pass to group_func clone_args = a dictionary of arguments to pass to clone_func out_args = common output argument dictionary from parseCommonArgs nproc = the number of processQueue processes; if None defaults to the number of CPUs queue_size = maximum size of the argument queue; if None defaults to 2*nproc Returns: a list of successful output file names """ # Print parameter info log = OrderedDict() log['START'] = 'DefineClones' log['DB_FILE'] = os.path.basename(db_file) if group_func is not None: log['GROUP_FUNC'] = group_func.__name__ log['GROUP_ARGS'] = group_args log['CLONE_FUNC'] = clone_func.__name__ # TODO: this is yucky, but can be fixed by using a model class clone_log = clone_args.copy() if 'dist_mat' in clone_log: del clone_log['dist_mat'] log['CLONE_ARGS'] = clone_log if cluster_func is not None: log['CLUSTER_FUNC'] = cluster_func.__name__ log['CLUSTER_ARGS'] = cluster_args log['NPROC'] = nproc printLog(log) # Define feeder function and arguments feed_args = {'db_file': db_file, 'group_func': group_func, 'group_args': group_args} # Define worker function and arguments work_args = {'clone_func': clone_func, 'clone_args': clone_args} # Define collector function and arguments collect_args = {'db_file': db_file, 'out_args': out_args, 'cluster_func': cluster_func, 'cluster_args': cluster_args} # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log result['log']['END'] = 'DefineClones' printLog(result['log']) return result['out_files']
def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field, meta_fields=None, out_args=default_out_args): """ Builds fasta files from database records Arguments: db_file = the database file name id_field = the field containing identifiers seq_field = the field containing sequences meta_fields = a list of fields to add to sequence annotations out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'fasta' log['FILE'] = os.path.basename(db_file) log['ID_FIELD'] = id_field log['SEQ_FIELD'] = seq_field if meta_fields is not None: log['META_FIELDS'] = ','.join(meta_fields) printLog(log) # Open file handles out_type = 'fasta' db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_type) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = pass_count = fail_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Get SeqRecord seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields, out_args['delimiter']) # Write sequences if seq is not None: pass_count += 1 SeqIO.write(seq, pass_handle, out_type) else: fail_count += 1 # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
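# Simplified sketch of building FASTA records from database rows, assuming Biopython is
# installed. The column names, annotation delimiter, and output path are illustrative,
# not the ConvertDb defaults.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

def row_to_record(row, id_field, seq_field, meta_fields=None):
    annotations = ['%s=%s' % (f, row[f]) for f in (meta_fields or [])]
    header = '|'.join([row[id_field]] + annotations)
    return SeqRecord(Seq(row[seq_field]), id=header, name=header, description='')

rows = [{'sequence_id': 'seq1', 'sequence': 'ACGT', 'duplicate_count': '3'}]
records = [row_to_record(r, 'sequence_id', 'sequence', ['duplicate_count']) for r in rows]
with open('sequences.fasta', 'w') as handle:
    SeqIO.write(records, handle, 'fasta')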
def assemblePairs(head_file, tail_file, assemble_func, assemble_args={}, coord_type=default_coord_type, rc=None, head_fields=None, tail_fields=None, out_args=default_out_args, nproc=None, queue_size=None): """ Assembles paired-end sequences Arguments: head_file = the head sequence file name tail_file = the tail sequence file name assemble_func = the function to use to assemble paired ends assemble_args = a dictionary of arguments to pass to the assembly function coord_type = the sequence header format rc = Defines which sequences ('head','tail','both') to reverse complement before assembly; if None do not reverse complement sequences head_fields = list of annotations in head_file records to copy to assembled record; if None do not copy an annotation tail_fields = list of annotations in tail_file records to copy to assembled record; if None do not copy an annotation out_args = common output argument dictionary from parseCommonArgs nproc = the number of processQueue processes; if None defaults to the number of CPUs queue_size = maximum size of the argument queue; if None defaults to 2*nproc Returns: a list of successful output file names """ # Define subcommand label dictionary cmd_dict = {alignAssembly:'align', joinSeqPair:'join', referenceAssembly:'reference'} # Print parameter info log = OrderedDict() log['START'] = 'AssemblePairs' log['COMMAND'] = cmd_dict.get(assemble_func, assemble_func.__name__) log['FILE1'] = os.path.basename(head_file) log['FILE2'] = os.path.basename(tail_file) log['COORD_TYPE'] = coord_type if 'ref_file' in assemble_args: log['REFFILE'] = assemble_args['ref_file'] if 'alpha' in assemble_args: log['ALPHA'] = assemble_args['alpha'] if 'max_error' in assemble_args: log['MAX_ERROR'] = assemble_args['max_error'] if 'min_len' in assemble_args: log['MIN_LEN'] = assemble_args['min_len'] if 'max_len' in assemble_args: log['MAX_LEN'] = assemble_args['max_len'] if 'scan_reverse' in assemble_args: log['SCAN_REVERSE'] = assemble_args['scan_reverse'] if 'gap' in assemble_args: log['GAP'] = assemble_args['gap'] if 'min_ident' in assemble_args: log['MIN_IDENT'] = assemble_args['min_ident'] if 'evalue' in assemble_args: log['EVALUE'] = assemble_args['evalue'] if 'max_hits' in assemble_args: log['MAX_HITS'] = assemble_args['max_hits'] if 'fill' in assemble_args: log['FILL'] = assemble_args['fill'] log['NPROC'] = nproc printLog(log) # Count input files head_count = countSeqFile(head_file) tail_count = countSeqFile(tail_file) if head_count != tail_count: sys.exit('Error: FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records' \ % (head_count, tail_count)) # Define feeder function and arguments feed_func = feedPairQueue # feed_args = {'seq_file_1': head_file, # 'seq_file_2': tail_file, # 'index_dict': index_dict} feed_args = {'seq_file_1': head_file, 'seq_file_2': tail_file, 'coord_type': coord_type, 'delimiter': out_args['delimiter']} # Define worker function and arguments process_args = {'assemble_func': assemble_func, 'assemble_args': assemble_args, 'rc': rc, 'fields_1': head_fields, 'fields_2': tail_fields, 'delimiter': out_args['delimiter']} work_func = processSeqQueue work_args = {'process_func': processAssembly, 'process_args': process_args} # Define collector function and arguments collect_func = collectPairQueue # collect_args = {'result_count': pair_count, # 'seq_file_1': head_file, # 'seq_file_2': tail_file, # 'out_args': out_args} collect_args = {'result_count': head_count, 'seq_file_1': head_file, 'seq_file_2': tail_file, 'out_args': out_args} # Call process 
manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log log = OrderedDict() log['OUTPUT'] = result['log'].pop('OUTPUT') for k, v in result['log'].items(): log[k] = v log['END'] = 'AssemblePairs' printLog(log) return result['out_files']
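# Toy illustration of the overlap idea behind de novo pair assembly (not the scoring
# model used by alignAssembly): try every overlap length, keep the best identity above
# a cutoff, and join head + tail at that offset. The thresholds are arbitrary examples.
def join_by_overlap(head, tail, min_len=8, min_ident=0.9):
    best = None
    for n in range(min_len, min(len(head), len(tail)) + 1):
        a, b = head[-n:], tail[:n]
        ident = sum(x == y for x, y in zip(a, b)) / n
        if ident >= min_ident and (best is None or ident > best[1]):
            best = (n, ident)
    if best is None:
        return None          # no acceptable overlap found
    n, _ = best
    return head + tail[n:]

# Example: join_by_overlap('ACGTACGTAAGGCT', 'AAGGCTTTCCAG', min_len=6)
# -> 'ACGTACGTAAGGCTTTCCAG'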
def addDbFile(db_file, fields, values, out_args=default_out_args): """ Adds field and value pairs to a database file Arguments: db_file = the database file name fields = a list of fields to add values = a list of values to assign to all rows of each field out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'add' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['VALUES'] = ','.join(values) printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields) # Count records result_count = countDbFile(db_file) # Define fields and values to append add_dict = { k: v for k, v in zip(fields, values) if k not in db_iter.fieldnames } # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Write updated row rec.update(add_dict) pass_writer.writerow(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
def convertToGenbank(db_file, inference=None, db_xref=None, molecule=default_molecule, product=default_product, features=None, c_field=None, label=None, count_field=None, index_field=None, allow_stop=False, asis_id=False, asis_calls=False, allele_delim=default_allele_delim, build_asn=False, asn_template=None, tbl2asn_exec=default_tbl2asn_exec, format=default_format, out_file=None, out_args=default_out_args): """ Builds GenBank submission fasta and table files Arguments: db_file : the database file name. inference : reference alignment tool. db_xref : reference database link. molecule : source molecule (eg, "mRNA", "genomic DNA") product : Product (protein) name. features : dictionary of sample features (BioSample attributes) to add to the description of each record. c_field : column containing the C region gene call. label : a string to use as a label for the ID. if None do not add a field label. count_field : field name to populate the AIRR_READ_COUNT note. index_field : field name to populate the AIRR_CELL_INDEX note. allow_stop : if True retain records with junctions having stop codons. asis_id : if True use the original sequence ID for the output IDs. asis_calls : if True do not parse gene calls for IMGT nomenclature. allele_delim : delimiter separating the gene name from the allele number when asis_calls=True. build_asn : if True run tbl2asn on the generated .tbl and .fsa files. asn_template : template file (.sbt) to pass to tbl2asn. tbl2asn_exec : name of or path to the tbl2asn executable. format : input and output format. out_file : output file name without extension. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: tuple : the output (feature table, fasta) file names. """ log = OrderedDict() log['START'] = 'ConvertDb' log['COMMAND'] = 'genbank' log['FILE'] = os.path.basename(db_file) printLog(log) # Define format operators try: reader, __, schema = getFormatOperators(format) except ValueError: printError('Invalid format %s.' 
% format) # Open input db_handle = open(db_file, 'rt') db_iter = reader(db_handle) # Check for required columns try: required = ['sequence_input', 'v_call', 'd_call', 'j_call', 'v_seq_start', 'd_seq_start', 'j_seq_start'] checkFields(required, db_iter.fields, schema=schema) except LookupError as e: printError(e) # Open output if out_file is not None: out_name, __ = os.path.splitext(out_file) fsa_handle = open('%s.fsa' % out_name, 'w') tbl_handle = open('%s.tbl' % out_name, 'w') else: fsa_handle = getOutputHandle(db_file, out_label='genbank', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='fsa') tbl_handle = getOutputHandle(db_file, out_label='genbank', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tbl') # Count records result_count = countDbFile(db_file) # Define writer writer = csv.writer(tbl_handle, delimiter='\t', quoting=csv.QUOTE_NONE) # Iterate over records start_time = time() rec_count, pass_count, fail_count = 0, 0, 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += 1 # Extract table dictionary name = None if asis_id else rec_count seq = makeGenbankSequence(rec, name=name, label=label, count_field=count_field, index_field=index_field, molecule=molecule, features=features) tbl = makeGenbankFeatures(rec, start=seq['start'], end=seq['end'], product=product, db_xref=db_xref, inference=inference, c_field=c_field, allow_stop=allow_stop, asis_calls=asis_calls, allele_delim=allele_delim) if tbl is not None: pass_count +=1 # Write table writer.writerow(['>Features', seq['record'].id]) for feature, qualifiers in tbl.items(): writer.writerow(feature) if qualifiers: for x in qualifiers: writer.writerow(list(chain(['', '', ''], x))) # Write sequence SeqIO.write(seq['record'], fsa_handle, 'fasta') else: fail_count += 1 # Final progress bar printProgress(rec_count, result_count, 0.05, start_time=start_time) # Run tbl2asn if build_asn: start_time = time() printMessage('Running tbl2asn', start_time=start_time, width=25) result = runASN(fsa_handle.name, template=asn_template, exec=tbl2asn_exec) printMessage('Done', start_time=start_time, end=True, width=25) # Print ending console log log = OrderedDict() log['OUTPUT_TBL'] = os.path.basename(tbl_handle.name) log['OUTPUT_FSA'] = os.path.basename(fsa_handle.name) log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ConvertDb' printLog(log) # Close file handles tbl_handle.close() fsa_handle.close() db_handle.close() return (tbl_handle.name, fsa_handle.name)
def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args): """ Assemble one germline sequence for each clone in a tab-delimited database file Arguments: db_file = input tab-delimited database file repo = folder with germline repertoire files germ_types = types of germline sequences to be output (full germline, D-region masked, only V-region germline) v_field = field in which to look for V call seq_field = field in which to look for sequence out_args = arguments for output preferences Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'CreateGermlines' log['DB_FILE'] = os.path.basename(db_file) log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types) log['CLONED'] = 'True' log['V_FIELD'] = v_field log['SEQ_FIELD'] = seq_field printLog(log) # Get repertoire and open Db reader repo_dict = getRepo(repo) reader = readDbFile(db_file, ig=False) # Exit if V call field does not exist in reader if v_field not in reader.fieldnames: sys.exit('Error: V field does not exist in input database file.') # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') add_fields = [] seq_type = seq_field.split('_')[-1] if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type] if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK'] if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION'] # Create output file handle and Db writer writers = {} pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields) if out_args['failed']: fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields) else: fail_handle = None writers['fail'] = None # Initialize time and total count for progress bar start_time = time() rec_count = countDbFile(db_file) counts = {} clone_count = counts['pass'] = counts['fail'] = 0 # Iterate over rows clone = 'initial' clone_dict = OrderedDict() for i,row in enumerate(reader): # Print progress printProgress(i, rec_count, 0.05, start_time) # Clone isn't over yet if row.get('CLONE','') == clone: clone_dict[row["SEQUENCE_ID"]] = row # Clone just finished elif clone_dict: clone_count += 1 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field, seq_field, counts, writers, out_args) printLog(result_log, handle=log_handle) # Now deal with current row (first of next clone) clone = row['CLONE'] clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)]) # Last case is only for first row of file else: clone = row['CLONE'] clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)]) clone_count += 1 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field, seq_field, counts, writers, out_args) printLog(result_log, handle=log_handle) # Print log printProgress(i+1, rec_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['CLONES'] = clone_count log['RECORDS'] = rec_count log['PASS'] = counts['pass'] log['FAIL'] = counts['fail'] log['END'] = 'CreateGermlines' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close()
def maskPrimers(seq_file, primer_file, align_func, align_args={}, out_file=None, out_args=default_out_args, nproc=None, queue_size=None): """ Masks or cuts primers from sample sequences using local alignment Arguments: seq_file : name of file containing sample sequences. primer_file : name of the file containing primer sequences. align_func : the function to call for alignment. align_args : a dictionary of arguments to pass to align_func. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. nproc : the number of processQueue processes; if None defaults to the number of CPUs. queue_size : maximum size of the argument queue; if None defaults to 2*nproc. Returns: list: a list of successful output file names. """ # Define subcommand label dictionary cmd_dict = { alignPrimers: 'align', scorePrimers: 'score', extractPrimers: 'extract' } # Print parameter info log = OrderedDict() log['START'] = 'MaskPrimers' log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__) log['SEQ_FILE'] = os.path.basename(seq_file) if primer_file is not None: log['PRIMER_FILE'] = os.path.basename(primer_file) if 'mode' in align_args: log['MODE'] = align_args['mode'] if 'max_error' in align_args: log['MAX_ERROR'] = align_args['max_error'] if 'start' in align_args: log['START_POS'] = align_args['start'] if 'length' in align_args: log['LENGTH'] = align_args['length'] if 'max_len' in align_args: log['MAX_LEN'] = align_args['max_len'] if 'rev_primer' in align_args: log['REV_PRIMER'] = align_args['rev_primer'] if 'skip_rc' in align_args: log['SKIP_RC'] = align_args['skip_rc'] if 'gap_penalty' in align_args: log['GAP_PENALTY'] = ', '.join( [str(x) for x in align_args['gap_penalty']]) if 'barcode' in align_args: log['BARCODE'] = align_args['barcode'] if 'barcode' in align_args and align_args['barcode']: log['BARCODE_FIELD'] = align_args['barcode_field'] log['PRIMER_FIELD'] = align_args['primer_field'] log['NPROC'] = nproc printLog(log) # Define alignment arguments and compile primers for align mode if primer_file is not None: primers = readPrimerFile(primer_file) if 'rev_primer' in align_args and align_args['rev_primer']: primers = {k: reverseComplement(v) for k, v in primers.items()} align_args['primers'] = primers align_args['score_dict'] = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0)) if align_func is alignPrimers: align_args['primers_regex'] = compilePrimers(primers) align_args['delimiter'] = out_args['delimiter'] # Define feeder function and arguments feed_func = feedSeqQueue feed_args = {'seq_file': seq_file} # Define worker function and arguments work_func = processSeqQueue work_args = {'process_func': align_func, 'process_args': align_args} # Define collector function and arguments collect_func = collectSeqQueue collect_args = { 'seq_file': seq_file, 'label': 'primers', 'out_file': out_file, 'out_args': out_args } # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log result['log']['END'] = 'MaskPrimers' printLog(result['log']) return result['out_files']
def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args): """ Write germline sequences to tab-delimited database file Arguments: db_file = input tab-delimited database file repo = folder with germline repertoire files germ_types = types of germline sequences to be output (full germline, D-region masked, only V-region germline) v_field = field in which to look for V call seq_field = field in which to look for sequence out_args = arguments for output preferences Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'CreateGermlines' log['DB_FILE'] = os.path.basename(db_file) log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types) log['CLONED'] = 'False' log['V_FIELD'] = v_field log['SEQ_FIELD'] = seq_field printLog(log) # Get repertoire and open Db reader repo_dict = getRepo(repo) reader = readDbFile(db_file, ig=False) # Exit if V call field does not exist in reader if v_field not in reader.fieldnames: sys.exit('Error: V field does not exist in input database file.') # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') add_fields = [] seq_type = seq_field.split('_')[-1] if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type] if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK'] if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION'] # Create output file handle and Db writer pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields) if out_args['failed']: fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields) else: fail_handle = None fail_writer = None # Initialize time and total count for progress bar start_time = time() rec_count = countDbFile(db_file) pass_count = fail_count = 0 # Iterate over rows for i,row in enumerate(reader): # Print progress printProgress(i, rec_count, 0.05, start_time) result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field) # Add germline field(s) to dictionary if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full'] if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask'] if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly'] # Write row to pass or fail file if 'ERROR' in result_log: fail_count += 1 if fail_writer is not None: fail_writer.writerow(row) else: result_log['SEQUENCE'] = row[seq_field] result_log['GERMLINE'] = germlines['full'] result_log['REGIONS'] = germlines['regions'] pass_count += 1 pass_writer.writerow(row) printLog(result_log, handle=log_handle) # Print log printProgress(i+1, rec_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'CreateGermlines' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close()
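# Conceptual sketch of the 'dmask' germline type referenced above: every position
# between the end of the V segment and the start of the J segment is replaced with N.
# The 0-based coordinates here are illustrative; the real routine derives them from
# the IMGT-gapped alignment.
def mask_d_region(germline, v_end, j_start):
    return germline[:v_end] + 'N' * (j_start - v_end) + germline[j_start:]

full = 'CAAGTGACCTTGAGGGAGTCTGGTCCT'
dmask = mask_d_region(full, v_end=12, j_start=18)
# -> 'CAAGTGACCTTGNNNNNNTCTGGTCCT'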
def indexDbFile(db_file, field=default_index_field, out_file=None, out_args=default_out_args): """ Adds an index column to a database file Arguments: db_file : the database file name. field : the name of the index field to add. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str : output file name. """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'index' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field printLog(log) # Open input db_handle = open(db_file, 'rt') db_iter = TSVReader(db_handle) __, __, out_args['out_type'] = splitName(db_file) # Append index field out_fields = list(db_iter.fields) out_fields.append(field) # Open output if out_file is not None: pass_handle = open(out_file, 'w') else: pass_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = TSVWriter(pass_handle, out_fields) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += 1 # Add count and write updated row rec.update({field: rec_count}) pass_writer.writeDict(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() db_handle.close() return pass_handle.name
def deleteDbFile(db_file, fields, values, logic='any', regex=False, out_args=default_out_args): """ Deletes records from a database file Arguments: db_file = the database file name fields = a list of fields to check for deletion criteria values = a list of values defining deletion targets logic = one of 'any' or 'all' defining whether one or all fields must have a match. regex = if False do exact full string matches; if True allow partial regex matches. out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ # Define string match function if regex: def _match_func(x, patterns): return any([re.search(p, x) for p in patterns]) else: def _match_func(x, patterns): return x in patterns # Define logic function if logic == 'any': _logic_func = any elif logic == 'all': _logic_func = all log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'delete' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['VALUES'] = ','.join(values) printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-delete', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = pass_count = fail_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Check for deletion values in all fields delete = _logic_func( [_match_func(rec.get(f, False), values) for f in fields]) # Write sequences if not delete: pass_count += 1 pass_writer.writerow(rec) else: fail_count += 1 # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['KEPT'] = pass_count log['DELETED'] = fail_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
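# Standalone sketch of the selection/deletion predicate used above: an exact-match or
# regex match function per field, combined across fields with any() or all(). The field
# names and values below are hypothetical.
import re

def make_predicate(fields, values, logic='any', regex=False):
    if regex:
        match = lambda x: any(re.search(p, x) for p in values)
    else:
        match = lambda x: x in values
    combine = any if logic == 'any' else all
    return lambda rec: combine(match(rec.get(f, '')) for f in fields)

keep_if = make_predicate(['v_call', 'j_call'], ['IGHV1', 'IGHJ4'], logic='any', regex=True)
records = [{'v_call': 'IGHV1-69*01', 'j_call': 'IGHJ6*02'},
           {'v_call': 'IGHV3-23*01', 'j_call': 'IGHJ3*01'}]
selected = [r for r in records if keep_if(r)]   # keeps the first record only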
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args): """ Divides a tab-delimited database file into segments by description tags Arguments: db_file : filename of the tab-delimited database file to split field : the field name by which to split db_file num_split : the numerical threshold by which to group sequences; if None treat field as textual out_args : common output argument dictionary from parseCommonArgs Returns: list : a list of output file names. """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'split' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field log['NUM_SPLIT'] = num_split printLog(log) # Open input db_handle = open(db_file, 'rt') db_iter = TSVReader(db_handle) out_fields = db_iter.fields __, __, out_args['out_type'] = splitName(db_file) # Determine total numbers of records rec_count = countDbFile(db_file) start_time = time() count = 0 # Sort records into files based on textual field if num_split is None: # Create set of unique field tags with open(db_file, 'rt') as tmp_handle: tmp_iter = TSVReader(tmp_handle) tag_list = list(set([row[field] for row in tmp_iter])) # Forbidden characters in filename and replacements no_good = { '\/': 'f', '\\': 'b', '?': 'q', '\%': 'p', '*': 's', ':': 'c', '\|': 'pi', '\"': 'dq', '\'': 'sq', '<': 'gt', '>': 'lt', ' ': '_' } # Replace forbidden characters in tag_list tag_dict = {} for tag in tag_list: for c, r in no_good.items(): tag_dict[tag] = (tag_dict.get(tag, tag).replace(c,r) \ if c in tag else tag_dict.get(tag, tag)) # Create output handles handles_dict = { tag: getOutputHandle(db_file, out_label='%s-%s' % (field, label), out_name=out_args['out_name'], out_dir=out_args['out_dir'], out_type=out_args['out_type']) for tag, label in tag_dict.items() } # Create Db writer instances writers_dict = { tag: TSVWriter(handles_dict[tag], fields=out_fields) for tag in tag_dict } # Iterate over records for row in db_iter: printProgress(count, rec_count, 0.05, start_time=start_time) count += 1 # Write row to appropriate file tag = row[field] writers_dict[tag].writeDict(row) # Sort records into files based on numeric num_split else: num_split = float(num_split) # Create output handles handles_dict = { 'under': getOutputHandle(db_file, out_label='under-%.1f' % num_split, out_name=out_args['out_name'], out_dir=out_args['out_dir'], out_type=out_args['out_type']), 'atleast': getOutputHandle(db_file, out_label='atleast-%.1f' % num_split, out_name=out_args['out_name'], out_dir=out_args['out_dir'], out_type=out_args['out_type']) } # Create Db writer instances writers_dict = { 'under': TSVWriter(handles_dict['under'], fields=out_fields), 'atleast': TSVWriter(handles_dict['atleast'], fields=out_fields) } # Iterate over records for row in db_iter: printProgress(count, rec_count, 0.05, start_time=start_time) count += 1 tag = row[field] tag = 'under' if float(tag) < num_split else 'atleast' writers_dict[tag].writeDict(row) # Write log printProgress(count, rec_count, 0.05, start_time=start_time) log = OrderedDict() for i, k in enumerate(handles_dict): log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name) log['RECORDS'] = rec_count log['PARTS'] = len(handles_dict) log['END'] = 'ParseDb' printLog(log) # Close output file handles db_handle.close() for t in handles_dict: handles_dict[t].close() return [handles_dict[t].name for t in handles_dict]
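# Sketch of the textual split: one writer per unique value of the chosen field, with
# filename-unsafe characters replaced before building output names. The output paths
# and the sanitization table are illustrative, not the ParseDb naming scheme.
import csv, os

def split_tsv(in_path, field, out_dir='.'):
    unsafe = str.maketrans({c: '_' for c in '/\\?%*:|"\'<> '})
    handles, writers = {}, {}
    with open(in_path, 'rt', newline='') as src:
        reader = csv.DictReader(src, delimiter='\t')
        for row in reader:
            tag = row[field]
            if tag not in writers:
                label = tag.translate(unsafe)
                path = os.path.join(out_dir, 'split_%s_%s.tsv' % (field, label))
                handles[tag] = open(path, 'w', newline='')
                writers[tag] = csv.DictWriter(handles[tag], fieldnames=reader.fieldnames,
                                              delimiter='\t')
                writers[tag].writeheader()
            writers[tag].writerow(row)
    for h in handles.values():
        h.close()
    return [h.name for h in handles.values()]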
def renameDbFile(db_file, fields, names, out_args=default_out_args): """ Renames fields in a database file Arguments: db_file = the database file name fields = a list of fields to rename names = a list of new names for fields out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'rename' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['NAMES'] = ','.join(names) printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') # Get header and rename fields header = (readDbFile(db_file, ig=False)).fieldnames for f, n in zip(fields, names): i = header.index(f) header[i] = n # Open writer and write new header # TODO: should modify getDbWriter to take a list of fields pass_writer = csv.DictWriter(pass_handle, fieldnames=header, dialect='excel-tab') pass_writer.writeheader() # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # TODO: renaming keys in every row is unnecessary. should add a non-dict reader/writer to DbCore # Rename fields for f, n in zip(fields, names): rec[n] = rec.pop(f) # Write pass_writer.writerow(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
def selectDbFile(db_file, fields, values, logic='any', regex=False, out_file=None, out_args=default_out_args): """ Selects records from a database file Arguments: db_file : the database file name fields : a list of fields to check for selection criteria values : a list of values defining selection targets logic : one of 'any' or 'all' defining whether one or all fields must have a match. regex : if False do exact full string matches; if True allow partial regex matches. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs Returns: str : output file name. """ # Define string match function if regex: def _match_func(x, patterns): return any([re.search(p, x) for p in patterns]) else: def _match_func(x, patterns): return x in patterns # Define logic function if logic == 'any': _logic_func = any elif logic == 'all': _logic_func = all # Print console log log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'select' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['VALUES'] = ','.join(values) log['REGEX'] = regex printLog(log) # Open input db_handle = open(db_file, 'rt') db_iter = TSVReader(db_handle) out_fields = db_iter.fields __, __, out_args['out_type'] = splitName(db_file) # Open output if out_file is not None: pass_handle = open(out_file, 'w') else: pass_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = TSVWriter(pass_handle, out_fields) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count, pass_count, fail_count = 0, 0, 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += 1 # Check for selection values in all fields select = _logic_func( [_match_func(rec.get(f, False), values) for f in fields]) # Write sequences if select: pass_count += 1 pass_writer.writeDict(rec) else: fail_count += 1 # Print counts printProgress(rec_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['SELECTED'] = pass_count log['DISCARDED'] = fail_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() db_handle.close() return pass_handle.name
def buildConsensus(seq_file, barcode_field=default_barcode_field, min_count=default_min_count, min_freq=default_min_freq, min_qual=default_min_qual, primer_field=None, primer_freq=None, max_gap=None, max_error=None, max_diversity=None, copy_fields=None, copy_actions=None, dependent=False, out_args=default_out_args, nproc=None, queue_size=None): """ Generates consensus sequences Arguments: seq_file = the sample sequence file name barcode_field = the annotation field containing set IDs min_count = threshold number of sequences to define a consensus min_freq = the frequency cutoff to assign a base min_qual = the quality cutoff to assign a base primer_field = the annotation field containing primer tags; if None do not annotate with primer tags primer_freq = the maximum primer tag frequency that must be meet to build a consensus; if None do not filter by primer frequency max_gap = the maximum frequency of (., -) characters allowed before deleting a position; if None do not delete positions max_error = a threshold defining the maximum allowed error rate to retain a read group; if None do not calculate error rate max_diversity = a threshold defining the average pairwise error rate required to retain a read group; if None do not calculate diversity dependent = if False treat barcode group sequences as independent data copy_fields = a list of annotations to copy into consensus sequence annotations; if None no additional annotations will be copied copy_actions = the list of actions to take for each copy_fields; one of ['set', 'majority', 'min', 'max', 'sum'] out_args = common output argument dictionary from parseCommonArgs nproc = the number of processQueue processes; if None defaults to the number of CPUs queue_size = maximum size of the argument queue; if None defaults to 2*nproc Returns: a list of successful output file names """ # Print parameter info log = OrderedDict() log['START'] = 'BuildConsensus' log['FILE'] = os.path.basename(seq_file) log['BARCODE_FIELD'] = barcode_field log['MIN_COUNT'] = min_count log['MIN_FREQUENCY'] = min_freq log['MIN_QUALITY'] = min_qual log['MAX_GAP'] = max_gap log['PRIMER_FIELD'] = primer_field log['PRIMER_FREQUENCY'] = primer_freq log['MAX_ERROR'] = max_error log['MAX_DIVERSITY'] = max_diversity log['DEPENDENT'] = dependent log['COPY_FIELDS'] = ','.join(copy_fields) if copy_fields is not None else None log['COPY_ACTIONS'] = ','.join(copy_actions) if copy_actions is not None else None log['NPROC'] = nproc printLog(log) # Set consensus building function in_type = getFileType(seq_file) if in_type == 'fastq': cons_func = qualityConsensus cons_args = {'min_qual': min_qual, 'min_freq': min_freq, 'dependent': dependent} elif in_type == 'fasta': cons_func = frequencyConsensus cons_args = {'min_freq': min_freq} else: sys.exit('ERROR: Input file must be FASTA or FASTQ') # Define feeder function and arguments index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']} feed_func = feedSeqQueue feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args} # Define worker function and arguments work_func = processBCQueue work_args = {'cons_func': cons_func, 'cons_args': cons_args, 'min_count': min_count, 'primer_field': primer_field, 'primer_freq': primer_freq, 'max_gap': max_gap, 'max_error': max_error, 'max_diversity': max_diversity, 'copy_fields': copy_fields, 'copy_actions': copy_actions, 'delimiter': out_args['delimiter']} # Define collector function and arguments collect_func = collectSeqQueue collect_args = {'seq_file': 
seq_file, 'task_label': 'consensus', 'out_args': out_args, 'index_field': barcode_field} # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log result['log']['END'] = 'BuildConsensus' printLog(result['log']) return result['out_files']
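# Heavily simplified sketch of a frequency consensus: for each position, take the most
# common character and emit it only if its frequency clears min_freq, otherwise emit N.
# The real consensus building also handles quality scores, gaps, and group-level filters;
# the reads here are assumed to be equal length and are purely illustrative.
from collections import Counter

def frequency_consensus(seqs, min_freq=0.6):
    consensus = []
    for column in zip(*seqs):
        char, count = Counter(column).most_common(1)[0]
        consensus.append(char if count / len(column) >= min_freq else 'N')
    return ''.join(consensus)

reads = ['ACGTAC', 'ACGTTC', 'ACGAAC']
print(frequency_consensus(reads))   # -> 'ACGTAC'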
def updateDbFile(db_file, field, values, updates, out_file=None, out_args=default_out_args): """ Updates the values in a database field Arguments: db_file : the database file name. field : the field to update. values : a list of values specifying which rows to update. updates : a list of values to update each value with. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str : output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'update' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field log['VALUES'] = ','.join(values) log['UPDATES'] = ','.join(updates) printLog(log) # Open input db_handle = open(db_file, 'rt') db_iter = TSVReader(db_handle) out_fields = db_iter.fields __, __, out_args['out_type'] = splitName(db_file) # Open output if out_file is not None: pass_handle = open(out_file, 'w') else: pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = TSVWriter(pass_handle, out_fields) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count, pass_count = 0, 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += 1 # Update values if found for x, y in zip(values, updates): if rec[field] == x: rec[field] = y pass_count += 1 # Write records pass_writer.writeDict(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['UPDATED'] = pass_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() db_handle.close() return pass_handle.name
def selectDbFile(db_file, fields, values, logic='any', regex=False, out_args=default_out_args): """ Selects records from a database file Arguments: db_file = the database file name fields = a list of fields to check for selection criteria values = a list of values defining selection targets logic = one of 'any' or 'all' defining whether one or all fields must have a match. regex = if False do exact full string matches; if True allow partial regex matches. out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ # Define string match function if regex: def _match_func(x, patterns): return any([re.search(p, x) for p in patterns]) else: def _match_func(x, patterns): return x in patterns # Define logic function if logic == 'any': _logic_func = any elif logic == 'all': _logic_func = all # Print console log log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'select' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['VALUES'] = ','.join(values) log['REGEX'] =regex printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = pass_count = fail_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Check for selection values in all fields select = _logic_func([_match_func(rec.get(f, False), values) for f in fields]) # Write sequences if select: pass_count += 1 pass_writer.writerow(rec) else: fail_count += 1 # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['SELECTED'] = pass_count log['DISCARDED'] = fail_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
def tableLog(record_file, fields, out_args=default_out_args): """ Converts a pRESTO log to a table of annotations Arguments: record_file = the log file name fields = the list of fields to output out_args = common output argument dictionary from parseCommonArgs Returns: the output table file name """ log = OrderedDict() log['START'] = 'ParseLog' log['FILE'] = os.path.basename(record_file) printLog(log) # Open file handles log_handle = open(record_file, 'r') out_handle = getOutputHandle(record_file, 'table', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') # Open csv writer and write header out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', delimiter='\t', fieldnames=fields) out_writer.writeheader() # Iterate over log records start_time = time() record = '' rec_count = pass_count = fail_count = 0 for line in log_handle: if line.strip() == '' and record: # Print progress for previous iteration printProgress(rec_count, None, 1e5, start_time) # Parse record block rec_count += 1 record_dict = parseLogRecord(record) # Write records if any([f in fields for f in record_dict]): pass_count += 1 out_writer.writerow(record_dict) elif record_dict: fail_count += 1 # Empty record string record = '' else: # Append to record record += line else: # Write final record if record: record_dict = parseLogRecord(record) if any([f in fields for f in record_dict]): pass_count += 1 out_writer.writerow(record_dict) elif record_dict: fail_count += 1 # Print counts printProgress(rec_count, None, 1e5, start_time, end=True) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ParseLog' printLog(log) # Close file handles log_handle.close() out_handle.close() return out_handle.name
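# Sketch of turning blank-line-separated log records into table rows, assuming each log
# line has the form 'FIELD> value' (the actual parsing above is done by parseLogRecord).
# File paths and field names are illustrative.
import csv

def parse_block(block):
    record = {}
    for line in block.splitlines():
        if '>' in line:
            key, _, value = line.partition('>')
            record[key.strip()] = value.strip()
    return record

def log_to_table(log_path, out_path, fields):
    with open(log_path, 'rt') as src, open(out_path, 'w', newline='') as dst:
        writer = csv.DictWriter(dst, fieldnames=fields, delimiter='\t',
                                restval='', extrasaction='ignore')
        writer.writeheader()
        for block in src.read().split('\n\n'):
            record = parse_block(block)
            if any(f in record for f in fields):
                writer.writerow(record)

# Example: log_to_table('maskprimers.log', 'table.tsv', ['ID', 'PRIMER', 'ERROR'])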
def sortDbFile(db_file, field, numeric=False, descend=False, out_args=default_out_args): """ Sorts records by values in an annotation field Arguments: db_file = the database filename field = the field name to sort by numeric = if True sort field numerically; if False sort field alphabetically descend = if True sort in descending order; if False sort in ascending order out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'sort' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field log['NUMERIC'] = numeric printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file) # Store all records in a dictionary start_time = time() printMessage("Indexing: Running", start_time=start_time) db_dict = {i:r for i, r in enumerate(db_iter)} result_count = len(db_dict) # Sort db_dict by field values tag_dict = {k:v[field] for k, v in db_dict.items()} if numeric: tag_dict = {k:float(v or 0) for k, v in tag_dict.items()} sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend) printMessage("Indexing: Done", start_time=start_time, end=True) # Iterate over records start_time = time() rec_count = 0 for key in sorted_keys: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Write records pass_writer.writerow(db_dict[key]) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}): """ Assembles results from a queue of individual sequence results and manages log/file I/O Arguments: alive = a multiprocessing.Value boolean controlling whether processing continues if False exit process result_queue = a multiprocessing.Queue holding processQueue results collect_queue = a multiprocessing.Queue to store collector return values db_file = the input database file name out_args = common output argument dictionary from parseCommonArgs cluster_func = the function to call for carrying out clustering on distance matrix cluster_args = a dictionary of arguments to pass to cluster_func Returns: None (adds 'log' and 'out_files' to collect_dict) """ # Open output files try: # Count records and define output format out_type = getFileType(db_file) if out_args['out_type'] is None \ else out_args['out_type'] result_count = countDbFile(db_file) # Defined successful output handle pass_handle = getOutputHandle(db_file, out_label='clone-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_type) pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE') # Defined failed alignment output handle if out_args['failed']: fail_handle = getOutputHandle(db_file, out_label='clone-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_type) fail_writer = getDbWriter(fail_handle, db_file) else: fail_handle = None fail_writer = None # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') except: #sys.stderr.write('Exception in collector file opening step\n') alive.value = False raise # Get results from queue and write to files try: #print 'START COLLECT', alive.value # Iterator over results queue until sentinel object reached start_time = time() rec_count = clone_count = pass_count = fail_count = 0 while alive.value: # Get result from queue if result_queue.empty(): continue else: result = result_queue.get() # Exit upon reaching sentinel if result is None: break #print "COLLECT", alive.value, result['id'] # Print progress for previous iteration and update record count if rec_count == 0: print('PROGRESS> Assigning clones') printProgress(rec_count, result_count, 0.05, start_time) rec_count += len(result.data) # Write passed and failed records if result: for clone in result.results.values(): clone_count += 1 for i, rec in enumerate(clone): rec.annotations['CLONE'] = clone_count pass_writer.writerow(rec.toDict()) pass_count += 1 result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction) else: for i, rec in enumerate(result.data): if fail_writer is not None: fail_writer.writerow(rec.toDict()) fail_count += 1 result.log['CLONE0-%i' % (i + 1)] = str(rec.junction) # Write log printLog(result.log, handle=log_handle) else: sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ % os.getpid()) return None # Print total counts printProgress(rec_count, result_count, 0.05, start_time) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close() # Update return list log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['CLONES'] = clone_count log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count collect_dict = {'log':log, 'out_files': [pass_handle.name]} collect_queue.put(collect_dict) except: #sys.stderr.write('Exception in collector result processing step\n') alive.value = False raise return None
def collectPairQueue(alive, result_queue, collect_queue, result_count, seq_file_1, seq_file_2, out_args): """ Pulls from results queue, assembles results and manages log and file IO Arguments: alive = a multiprocessing.Value boolean controlling whether processing continues; when False function returns result_queue = a multiprocessing.Queue holding worker results collect_queue = a multiprocessing.Queue holding collector return values result_count = the number of expected assembled sequences seq_file_1 = the first sequence file name seq_file_2 = the second sequence file name out_args = common output argument dictionary from parseCommonArgs Returns: None (adds a dictionary of {log: log object, out_files: output file names} to collect_queue) """ try: # Count records and define output format out_type = getFileType(seq_file_1) if out_args['out_type'] is None \ else out_args['out_type'] # Defined valid assembly output handle pass_handle = getOutputHandle(seq_file_1, 'assemble-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_type) # Defined failed assembly output handles if out_args['failed']: # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] fail_handle_1 = getOutputHandle(seq_file_1, 'assemble-fail', out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type) fail_handle_2 = getOutputHandle(seq_file_2, 'assemble-fail', out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type) else: fail_handle_1 = None fail_handle_2 = None # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') except: alive.value = False raise try: # Iterator over results queue until sentinel object reached start_time = time() iter_count = pass_count = fail_count = 0 while alive.value: # Get result from queue if result_queue.empty(): continue else: result = result_queue.get() # Exit upon reaching sentinel if result is None: break # Print progress for previous iteration printProgress(iter_count, result_count, 0.05, start_time) # Update counts for iteration iter_count += 1 # Write log printLog(result.log, handle=log_handle) # Write assembled sequences if result: pass_count += 1 SeqIO.write(result.results, pass_handle, out_type) else: fail_count += 1 if fail_handle_1 is not None and fail_handle_2 is not None: SeqIO.write(result.data[0], fail_handle_1, out_type) SeqIO.write(result.data[1], fail_handle_2, out_type) else: sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \ % os.getpid()) return None # Print total counts printProgress(iter_count, result_count, 0.05, start_time) # Update return values log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['PAIRS'] = iter_count log['PASS'] = pass_count log['FAIL'] = fail_count collect_dict = {'log':log, 'out_files': [pass_handle.name]} collect_queue.put(collect_dict) # Close file handles pass_handle.close() if fail_handle_1 is not None: fail_handle_1.close() if fail_handle_2 is not None: fail_handle_2.close() if log_handle is not None: log_handle.close() except: alive.value = False raise return None
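# Minimal sketch of the queue pattern shared by the collector functions above: workers
# put results on a queue, the collector drains it until it sees the None sentinel, and a
# final summary goes back on a second queue. This is a toy illustration, not the
# manageProcesses machinery.
import multiprocessing as mp

def worker(task_queue, result_queue):
    while True:
        item = task_queue.get()
        if item is None:                 # sentinel: stop working
            result_queue.put(None)       # forward sentinel to the collector
            break
        result_queue.put(item * item)

def collector(result_queue, collect_queue):
    passed = []
    while True:
        result = result_queue.get()
        if result is None:               # sentinel: all results received
            break
        passed.append(result)
    collect_queue.put({'count': len(passed), 'results': passed})

if __name__ == '__main__':
    tasks, results, collected = mp.Queue(), mp.Queue(), mp.Queue()
    w = mp.Process(target=worker, args=(tasks, results))
    c = mp.Process(target=collector, args=(results, collected))
    w.start(); c.start()
    for i in range(5):
        tasks.put(i)
    tasks.put(None)                      # one sentinel for the single worker
    print(collected.get())               # {'count': 5, 'results': [0, 1, 4, 9, 16]}
    w.join(); c.join()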
def convertHeaders(seq_file, convert_func, convert_args={}, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
      seq_file = the sequence file name
      convert_func = the function used to convert sequence headers
      convert_args = a dictionary of arguments to pass to convert_func
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output sequence file name
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic', convert454Header: '454',
                convertGenbankHeader: 'genbank', convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt', convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count records
    result_count = countSeqFile(seq_file)

    # Open output file handles
    pass_handle = getOutputHandle(seq_file, 'convert-pass', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type=out_args['out_type'])
    if out_args['failed']:
        fail_handle = getOutputHandle(seq_file, 'convert-fail', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=out_args['out_type'])
    else:
        fail_handle = None

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter': out_args['delimiter']})

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if fail_handle is not None:
                # Write unconverted sequences
                SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:  fail_handle.close()

    return pass_handle.name
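# Usage sketch (illustrative only; not called in this module). Converts Illumina-style
# headers in a hypothetical FASTQ file to the pRESTO annotation format by passing the
# appropriate header parser from the subcommand dictionary above; the input path is a
# placeholder.
def _example_convertHeaders():
    out = convertHeaders('run1_reads.fastq', convertIlluminaHeader,
                         convert_args={}, out_args=default_out_args)
    print('Converted headers written to %s' % out)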
def convertToBaseline(db_file, id_field=default_id_field, seq_field=default_seq_field,
                      germ_field=default_germ_field, cluster_field=None,
                      meta_fields=None, out_file=None, out_args=default_out_args):
    """
    Builds a clip-fasta file from database records

    Arguments:
      db_file : the database file name.
      id_field : the field containing identifiers.
      seq_field : the field containing sample sequences.
      germ_field : the field containing germline sequences.
      cluster_field : the field containing clonal groupings;
                      if None write the germline for each record.
      meta_fields : a list of fields to add to sequence annotations.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'baseline'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None:  log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    result_count = countDbFile(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type='clip')

    # Iterate over records
    start_time = time()
    rec_count, germ_count, pass_count, fail_count = 0, 0, 0, 0
    cluster_last = None
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Update cluster ID
        cluster = rec.get(cluster_field, None)

        # Get germline SeqRecord when needed
        if cluster_field is None:
            germ = buildSeqRecord(rec, id_field, germ_field, meta_fields)
            germ.id = '>' + germ.id
        elif cluster != cluster_last:
            germ = buildSeqRecord(rec, cluster_field, germ_field)
            germ.id = '>' + germ.id
        else:
            germ = None

        # Get read SeqRecord
        seq = buildSeqRecord(rec, id_field, seq_field, meta_fields)

        # Write germline
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Set last cluster ID
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
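# Usage sketch (illustrative only; not called in this module). Writes a clip-fasta file
# for a hypothetical Change-O table grouped by clone, pairing each clonal group with a
# single germline record. The input path, output name and field names are assumptions.
def _example_convertToBaseline():
    out = convertToBaseline('sample_db-pass.tab',
                            id_field='SEQUENCE_ID',
                            seq_field='SEQUENCE_IMGT',
                            germ_field='GERMLINE_IMGT_D_MASK',
                            cluster_field='CLONE',
                            out_file='sample_baseline.clip')
    print('Wrote %s' % out)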
def convertToFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_file=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
      db_file : the database file name.
      id_field : the field containing identifiers.
      seq_field : the field containing sequences.
      meta_fields : a list of fields to add to sequence annotations.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    if meta_fields is not None:  log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open input
    out_type = 'fasta'
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    result_count = countDbFile(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=out_type)

    # Iterate over records
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Get SeqRecord
        seq = buildSeqRecord(rec, id_field, seq_field, meta_fields)

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, out_type)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
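# Usage sketch (illustrative only; not called in this module). Writes a FASTA file of
# the IMGT-gapped sequence column from a hypothetical Change-O table, carrying the
# clone ID into the sequence header. The input path, output name and field names are
# assumptions, not values defined by this module.
def _example_convertToFasta():
    out = convertToFasta('sample_db-pass.tab',
                         id_field='SEQUENCE_ID',
                         seq_field='SEQUENCE_IMGT',
                         meta_fields=['CLONE'],
                         out_file='sample_sequences.fasta')
    print('Wrote %s' % out)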
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates the values of a field in a database file

    Arguments:
      db_file = the database file name
      field = the field to update
      values = a list of values specifying which rows to update
      updates = a list of values to update each value with
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update value if found
        for x, y in zip(values, updates):
            if rec[field] == x:
                rec[field] = y
                pass_count += 1

        # Write records
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
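# Usage sketch (illustrative only; not called in this module). Renames two labels in a
# hypothetical SAMPLE column; values and updates are matched positionally, so 'day0'
# becomes 'baseline' and 'day7' becomes 'followup'. The file path and field name are
# placeholders.
def _example_updateDbFile():
    out = updateDbFile('sample_db-pass.tab', 'SAMPLE',
                       values=['day0', 'day7'],
                       updates=['baseline', 'followup'])
    print('Updated database written to %s' % out)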
def sortSeqFile(seq_file, field, numeric=False, max_count=None, out_args=default_out_args):
    """
    Sorts a sequence file by annotation fields

    Arguments:
      seq_file : filename of the sequence file to sort
      field : the annotation field to sort sequences by
      numeric : if True sort the field numerically;
                if False sort the field alphabetically
      max_count : maximum number of records in each output file;
                  if None do not create multiple files
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Get annotations and sort seq_dict by annotation values
    tag_dict = {k: parseAnnotation(seq_dict[k].description, delimiter=out_args['delimiter'])[field]
                for k in seq_dict}
    if numeric:  tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get)

    # Determine total numbers of records
    rec_count = len(seq_dict)
    if max_count is not None and max_count >= rec_count:  max_count = None

    # Open initial output file handles
    file_count = 1
    if max_count is None:
        out_label = 'sorted'
    else:
        out_label = 'sorted-part%06i' % file_count
    out_handle = getOutputHandle(seq_file, out_label, out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_args['out_type'])
    out_files = [out_handle.name]

    # Loop through sorted sequence dictionary keys
    start_time = time()
    last_tag = None
    saved_keys = []
    seq_count = chunk_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration and update count
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        seq_count += 1

        # Write saved group of sequences when tag changes
        if last_tag is not None and tag_dict[key] != last_tag:
            # Open new output file if needed
            if max_count is not None and chunk_count + len(saved_keys) > max_count:
                # Update partition counts
                file_count += 1
                chunk_count = 0
                # Open new file handle
                out_handle.close()
                out_handle = getOutputHandle(seq_file, 'sorted-part%06i' % file_count,
                                             out_dir=out_args['out_dir'],
                                             out_name=out_args['out_name'],
                                             out_type=out_args['out_type'])
                # Append output file name to out_files
                out_files.append(out_handle.name)

            # Write saved sequences
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])

            # Reset saved keys to current key only
            saved_keys = [key]
        else:
            # Update list of saved keys if tag is unchanged
            saved_keys.append(key)

        # Check if total records reached, write all saved keys, and exit loop
        if seq_count == rec_count:
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            out_handle.close()
            break

        # Update tag tracker
        last_tag = tag_dict[key]

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = seq_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
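# Usage sketch (illustrative only; not called in this module). Sorts a hypothetical
# FASTQ file numerically by its DUPCOUNT annotation and splits the sorted output into
# parts of at most 10000 records; the path and field name are placeholders.
def _example_sortSeqFile():
    parts = sortSeqFile('collapsed_reads.fastq', 'DUPCOUNT',
                        numeric=True, max_count=10000)
    print('Wrote %i sorted file(s)' % len(parts))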
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type, out_args=default_out_args):
    """
    Pairs mate records between two sequence files and copies annotations between them

    Arguments:
      seq_file_1 = the file containing the grouped sequences and annotations
      seq_file_2 = the file to assign annotations to from seq_file_1
      fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations
      fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations
      coord_type = the sequence header format
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2)
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_2, out_type=out_type_2)
    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None:  pass_keys.append(coord_2)
            else:  SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:  SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
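# Usage sketch (illustrative only; not called in this module). Pairs hypothetical
# Illumina mate files and copies the BARCODE annotation from the read 1 records onto
# the matching read 2 records; the paths, the field name and the coordinate type are
# assumptions for illustration.
def _example_pairSeq():
    pairs = pairSeq('reads_R1.fastq', 'reads_R2.fastq',
                    fields_1=['BARCODE'], coord_type='illumina')
    print('Paired outputs: %s' % str(pairs))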
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
      db_file = filename of the tab-delimited database file to split
      field = the field name by which to split db_file
      num_split = the numerical threshold by which to group sequences;
                  if None treat field as textual
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open IgRecord reader iter object
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filename and replacements
        noGood = {'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's', ':': 'c',
                  '|': 'pi', '"': 'dq', "'": 'sq', '<': 'gt', '>': 'lt', ' ': '_'}
        # Replace forbidden characters in tag_list
        tag_dict = {}
        for tag in tag_list:
            for c, r in noGood.items():
                tag_dict[tag] = (tag_dict.get(tag, tag).replace(c, r) \
                                 if c in tag else tag_dict.get(tag, tag))

        # Create output handles
        handles_dict = {tag: getOutputHandle(db_file,
                                             '%s-%s' % (field, label),
                                             out_type=out_args['out_type'],
                                             out_name=out_args['out_name'],
                                             out_dir=out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag: getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {'under': getOutputHandle(db_file,
                                                 'under-%.1f' % num_split,
                                                 out_type=out_args['out_type'],
                                                 out_name=out_args['out_name'],
                                                 out_dir=out_args['out_dir']),
                        'atleast': getOutputHandle(db_file,
                                                   'atleast-%.1f' % num_split,
                                                   out_type=out_args['out_type'],
                                                   out_name=out_args['out_name'],
                                                   out_dir=out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under': getDbWriter(handles_dict['under'], db_file),
                        'atleast': getDbWriter(handles_dict['atleast'], db_file)}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict:  handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
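# Usage sketch (illustrative only; not called in this module). Splits a hypothetical
# database once by a textual field (one output file per unique PRCONS value) and once
# by a numeric threshold on DUPCOUNT; the field names and file path are placeholders.
def _example_splitDbFile():
    by_primer = splitDbFile('sample_db-pass.tab', 'PRCONS')
    by_count = splitDbFile('sample_db-pass.tab', 'DUPCOUNT', num_split=2)
    return by_primer + by_count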
def insertGaps(db_file, references=None, format=default_format,
               out_file=None, out_args=default_out_args):
    """
    Inserts IMGT numbering into V fields

    Arguments:
      db_file : the database file name.
      references : folder with germline repertoire files. If None, do not update
                   alignment columns with IMGT gaps.
      format : input format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'imgt'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['sequence_imgt', 'v_germ_start_imgt']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Load references
    reference_dict = readGermlines(references)

    # Check for IMGT-gaps in germlines
    if all('...' not in x for x in reference_dict.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Open output writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='gap', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=schema.out_type)
    pass_writer = writer(pass_handle, fields=db_iter.fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Update IMGT fields
        imgt_dict = correctIMGTFields(rec, reference_dict)
        # Write records
        if imgt_dict is not None:
            pass_count += 1
            rec.setDict(imgt_dict, parse=False)
            pass_writer.writeReceptor(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = rec_count - pass_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
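# Usage sketch (illustrative only; not called in this module). Re-inserts IMGT
# numbering gaps using a directory of IMGT-gapped germline references; the database
# path and the reference directory are hypothetical placeholders.
def _example_insertGaps():
    out = insertGaps('sample_db-pass.tsv',
                     references='germlines/imgt/human/vdj',
                     format=default_format)
    print('Gapped database written to %s' % out)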
def convertToChangeo(db_file, out_file=None, out_args=default_out_args):
    """
    Converts an AIRR-formatted file into a Change-O-formatted file

    Arguments:
      db_file : the database file name.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'changeo'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = AIRRReader(db_handle)

    # Set output fields replacing length with end fields
    in_fields = [AIRRSchema.toReceptor(f) for f in db_iter.fields]
    out_fields = []
    for f in in_fields:
        out_fields.append(f)
        if f in ReceptorData.end_fields and ReceptorData.end_fields[f][0] in in_fields:
            out_fields.append(ReceptorData.end_fields[f][1])
    out_fields = list(OrderedDict.fromkeys(out_fields))
    out_fields = [ChangeoSchema.fromReceptor(f) for f in out_fields]

    # Open output writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='changeo', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=ChangeoSchema.out_type)
    pass_writer = ChangeoWriter(pass_handle, fields=out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write records
        pass_writer.writeReceptor(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
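# Usage sketch (illustrative only; not called in this module). Converts a hypothetical
# AIRR rearrangement TSV into the Change-O tab-delimited format; the input and output
# paths are placeholders.
def _example_convertToChangeo():
    out = convertToChangeo('sample_airr.tsv', out_file='sample_changeo.tab')
    print('Change-O database written to %s' % out)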