def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, out_args=default_out_args): """ Main for IgBLAST aligned sample sequences. Arguments: aligner_output : IgBLAST output file to process. seq_file : fasta file input to IgBlast (from which to get sequence). repo : folder with germline repertoire files. no_parse : if ID is to be parsed for pRESTO output with default delimiters. partial : If True put incomplete alignments in the pass file. parse_regions : if True add FWR and CDR fields to output file. parse_scores : if True add alignment score fields to output file. parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST out_args : common output argument dictionary from parseCommonArgs. Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['ALIGNER'] = 'IgBlast' log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse log['PARTIAL'] = partial log['SCORES'] = parse_scores log['REGIONS'] = parse_regions printLog(log) start_time = time() printMessage('Loading sequence files', start_time=start_time, width=25) # Count records in sequence file total_count = countSeqFile(seq_file) # Get input sequence dictionary seq_dict = getSeqDict(seq_file) # Create germline repo dictionary repo_dict = readRepo(repo) printMessage('Done', start_time=start_time, end=True, width=25) # Parse and write output with open(aligner_output, 'r') as f: parse_iter = IgBLASTReader(f, seq_dict, repo_dict, parse_scores=parse_scores, parse_regions=parse_regions, parse_igblast_cdr3=parse_igblast_cdr3) file_prefix = getFilePrefix(aligner_output, out_args) writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, no_parse=no_parse, partial=partial, out_args=out_args) return None
def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, region_fields=False, out_args=default_out_args): """ Main for IgBlast aligned sample sequences. Arguments: igblast_output = IgBlast output file to process seq_file = fasta file input to IgBlast (from which to get sequence) repo = folder with germline repertoire files no_parse = if ID is to be parsed for pRESTO output with default delimiters score_fields = if True add alignment score fields to output file region_fields = if True add FWR and CDR region fields to output file out_args = common output argument dictionary from parseCommonArgs Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['ALIGNER'] = 'IgBlast' log['ALIGN_RESULTS'] = os.path.basename(igblast_output) log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse log['SCORE_FIELDS'] = score_fields log['REGION_FIELDS'] = region_fields printLog(log) # Get input sequence dictionary seq_dict = getSeqforIgBlast(seq_file) # Formalize out_dir and file prefix if not out_args['out_dir']: out_dir = os.path.split(igblast_output)[0] else: out_dir = os.path.abspath(out_args['out_dir']) if not os.path.exists(out_dir): os.mkdir(out_dir) if out_args['out_name']: file_prefix = out_args['out_name'] else: file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) file_prefix = os.path.join(out_dir, file_prefix) total_count = countSeqFile(seq_file) # Create germline repo dictionary repo_dict = getRepo(repo) igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, score_fields=score_fields, region_fields=region_fields) writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, score_fields=score_fields, region_fields=region_fields, out_args=out_args)
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args): """ Modifies sequence headers Arguments: seq_file = the sequence file name modify_func = the function defining the modification operation modify_args = a dictionary of arguments to pass to modify_func out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ # Define subcommand label dictionary cmd_dict = {addHeader: 'add', copyHeader: 'copy', collapseHeader: 'collapse', deleteHeader: 'delete', expandHeader: 'expand', renameHeader: 'rename'} # Print parameter info log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__) log['FILE'] = os.path.basename(seq_file) for k in sorted(modify_args): v = modify_args[k] log[k.upper()] = ','.join(v) if isinstance(v, list) else v printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Count records result_count = countSeqFile(seq_file) # Iterate over sequences start_time = time() seq_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time) #Update counts seq_count += 1 # Modify header header = parseAnnotation(seq.description, delimiter=out_args['delimiter']) header = modify_func(header, delimiter=out_args['delimiter'], **modify_args) # Write new sequence seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter']) seq.description = '' SeqIO.write(seq, out_handle, out_args['out_type']) # print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
def downsizeSeqFile(seq_file, max_count, out_args=default_out_args): """ Splits a FASTA/FASTQ file into segments with a limited number of records Arguments: seq_file : filename of the FASTA file to split max_count : number of records in each output file out_args : common output argument dictionary from parseCommonArgs Returns: list: output file names """ log = OrderedDict() log['START'] = 'SplitSeq' log['COMMAND'] = 'count' log['FILE'] = os.path.basename(seq_file) log['MAX_COUNT'] = max_count printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type # Determine total numbers of records rec_count = countSeqFile(seq_file) # Loop through iterator writing each record and opening new output handle as needed start_time = time() seq_count, part_num = 0, 1 out_handle = getOutputHandle(seq_file, 'part%06i' % part_num, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) out_files = [out_handle.name] for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, rec_count, 0.05, start_time=start_time) # Update count seq_count += 1 # Write records SeqIO.write(seq, out_handle, out_args['out_type']) # Break if total records reached to avoid extra empty file if seq_count == rec_count: break # Open new file if needed if seq_count % max_count == 0: out_handle.close() part_num += 1 out_handle = getOutputHandle(seq_file, 'part%06i' % part_num, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) out_files.append(out_handle.name) # Print log printProgress(seq_count, rec_count, 0.05, start_time=start_time) log = OrderedDict() for i, f in enumerate(out_files): log['OUTPUT%i' % (i + 1)] = os.path.basename(f) log['SEQUENCES'] = rec_count log['PARTS'] = len(out_files) log['END'] = 'SplitSeq' printLog(log) # Close file handles out_handle.close() return out_files
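# Usage sketch (illustrative only; 'reads.fastq' is a hypothetical input file and the pRESTO
# defaults above are assumed): split a FASTQ file into parts of at most 10,000 records each
# and report how many part files were written.
def _example_downsize():
    parts = downsizeSeqFile('reads.fastq', max_count=10000)
    print('Wrote %i part files' % len(parts))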
def groupSeqFile(seq_file, field, threshold=None, out_args=default_out_args): """ Divides a sequence file into segments by description tags Arguments: seq_file : filename of the sequence file to split field : The annotation field to split seq_file by threshold : The numerical threshold for group sequences by; if None treat field as textual out_args : common output argument dictionary from parseCommonArgs Returns: list: output file names """ log = OrderedDict() log['START'] = 'SplitSeq' log['COMMAND'] = 'group' log['FILE'] = os.path.basename(seq_file) log['FIELD'] = field log['THRESHOLD'] = threshold printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type # Determine total numbers of records rec_count = countSeqFile(seq_file) # Process sequences start_time = time() seq_count = 0 if threshold is None: # Sort records into files based on textual field # Create set of unique field tags temp_iter = readSeqFile(seq_file) tag_list = getAnnotationValues(temp_iter, field, unique=True, delimiter=out_args['delimiter']) if sys.platform != 'win32': import resource # Increase open file handle limit if needed file_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0] file_count = len(tag_list) + 256 if file_limit < file_count and file_count <= 8192: #print file_limit, file_count resource.setrlimit(resource.RLIMIT_NOFILE, (file_count, file_count)) elif file_count > 8192: e = '''OS file limit would need to be set to %i. If you are sure you want to do this, then increase the file limit in the OS (via ulimit) and rerun this tool. ''' % file_count printError(dedent(e)) # Create output handles # out_label = '%s=%s' % (field, tag) handles_dict = { tag: getOutputHandle(seq_file, '%s-%s' % (field, tag), out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) for tag in tag_list } # Iterate over sequences for seq in seq_iter: printProgress(seq_count, rec_count, 0.05, start_time=start_time) seq_count += 1 # Write sequences tag = parseAnnotation(seq.description, delimiter=out_args['delimiter'])[field] SeqIO.write(seq, handles_dict[tag], out_args['out_type']) else: # Sort records into files based on numeric threshold threshold = float(threshold) # Create output handles handles_dict = { 'under': getOutputHandle(seq_file, 'under-%.1g' % threshold, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']), 'atleast': getOutputHandle(seq_file, 'atleast-%.1g' % threshold, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) } # Iterate over sequences for seq in seq_iter: printProgress(seq_count, rec_count, 0.05, start_time=start_time) seq_count += 1 # Write sequences tag = parseAnnotation(seq.description, delimiter=out_args['delimiter'])[field] tag = 'under' if float(tag) < threshold else 'atleast' SeqIO.write(seq, handles_dict[tag], out_args['out_type']) # Print log printProgress(seq_count, rec_count, 0.05, start_time=start_time) log = OrderedDict() for i, k in enumerate(handles_dict): log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name) log['SEQUENCES'] = rec_count log['PARTS'] = len(handles_dict) log['END'] = 'SplitSeq' printLog(log) # Close output file handles for k in handles_dict: handles_dict[k].close() return [handles_dict[k].name for k in handles_dict]
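# Standalone sketch of the open-file-limit bump used by groupSeqFile (POSIX only; the +256
# slack mirrors the value above, and the guard against the hard limit is an added safety not
# present in the original). One output handle stays open per annotation value, so splitting
# on a high-cardinality field can exceed the default soft RLIMIT_NOFILE.
def _example_raise_nofile(tag_count):
    import resource
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    needed = tag_count + 256
    if soft < needed <= hard:
        # Raise the soft limit; the hard limit cannot be raised without privileges
        resource.setrlimit(resource.RLIMIT_NOFILE, (needed, hard))
    return resource.getrlimit(resource.RLIMIT_NOFILE)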
def collectPairQueue(alive, result_queue, collect_queue, seq_file_1, seq_file_2, label, out_file=None, out_args=default_out_args): """ Pulls from results queue, assembles results and manages log and file IO Arguments: alive : a multiprocessing.Value boolean controlling whether processing continues; when False function returns. result_queue : a multiprocessing.Queue holding worker results. collect_queue : a multiprocessing.Queue holding collector return values. seq_file_1 : the first sequence file name. seq_file_2 : the second sequence file name. label : task label used to tag the output files. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: None: adds a dictionary of {log: log object, out_files: output file names} to collect_queue. """ # Define output format out_type = getFileType(seq_file_1) if out_args['out_type'] is None \ else out_args['out_type'] # Define output names if out_args['out_name'] is None: out_name_1, out_name_2 = None, None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Wrapper for opening handles and writers def _open(x, in_file, out_name, out_file=out_file): if out_file is not None and x == 'pass': handle = open(out_file, 'w') else: handle = getOutputHandle(in_file, out_label='%s-%s' % (label, x), out_dir=out_args['out_dir'], out_name=out_name, out_type=out_type) return handle try: # Count input size result_count = countSeqFile(seq_file_1) # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') except: alive.value = False raise try: # Initialize file handles pass_handle, fail_handle_1, fail_handle_2 = None, None, None # Iterate over results queue until sentinel object reached start_time = time() iter_count = pass_count = fail_count = 0 while alive.value: # Get result from queue if result_queue.empty(): continue else: result = result_queue.get() # Exit upon reaching sentinel if result is None: break # Print progress for previous iteration printProgress(iter_count, result_count, 0.05, start_time=start_time) # Update counts for iteration iter_count += 1 # Write log printLog(result.log, handle=log_handle) # Write assembled sequences if result: pass_count += 1 try: SeqIO.write(result.results, pass_handle, out_type) except AttributeError: # Open pass file pass_handle = _open('pass', seq_file_1, out_args['out_name']) SeqIO.write(result.results, pass_handle, out_type) else: fail_count += 1 if out_args['failed']: try: SeqIO.write(result.data[0], fail_handle_1, out_type) SeqIO.write(result.data[1], fail_handle_2, out_type) except AttributeError: # Open fail files fail_handle_1 = _open('fail', seq_file_1, out_name_1) fail_handle_2 = _open('fail', seq_file_2, out_name_2) SeqIO.write(result.data[0], fail_handle_1, out_type) SeqIO.write(result.data[1], fail_handle_2, out_type) else: sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' % os.getpid()) return None # Print total counts printProgress(iter_count, result_count, 0.05, start_time=start_time) # Update return values log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None log['PAIRS'] = iter_count log['PASS'] = pass_count log['FAIL'] = fail_count # Close file handles and generate return data collect_dict = {'log': log, 'out_files': []} if pass_handle is not None: collect_dict['out_files'].append(pass_handle.name) pass_handle.close() if fail_handle_1 is not None: collect_dict['out_files'].append(fail_handle_1.name) fail_handle_1.close() if fail_handle_2 is not None: collect_dict['out_files'].append(fail_handle_2.name) fail_handle_2.close() if log_handle is not None: log_handle.close() collect_queue.put(collect_dict) except: alive.value = False raise return None
def collectSeqQueue(alive, result_queue, collect_queue, seq_file, label, index_field=None, out_file=None, out_args=default_out_args): """ Pulls from results queue, assembles results and manages log and file IO Arguments: alive : a multiprocessing.Value boolean controlling whether processing continues; when False function returns. result_queue : a multiprocessing.Queue holding worker results. collect_queue : a multiprocessing.Queue holding collector return values. seq_file : sample sequence file name. label : task label used to tag the output files. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. index_field : field defining set membership for sequence sets; if None, the data queue contains individual records. Returns: None: adds a dictionary to collect_queue containing 'log' (a log object) and 'out_files' (the output file names). """ # Define output format out_type = getFileType(seq_file) if out_args['out_type'] is None \ else out_args['out_type'] # Wrapper for opening handles and writers def _open(x, label=label, out_file=out_file): if out_file is not None and x == 'pass': handle = open(out_file, 'w') else: handle = getOutputHandle(seq_file, out_label='%s-%s' % (label, x), out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_type) return handle try: # Count records if index_field is None: result_count = countSeqFile(seq_file) else: result_count = countSeqSets(seq_file, index_field, out_args['delimiter']) # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') except: alive.value = False raise try: # Initialize output handles pass_handle, fail_handle = None, None # Iterate over results queue until sentinel object reached start_time = time() set_count = seq_count = pass_count = fail_count = 0 while alive.value: # Get result from queue if result_queue.empty(): continue else: result = result_queue.get() # Exit upon reaching sentinel if result is None: break # Print progress for previous iteration printProgress(set_count, result_count, 0.05, start_time=start_time) # Update counts for current iteration set_count += 1 seq_count += result.data_count # Write log printLog(result.log, handle=log_handle) # Write records if result: pass_count += 1 try: SeqIO.write(result.results, pass_handle, out_type) except AttributeError: # Open pass file pass_handle = _open('pass') SeqIO.write(result.results, pass_handle, out_type) else: fail_count += 1 if out_args['failed']: try: SeqIO.write(result.data, fail_handle, out_type) except AttributeError: # Open fail file fail_handle = _open('fail') SeqIO.write(result.data, fail_handle, out_type) else: sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' % os.getpid()) return None # Print total counts printProgress(set_count, result_count, 0.05, start_time=start_time) # Update return values log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None log['SEQUENCES'] = seq_count if index_field is not None: log['SETS'] = set_count log['PASS'] = pass_count log['FAIL'] = fail_count # Close file handles and generate return data collect_dict = {'log': log, 'out_files': []} if pass_handle is not None: collect_dict['out_files'].append(pass_handle.name) pass_handle.close() if fail_handle is not None: collect_dict['out_files'].append(fail_handle.name) fail_handle.close() if log_handle is not None: log_handle.close() collect_queue.put(collect_dict) except: alive.value = False raise return None
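# Toy illustration of the collector pattern used by collectPairQueue and collectSeqQueue: a
# producer pushes results onto a queue and the collector drains it until the None sentinel
# arrives. All names here are local to the example; no pRESTO objects are involved.
def _example_sentinel_collector():
    import multiprocessing as mp
    queue = mp.Queue()
    for x in [1, 2, 3, None]:  # None is the sentinel marking end of input
        queue.put(x)
    total = 0
    while True:
        result = queue.get()
        if result is None:  # exit upon reaching sentinel
            break
        total += result
    return total  # 6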
def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False, asis_id=True, asis_calls=False, extended=False, regions='default', format='changeo', out_file=None, out_args=default_out_args): """ Main for IgBLAST aligned sample sequences. Arguments: aligner_file (str): IgBLAST output file to process. seq_file (str): fasta file input to IgBlast (from which to get sequence). repo (str): folder with germline repertoire files. amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False. partial : If True put incomplete alignments in the pass file. asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters. asis_calls (bool): if True do not parse gene calls for allele names. extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output. regions (str): name of the IMGT FWR/CDR region definitions to use. format (str): output format. one of 'changeo' or 'airr'. out_file (str): output file name. Automatically generated from the input file if None. out_args (dict): common output argument dictionary from parseCommonArgs. Returns: dict : names of the 'pass' and 'fail' output files. """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast' log['ALIGNER_FILE'] = os.path.basename(aligner_file) log['SEQ_FILE'] = os.path.basename(seq_file) log['ASIS_ID'] = asis_id log['ASIS_CALLS'] = asis_calls log['PARTIAL'] = partial log['EXTENDED'] = extended printLog(log) # Set amino acid conditions if amino_acid: format = '%s-aa' % format parser = IgBLASTReaderAA else: parser = IgBLASTReader # Start start_time = time() printMessage('Loading files', start_time=start_time, width=20) # Count records in sequence file total_count = countSeqFile(seq_file) # Get input sequence dictionary seq_dict = getSeqDict(seq_file) # Create germline repo dictionary references = readGermlines(repo, asis=asis_calls) # Load supplementary annotation table if cellranger_file is not None: f = cellranger_extended if extended else cellranger_base annotations = readCellRanger(cellranger_file, fields=f) else: annotations = None printMessage('Done', start_time=start_time, end=True, width=20) # Check for IMGT-gaps in germlines if all('...' not in x for x in references.values()): printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') # Define format operators try: __, writer, schema = getFormatOperators(format) except ValueError: printError('Invalid format %s.' % format) out_args['out_type'] = schema.out_type # Define output fields fields = list(schema.required) if extended: custom = parser.customFields(schema=schema) fields.extend(custom) # Parse and write output with open(aligner_file, 'r') as f: parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls) germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter) output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id, regions=regions, writer=writer, out_file=out_file, out_args=out_args) return output
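# Usage sketch for the function above (hypothetical file paths; assumes igblastn was run with
# the tabular output this parser expects and that the germline folder exists): convert IgBLAST
# output to an AIRR-format database with extended score/region annotations.
def _example_parse_igblast():
    out = parseIgBLAST('sample_igblast.fmt7', 'sample.fasta', 'germlines/',
                       extended=True, format='airr')
    return out  # names of the 'pass' and 'fail' output files, per the docstring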
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None, coord_type=default_coord_type, out_args=default_out_args): """ Synchronizes paired-end files and copies annotations between them Arguments: seq_file_1 = the file containing the grouped sequences and annotations seq_file_2 = the file to assign annotations to from seq_file_1 fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records; if None do not copy any annotations fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records; if None do not copy any annotations coord_type = the sequence header format out_args = common output argument dictionary from parseCommonArgs Returns: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2) """ # Define private functions def _key_func(x): return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter']) log = OrderedDict() log['START'] = 'PairSeq' log['FILE1'] = os.path.basename(seq_file_1) log['FILE2'] = os.path.basename(seq_file_2) log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None log['COORD_TYPE'] = coord_type printLog(log) # Define output type if out_args['out_type'] is None: out_type_1 = getFileType(seq_file_1) out_type_2 = getFileType(seq_file_2) else: out_type_1 = out_type_2 = out_args['out_type'] # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Open and count files start_time = time() printMessage("Indexing files", start_time=start_time) # Index file 1 seq_count_1 = countSeqFile(seq_file_1) seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func) # Define file 2 iterator seq_count_2 = countSeqFile(seq_file_2) seq_iter_2 = readSeqFile(seq_file_2, index=False) printMessage("Done", start_time=start_time, end=True) # Open output file handles pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) if out_args['failed']: fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) pass_keys = list() # Iterate over pairs and write to output files start_time = time() rec_count = pair_count = 0 for seq_2 in seq_iter_2: # Print progress for previous iteration printProgress(rec_count, seq_count_2, 0.05, start_time) rec_count += 1 # Check for file 2 mate pair in file 1 coord_2 = getCoordKey(seq_2.id, coord_type=coord_type, delimiter=out_args['delimiter']) seq_1 = seq_dict_1.get(coord_2, None) if seq_1 is not None: # Record paired keys pair_count += 1 if fields_1 is not None or fields_2 is not None: ann_1 = parseAnnotation(seq_1.description, delimiter=out_args['delimiter']) ann_2 = parseAnnotation(seq_2.description, delimiter=out_args['delimiter']) # Prepend annotations from seq_1 to seq_2 if fields_1 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() if k in fields_1]) merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True, delimiter=out_args['delimiter']) seq_2.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_2.description = '' # Append annotations from seq_2 to seq_1 if fields_2 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() if k in fields_2]) merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False, delimiter=out_args['delimiter']) seq_1.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_1.description = '' # Write paired records SeqIO.write(seq_1, pass_handle_1, out_type_1) SeqIO.write(seq_2, pass_handle_2, out_type_2) # Write unpaired file 2 records and update paired key list for finding unpaired file 1 records if out_args['failed']: if seq_1 is not None: pass_keys.append(coord_2) else: SeqIO.write(seq_2, fail_handle_2, out_type_2) # Print final progress printProgress(rec_count, seq_count_2, 0.05, start_time) # Find and write unpaired file 1 records if out_args['failed']: start_time = time() printMessage("Finding unpaired", start_time=start_time) # Find file 1 unpaired keys pass_keys = set(pass_keys) unpaired = set(seq_dict_1).difference(pass_keys) # Write unpaired file 1 records for k in unpaired: SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1) printMessage("Done", start_time=start_time, end=True) # Print log log = OrderedDict() log['OUTPUT1'] = os.path.basename(pass_handle_1.name) log['OUTPUT2'] = os.path.basename(pass_handle_2.name) log['SEQUENCES1'] = seq_count_1 log['SEQUENCES2'] = seq_count_2 log['PASS'] = pair_count log['END'] = 'PairSeq' printLog(log) # Close file handles pass_handle_1.close() pass_handle_2.close() return [(pass_handle_1.name, pass_handle_2.name)]
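# Toy version of the pairing strategy above: index file 1 records by coordinate key, then
# stream file 2 and look up each mate. Plain dicts and strings stand in for the Biopython
# index and SeqRecords; all data below is made up.
def _example_pair_by_coord():
    file_1 = {'read1': 'ACGT', 'read2': 'TTGG'}      # coordinate -> mate sequence
    file_2 = [('read2', 'CCAA'), ('read3', 'GGTA')]  # streamed (coordinate, sequence)
    pairs, unpaired_2 = [], []
    for coord, seq in file_2:
        mate = file_1.get(coord)
        if mate is not None:
            pairs.append((coord, mate, seq))
        else:
            unpaired_2.append(coord)
    return pairs, unpaired_2  # ([('read2', 'TTGG', 'CCAA')], ['read3'])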
def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args): """ Converts sequence headers to the pRESTO format Arguments: seq_file : the sequence file name. convert_func : the function used to convert sequence headers. convert_args : a dictionary of arguments to pass to convert_func. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str: the output sequence file name. """ # Define subcommand label dictionary cmd_dict = {convertGenericHeader: 'generic', convert454Header: '454', convertGenbankHeader: 'genbank', convertIlluminaHeader: 'illumina', convertIMGTHeader: 'imgt', convertMIGECHeader: 'migec', convertSRAHeader: 'sra'} log = OrderedDict() log['START'] = 'ConvertHeaders' log['COMMAND'] = cmd_dict[convert_func] log['FILE'] = os.path.basename(seq_file) printLog(log) # Open input file in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type # Wrapper for opening handles and writers def _open(x, out_file=out_file): if out_file is not None and x == 'pass': handle = open(out_file, 'w') else: handle = getOutputHandle(seq_file, 'convert-%s' % x, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) return handle # Count records result_count = countSeqFile(seq_file) # Set additional conversion arguments if convert_func in [convertGenericHeader, convertGenbankHeader]: convert_args.update({'delimiter': out_args['delimiter']}) # Initialize file handles pass_handle, fail_handle = None, None # Iterate over sequences start_time = time() seq_count = pass_count = fail_count = 0 for seq in seq_iter: # Print progress for previous iteration and update count printProgress(seq_count, result_count, 0.05, start_time=start_time) seq_count += 1 # Convert header header = convert_func(seq.description, **convert_args) if header is not None: # Write successfully converted sequences pass_count += 1 seq.id = seq.name = flattenAnnotation(header, out_args['delimiter']) seq.description = '' try: SeqIO.write(seq, pass_handle, out_args['out_type']) except AttributeError: # Open pass file pass_handle = _open('pass') SeqIO.write(seq, pass_handle, out_args['out_type']) else: fail_count += 1 if out_args['failed']: # Write unconverted sequences try: SeqIO.write(seq, fail_handle, out_args['out_type']) except AttributeError: # Open fail file fail_handle = _open('fail') SeqIO.write(seq, fail_handle, out_args['out_type']) # Print counts printProgress(seq_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None log['SEQUENCES'] = seq_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ConvertHeaders' printLog(log) # Close file handles if pass_handle is not None: pass_handle.close() if fail_handle is not None: fail_handle.close() return pass_handle.name if pass_handle is not None else None
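# Usage sketch for the function above (hypothetical input file): convert raw Illumina headers
# to pRESTO-style annotated headers using one of the converters registered in cmd_dict.
def _example_convert_headers():
    out = convertHeaders('run.fastq', convert_func=convertIlluminaHeader)
    return out  # converted output file name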
def estimateBarcode(seq_file, barcode_field=default_barcode_field, distance_types=default_distance_types, out_args=default_out_args): """ Calculates error rates of barcode sequences Arguments: seq_file : the sample sequence file name. barcode_field : the annotation field containing barcode sequences. distance_types : distance types to include. out_args : common output argument dictionary from parseCommonArgs. Returns: tuple: names of the output files. """ # Function to extract the barcode annotation from a SeqRecord def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']): header = parseAnnotation(seq.description, delimiter=delimiter) return header[field] # Print parameter info log = OrderedDict() log['START'] = 'EstimateError' log['COMMAND'] = 'barcode' log['FILE'] = os.path.basename(seq_file) log['BARCODE_FIELD'] = barcode_field printLog(log) # Count records and build an iterator of barcode strings result_count = countSeqFile(seq_file) barcode_iter = (_barcode(x) for x in readSeqFile(seq_file)) # Set bin_count; defaults to the length of the first barcode + 1 bin_count = len(_barcode(next(readSeqFile(seq_file)))) + 1 mismatch = initializeMismatchDictionary(0, distance_types=distance_types, bin_count=bin_count) # Calculate distances distance_mismatch = calculateDistances(barcode_iter, bin_count=bin_count) mismatch['dist'] = { header: distance_mismatch[header] for header in distance_types } # Generate the distance data frame dist_df = pd.DataFrame.from_dict(mismatch['dist']) dist_df.index = dist_df.index / len(dist_df.index) dist_df[['all']] = dist_df[['all']].astype(int) # Find the threshold (average minimum over the lower 75% of distances) dist = mismatch['dist']['all'] thresh_df = pd.DataFrame.from_dict({'thresh': {'ALL': dist_df.index[int(np.mean([index for index in np.argsort(dist[:int(len(dist)*0.75)]) if dist[index] == np.min(dist)]))]} }) file_args = { 'out_dir': out_args['out_dir'], 'out_name': out_args['out_name'], 'out_type': 'tab' } # Output as tsv with getOutputHandle(seq_file, 'distance-barcode', **file_args) as dist_handle, \ getOutputHandle(seq_file, 'threshold-barcode', **file_args) as thresh_handle: dist_df.to_csv(dist_handle, sep='\t', na_rep='NA', index=True, index_label='DISTANCE', columns=['all'], header=['ALL'], float_format='%.6f') thresh_df.to_csv(thresh_handle, sep='\t', na_rep='NA', index=True, index_label='TYPE', columns=['thresh'], header=['THRESHOLD'], float_format='%.6f') # Update log log['OUTPUT1'] = os.path.basename(dist_handle.name) log['OUTPUT2'] = os.path.basename(thresh_handle.name) log['SEQUENCES'] = result_count log['ALL_THRESHOLD'] = '%.6f' % thresh_df['thresh']['ALL'] log['END'] = 'EstimateError' printLog(log) return (dist_handle.name, thresh_handle.name)
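# Simplified worked sketch of the threshold pick in estimateBarcode: within the lower 75% of
# the distance histogram, average the indices tied for the global minimum count, then map that
# index back to a normalized distance. The counts below are made-up toy data.
def _example_barcode_threshold():
    import numpy as np
    dist = np.array([0, 40, 5, 1, 1, 30, 80, 120])  # toy counts per distance bin
    lower = dist[:int(len(dist) * 0.75)]            # restrict the search to the lower 75%
    ties = [i for i in np.argsort(lower) if dist[i] == np.min(dist)]
    return int(np.mean(ties)) / len(dist)           # normalized threshold (0.0 for this toy)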
def assemblePairs(head_file, tail_file, assemble_func, assemble_args={}, coord_type=default_coord_type, rc=None, head_fields=None, tail_fields=None, out_args=default_out_args, nproc=None, queue_size=None): """ Assembles paired-end reads into full-length sequences Arguments: head_file = the head sequence file name tail_file = the tail sequence file name assemble_func = the function to use to assemble paired ends assemble_args = a dictionary of arguments to pass to the assembly function coord_type = the sequence header format rc = defines which sequences ('head', 'tail', 'both') to reverse complement before assembly; if None do not reverse complement sequences head_fields = list of annotations in head_file records to copy to assembled record; if None do not copy an annotation tail_fields = list of annotations in tail_file records to copy to assembled record; if None do not copy an annotation out_args = common output argument dictionary from parseCommonArgs nproc = the number of processQueue processes; if None defaults to the number of CPUs queue_size = maximum size of the argument queue; if None defaults to 2*nproc Returns: a list of successful output file names """ # Define subcommand label dictionary cmd_dict = {alignAssembly: 'align', joinSeqPair: 'join', referenceAssembly: 'reference'} # Print parameter info log = OrderedDict() log['START'] = 'AssemblePairs' log['COMMAND'] = cmd_dict.get(assemble_func, assemble_func.__name__) log['FILE1'] = os.path.basename(head_file) log['FILE2'] = os.path.basename(tail_file) log['COORD_TYPE'] = coord_type if 'ref_file' in assemble_args: log['REFFILE'] = assemble_args['ref_file'] if 'alpha' in assemble_args: log['ALPHA'] = assemble_args['alpha'] if 'max_error' in assemble_args: log['MAX_ERROR'] = assemble_args['max_error'] if 'min_len' in assemble_args: log['MIN_LEN'] = assemble_args['min_len'] if 'max_len' in assemble_args: log['MAX_LEN'] = assemble_args['max_len'] if 'scan_reverse' in assemble_args: log['SCAN_REVERSE'] = assemble_args['scan_reverse'] if 'gap' in assemble_args: log['GAP'] = assemble_args['gap'] if 'min_ident' in assemble_args: log['MIN_IDENT'] = assemble_args['min_ident'] if 'evalue' in assemble_args: log['EVALUE'] = assemble_args['evalue'] if 'max_hits' in assemble_args: log['MAX_HITS'] = assemble_args['max_hits'] if 'fill' in assemble_args: log['FILL'] = assemble_args['fill'] log['NPROC'] = nproc printLog(log) # Count input files head_count = countSeqFile(head_file) tail_count = countSeqFile(tail_file) if head_count != tail_count: sys.exit('Error: FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records' % (head_count, tail_count)) # Define feeder function and arguments feed_func = feedPairQueue feed_args = {'seq_file_1': head_file, 'seq_file_2': tail_file, 'coord_type': coord_type, 'delimiter': out_args['delimiter']} # Define worker function and arguments process_args = {'assemble_func': assemble_func, 'assemble_args': assemble_args, 'rc': rc, 'fields_1': head_fields, 'fields_2': tail_fields, 'delimiter': out_args['delimiter']} work_func = processSeqQueue work_args = {'process_func': processAssembly, 'process_args': process_args} # Define collector function and arguments collect_func = collectPairQueue collect_args = {'result_count': head_count, 'seq_file_1': head_file, 'seq_file_2': tail_file, 'out_args': out_args} # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Print log log = OrderedDict() log['OUTPUT'] = result['log'].pop('OUTPUT') for k, v in result['log'].items(): log[k] = v log['END'] = 'AssemblePairs' printLog(log) return result['out_files']
def assemblePairs(head_file, tail_file, assemble_func, assemble_args={}, coord_type=default_coord, rc='tail', head_fields=None, tail_fields=None, out_file=None, out_args=default_out_args, nproc=None, queue_size=None): """ Assembles paired-end reads into full-length sequences Arguments: head_file : the head sequence file name tail_file : the tail sequence file name assemble_func : the function to use to assemble paired ends assemble_args : a dictionary of arguments to pass to the assembly function coord_type : the sequence header format rc : defines which sequences ('head', 'tail', 'both', 'none') to reverse complement before assembly; if 'none' do not reverse complement sequences head_fields : list of annotations in head_file records to copy to assembled record; if None do not copy an annotation tail_fields : list of annotations in tail_file records to copy to assembled record; if None do not copy an annotation out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs nproc : the number of processQueue processes; if None defaults to the number of CPUs queue_size : maximum size of the argument queue; if None defaults to 2*nproc Returns: list: a list of successful output file names. """ # Define subcommand label dictionary cmd_dict = { alignAssembly: 'align', joinAssembly: 'join', referenceAssembly: 'reference', sequentialAssembly: 'sequential' } cmd_name = cmd_dict.get(assemble_func, assemble_func.__name__) # Print parameter info log = OrderedDict() log['START'] = 'AssemblePairs' log['COMMAND'] = cmd_name log['FILE1'] = os.path.basename(head_file) log['FILE2'] = os.path.basename(tail_file) log['COORD_TYPE'] = coord_type if 'ref_file' in assemble_args: log['REFFILE'] = assemble_args['ref_file'] if 'alpha' in assemble_args: log['ALPHA'] = assemble_args['alpha'] if 'max_error' in assemble_args: log['MAX_ERROR'] = assemble_args['max_error'] if 'min_len' in assemble_args: log['MIN_LEN'] = assemble_args['min_len'] if 'max_len' in assemble_args: log['MAX_LEN'] = assemble_args['max_len'] if 'scan_reverse' in assemble_args: log['SCAN_REVERSE'] = assemble_args['scan_reverse'] if 'gap' in assemble_args: log['GAP'] = assemble_args['gap'] if 'min_ident' in assemble_args: log['MIN_IDENT'] = assemble_args['min_ident'] if 'evalue' in assemble_args: log['EVALUE'] = assemble_args['evalue'] if 'max_hits' in assemble_args: log['MAX_HITS'] = assemble_args['max_hits'] if 'fill' in assemble_args: log['FILL'] = assemble_args['fill'] if 'aligner' in assemble_args: log['ALIGNER'] = assemble_args['aligner'] log['NPROC'] = nproc printLog(log) # Count input files head_count = countSeqFile(head_file) tail_count = countSeqFile(tail_file) if head_count != tail_count: printError('FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records.' % (head_count, tail_count)) # Setup for reference alignment if cmd_name in ('reference', 'sequential'): ref_file = assemble_args.pop('ref_file') db_exec = assemble_args.pop('db_exec') # Build reference sequence dictionary assemble_args['ref_dict'] = readReferenceFile(ref_file) # Build reference database files try: db_func = { 'blastn': makeBlastnDb, 'usearch': makeUBlastDb }[assemble_args['aligner']] ref_db, db_handle = db_func(ref_file, db_exec) assemble_args['ref_db'] = ref_db except: printError('Error building reference database for aligner %s with executable %s.' % (assemble_args['aligner'], db_exec)) # Define feeder function and arguments feed_func = feedPairQueue feed_args = { 'seq_file_1': head_file, 'seq_file_2': tail_file, 'coord_type': coord_type, 'delimiter': out_args['delimiter'] } # Define worker function and arguments process_args = { 'assemble_func': assemble_func, 'assemble_args': assemble_args, 'rc': rc, 'fields_1': head_fields, 'fields_2': tail_fields, 'delimiter': out_args['delimiter'] } work_func = processSeqQueue work_args = {'process_func': assemblyWorker, 'process_args': process_args} # Define collector function and arguments collect_func = collectPairQueue collect_args = { 'seq_file_1': head_file, 'seq_file_2': tail_file, 'label': 'assemble', 'out_file': out_file, 'out_args': out_args } # Call process manager result = manageProcesses(feed_func, work_func, collect_func, feed_args, work_args, collect_args, nproc, queue_size) # Close reference database handle if cmd_name in ('reference', 'sequential'): try: db_handle.close() except AttributeError: db_handle.cleanup() except: printError('Cannot close reference database file.') # Print log log = OrderedDict() log['OUTPUT'] = result['log'].pop('OUTPUT') for k, v in result['log'].items(): log[k] = v log['END'] = 'AssemblePairs' printLog(log) return result['out_files']
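# Usage sketch for the function above (hypothetical paired FASTQ files): de novo overlap
# assembly of mate pairs with the 'align' subcommand, reverse-complementing the tail reads
# before alignment (the default).
def _example_assemble():
    out = assemblePairs('reads_R1.fastq', 'reads_R2.fastq',
                        assemble_func=alignAssembly, rc='tail')
    return out  # list of output file names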
def clusterBarcodes(seq_file, ident=default_cluster_ident, length_ratio=default_length_ratio, barcode_field=default_barcode_field, cluster_field=default_cluster_field, cluster_prefix=default_cluster_prefix, cluster_tool=default_cluster_tool, cluster_exec=default_cluster_exec, out_file=None, out_args=default_out_args, nproc=None): """ Performs clustering on barcode sequences Arguments: seq_file : the sample sequence file name. ident : the identity threshold for clustering sequences. length_ratio : minimum short/long length ratio allowed within a cluster. barcode_field : the annotation field containing barcode sequences. cluster_field : the name of the output cluster field. cluster_prefix : string defining a prefix for the cluster identifier. cluster_tool : the clustering tool to use; one of cd-hit or usearch. cluster_exec : the path to the clustering tool executable. out_file : output file name. Automatically generated from the input file if None. out_args : output arguments. nproc : the number of processQueue processes; if None defaults to the number of CPUs. Returns: str: the clustered output file name """ # Function to modify SeqRecord header with cluster identifier def _header(seq, cluster, field=cluster_field, prefix=cluster_prefix, delimiter=out_args['delimiter']): label = '%s%i' % (prefix, cluster) header = parseAnnotation(seq.description, delimiter=delimiter) header = mergeAnnotation(header, {field: label}, delimiter=delimiter) seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter) seq.description = '' return seq # Function to make a barcode SeqRecord from a barcode annotation def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']): header = parseAnnotation(seq.description, delimiter=delimiter) return SeqRecord(Seq(header[field]), id=seq.id) # Print parameter info log = OrderedDict() log['START'] = 'ClusterSets' log['COMMAND'] = 'barcode' log['FILE'] = os.path.basename(seq_file) log['IDENTITY'] = ident log['BARCODE_FIELD'] = barcode_field log['CLUSTER_FIELD'] = cluster_field log['CLUSTER_PREFIX'] = cluster_prefix log['CLUSTER_TOOL'] = cluster_tool log['NPROC'] = nproc printLog(log) # Set cluster tool try: cluster_func = map_cluster_tool[cluster_tool] except KeyError: printError('Invalid clustering tool %s.' % cluster_tool) # Check the minimum identity if ident < min_cluster_ident[cluster_tool]: printError('Minimum identity %s too low for clustering tool %s.' % (str(ident), cluster_tool)) # Count records and build an iterator of barcode SeqRecords result_count = countSeqFile(seq_file) barcode_iter = (_barcode(x) for x in readSeqFile(seq_file)) # Perform clustering start_time = time() printMessage('Running %s' % cluster_tool, start_time=start_time, width=25) cluster_dict = cluster_func(barcode_iter, ident=ident, length_ratio=length_ratio, seq_start=0, seq_end=None, threads=nproc, cluster_exec=cluster_exec) printMessage('Done', start_time=start_time, end=True, width=25) # Determine file type if out_args['out_type'] is None: out_args['out_type'] = getFileType(seq_file) # Open output file handles if out_file is not None: pass_handle = open(out_file, 'w') else: pass_handle = getOutputHandle(seq_file, 'cluster-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Open indexed sequence file seq_dict = readSeqFile(seq_file, index=True) # Iterate over sequence records and update header with cluster annotation start_time = time() rec_count = pass_count = 0 for cluster, id_list in cluster_dict.items(): printProgress(rec_count, result_count, 0.05, start_time=start_time) rec_count += len(id_list) # TODO: make a generator. Figure out how to get pass_count updated # Define output sequences seq_output = [_header(seq_dict[x], cluster) for x in id_list] # Write output pass_count += len(seq_output) SeqIO.write(seq_output, pass_handle, out_args['out_type']) # Update progress printProgress(rec_count, result_count, 0.05, start_time=start_time) # Print log log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['CLUSTERS'] = len(cluster_dict) log['SEQUENCES'] = result_count log['PASS'] = pass_count log['FAIL'] = rec_count - pass_count log['END'] = 'ClusterSets' printLog(log) # Close handles pass_handle.close() return pass_handle.name
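# Usage sketch for the function above (hypothetical input; assumes the default clustering
# tool configured above is installed): cluster UMI barcodes at 90% identity and annotate
# each record with its cluster identifier.
def _example_cluster_barcodes():
    out = clusterBarcodes('umi_reads.fastq', ident=0.9)
    return out  # clustered output file name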
def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True, extended=False, format=default_format, out_file=None, out_args=default_out_args): """ Main for iHMMuneAlign aligned sample sequences. Arguments: aligner_file : iHMMune-Align output file to process. seq_file : fasta file input to iHMMuneAlign (from which to get sequence). repo : folder with germline repertoire files. partial : If True put incomplete alignments in the pass file. asis_id : if ID is to be parsed for pRESTO output with default delimiters. extended : if True parse alignment scores, FWR and CDR region fields. format : output format. One of 'changeo' or 'airr'. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: dict : names of the 'pass' and 'fail' output files. """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['COMMAND'] = 'ihmm' log['ALIGNER_FILE'] = os.path.basename(aligner_file) log['SEQ_FILE'] = os.path.basename(seq_file) log['ASIS_ID'] = asis_id log['PARTIAL'] = partial log['EXTENDED'] = extended printLog(log) start_time = time() printMessage('Loading files', start_time=start_time, width=20) # Count records in sequence file total_count = countSeqFile(seq_file) # Get input sequence dictionary seq_dict = getSeqDict(seq_file) # Create germline repo dictionary references = readGermlines(repo) # Load supplementary annotation table if cellranger_file is not None: f = cellranger_extended if extended else cellranger_base annotations = readCellRanger(cellranger_file, fields=f) else: annotations = None printMessage('Done', start_time=start_time, end=True, width=20) # Check for IMGT-gaps in germlines if all('...' not in x for x in references.values()): printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') # Define format operators try: __, writer, schema = getFormatOperators(format) except ValueError: printError('Invalid format %s.' % format) out_args['out_type'] = schema.out_type # Define output fields fields = list(schema.required) if extended: custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema) fields.extend(custom) # Parse and write output with open(aligner_file, 'r') as f: parse_iter = IHMMuneReader(f, seq_dict, references) germ_iter = (addGermline(x, references) for x in parse_iter) output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, annotations=annotations, asis_id=asis_id, partial=partial, writer=writer, out_file=out_file, out_args=out_args) return output
def tableHeaders(seq_file, fields, out_args=default_out_args): """ Builds a table of sequence header annotations Arguments: seq_file = the sequence file name fields = the list of fields to output out_args = common output argument dictionary from parseCommonArgs Returns: the output table file name """ log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = 'table' log['FILE'] = os.path.basename(seq_file) printLog(log) # Open file handles seq_iter = readSeqFile(seq_file) out_handle = getOutputHandle(seq_file, out_label='headers', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') # Count records result_count = countSeqFile(seq_file) # Open csv writer and write header out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', delimiter='\t', fieldnames=fields) out_writer.writeheader() # Iterate over sequences start_time = time() seq_count = pass_count = fail_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time) # Get annotations seq_count += 1 ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter']) # Write records if ann: pass_count += 1 out_writer.writerow(ann) else: fail_count += 1 # Print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args): """ Builds a table of sequence header annotations Arguments: seq_file : the sequence file name. fields : the list of fields to output. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str: output table file name """ log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = 'table' log['FILE'] = os.path.basename(seq_file) printLog(log) # Open file handles seq_iter = readSeqFile(seq_file) if out_file is not None: out_handle = open(out_file, 'w') else: out_handle = getOutputHandle(seq_file, 'headers', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') # Count records result_count = countSeqFile(seq_file) # Open csv writer and write header out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', delimiter='\t', fieldnames=fields) out_writer.writeheader() # Iterate over sequences start_time = time() seq_count = pass_count = fail_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time=start_time) # Get annotations seq_count += 1 ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter']) # Write records if ann: pass_count += 1 out_writer.writerow(ann) else: fail_count += 1 # Print counts printProgress(seq_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
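# Usage sketch for the function above (hypothetical input file and annotation field names):
# tabulate selected header annotations into a tab-delimited file, one row per record.
def _example_table_headers():
    out = tableHeaders('annotated.fastq', fields=['ID', 'BARCODE'])
    return out  # table file name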
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args): """ Modifies sequence headers Arguments: seq_file : the sequence file name. modify_func : the function defining the modification operation. modify_args : a dictionary of arguments to pass to modify_func. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str: output file name. """ # Define subcommand label dictionary cmd_dict = {addHeader: 'add', copyHeader: 'copy', collapseHeader: 'collapse', deleteHeader: 'delete', expandHeader: 'expand', renameHeader: 'rename'} # Print parameter info log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__) log['FILE'] = os.path.basename(seq_file) for k in sorted(modify_args): v = modify_args[k] log[k.upper()] = ','.join(v) if isinstance(v, list) else v printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type if out_file is not None: out_handle = open(out_file, 'w') else: out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Count records result_count = countSeqFile(seq_file) # Iterate over sequences start_time = time() seq_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time=start_time) #Update counts seq_count += 1 # Modify header header = parseAnnotation(seq.description, delimiter=out_args['delimiter']) header = modify_func(header, delimiter=out_args['delimiter'], **modify_args) # Write new sequence seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter']) seq.description = '' SeqIO.write(seq, out_handle, out_args['out_type']) # print counts printProgress(seq_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
def convertHeaders(seq_file, convert_func, convert_args={}, out_args=default_out_args): """ Converts sequence headers to the pRESTO format Arguments: seq_file = the sequence file name convert_func = the function used to convert sequence headers convert_args = a dictionary of arguments to pass to convert_func out_args = common output argument dictionary from parseCommonArgs Returns: the output sequence file name """ # Define subcommand label dictionary cmd_dict = {convertGenericHeader: 'generic', convert454Header: '454', convertGenbankHeader: 'genbank', convertIlluminaHeader: 'illumina', convertIMGTHeader: 'imgt', convertSRAHeader: 'sra'} log = OrderedDict() log['START'] = 'ConvertHeaders' log['COMMAND'] = cmd_dict[convert_func] log['FILE'] = os.path.basename(seq_file) printLog(log) # Open input file in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type # Count records result_count = countSeqFile(seq_file) # Open output file handles pass_handle = getOutputHandle(seq_file, 'convert-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) if out_args['failed']: fail_handle = getOutputHandle(seq_file, 'convert-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) else: fail_handle = None # Set additional conversion arguments if convert_func in [convertGenericHeader, convertGenbankHeader]: convert_args.update({'delimiter': out_args['delimiter']}) # Iterate over sequences start_time = time() seq_count = pass_count = fail_count = 0 for seq in seq_iter: # Print progress for previous iteration and update count printProgress(seq_count, result_count, 0.05, start_time) seq_count += 1 # Convert header header = convert_func(seq.description, **convert_args) if header is not None: # Write successfully converted sequences pass_count += 1 seq.id = seq.name = flattenAnnotation(header, out_args['delimiter']) seq.description = '' SeqIO.write(seq, pass_handle, out_args['out_type']) else: fail_count += 1 if fail_handle is not None: # Write unconverted sequences SeqIO.write(seq, fail_handle, out_args['out_type']) # Print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['SEQUENCES'] = seq_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ConvertHeaders' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() return pass_handle.name
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None, action=None, coord_type=default_coord, out_args=default_out_args): """ Synchronizes paired-end files and copies annotations between them Arguments: seq_file_1 : the file containing the grouped sequences and annotations. seq_file_2 : the file to assign annotations to from seq_file_1. fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records; if None do not copy any annotations. fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records; if None do not copy any annotations. action : the collapse action to take on all copied annotations if they already exist in the target header. coord_type : the sequence header format. out_args : common output argument dictionary from parseCommonArgs. Returns: list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2). """ # Define private functions def _key_func(x): return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter']) log = OrderedDict() log['START'] = 'PairSeq' log['FILE1'] = os.path.basename(seq_file_1) log['FILE2'] = os.path.basename(seq_file_2) log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None log['COORD_TYPE'] = coord_type printLog(log) # Define output type if out_args['out_type'] is None: out_type_1 = getFileType(seq_file_1) out_type_2 = getFileType(seq_file_2) else: out_type_1 = out_type_2 = out_args['out_type'] # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Open and count files start_time = time() printMessage("Indexing files", start_time=start_time) # Index file 1 seq_count_1 = countSeqFile(seq_file_1) seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func) # Define file 2 iterator seq_count_2 = countSeqFile(seq_file_2) seq_iter_2 = readSeqFile(seq_file_2, index=False) printMessage("Done", start_time=start_time, end=True) # Open output file handles pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) if out_args['failed']: fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) pass_keys = list() # Iterate over pairs and write to output files start_time = time() rec_count = pair_count = 0 for seq_2 in seq_iter_2: # Print progress for previous iteration printProgress(rec_count, seq_count_2, 0.05, start_time=start_time) rec_count += 1 # Check for file 2 mate pair in file 1 coord_2 = getCoordKey(seq_2.id, coord_type=coord_type, delimiter=out_args['delimiter']) seq_1 = seq_dict_1.get(coord_2, None) if seq_1 is not None: # Record paired keys pair_count += 1 if fields_1 is not None or fields_2 is not None: ann_1 = parseAnnotation(seq_1.description, delimiter=out_args['delimiter']) ann_2 = parseAnnotation(seq_2.description, delimiter=out_args['delimiter']) # Prepend annotations from seq_1 to seq_2 if fields_1 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() if k in fields_1]) merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True, delimiter=out_args['delimiter']) # Collapse if necessary if action is not None: merge_ann = collapseAnnotation(merge_ann, action, fields=fields_1, delimiter=out_args['delimiter']) # Flatten seq_2.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_2.description = '' # Append annotations from seq_2 to seq_1 if fields_2 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() if k in fields_2]) merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False, delimiter=out_args['delimiter']) # Collapse if necessary if action is not None: merge_ann = collapseAnnotation(merge_ann, action, fields=fields_2, delimiter=out_args['delimiter']) # Flatten seq_1.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_1.description = '' # Write paired records SeqIO.write(seq_1, pass_handle_1, out_type_1) SeqIO.write(seq_2, pass_handle_2, out_type_2) # Write unpaired file 2 records and update paired key list for finding unpaired file 1 records if out_args['failed']: if seq_1 is not None: pass_keys.append(coord_2) else: SeqIO.write(seq_2, fail_handle_2, out_type_2) # Print final progress printProgress(rec_count, seq_count_2, 0.05, start_time=start_time) # Find and write unpaired file 1 records if out_args['failed']: start_time = time() printMessage("Finding unpaired", start_time=start_time) # Find file 1 unpaired keys pass_keys = set(pass_keys) unpaired = set(seq_dict_1).difference(pass_keys) # Write unpaired file 1 records for k in unpaired: SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1) printMessage("Done", start_time=start_time, end=True) # Print log log = OrderedDict() log['OUTPUT1'] = os.path.basename(pass_handle_1.name) log['OUTPUT2'] = os.path.basename(pass_handle_2.name) log['SEQUENCES1'] = seq_count_1 log['SEQUENCES2'] = seq_count_2 log['PASS'] = pair_count log['END'] = 'PairSeq' printLog(log) # Close file handles pass_handle_1.close() pass_handle_2.close() return [(pass_handle_1.name, pass_handle_2.name)]
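# Usage sketch for the function above (hypothetical inputs; assumes 'set' is among the
# collapse actions supported by collapseAnnotation): synchronize mate files and copy the
# BARCODE annotation from file 1 onto the file 2 records.
def _example_pair_seq():
    out = pairSeq('reads_R1.fastq', 'reads_R2.fastq',
                  fields_1=['BARCODE'], action='set')
    return out  # [(pass file 1 name, pass file 2 name)]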