def setUp(self):
    """
    Prepares the fixture used by the test methods

    Prints the name of the test being run, sets the pandas display width,
    loads the head, tail and reference records (each passed through upper())
    and stores the start time in self.start.
    """
    print('-> %s()' % self._testMethodName)

    # Set pandas options
    pd.set_option('display.width', 200)

    # Load head and tail records into lists
    heads = []
    for rec in readSeqFile(head_file):
        heads.append(rec.upper())
    self.head_list = heads

    tails = []
    for rec in readSeqFile(tail_file):
        tails.append(rec.upper())
    self.tail_list = tails

    # Load reference records keyed by record identifier
    self.ref_dict = dict((rec.id, rec.upper()) for rec in readSeqFile(ref_file))

    # Timestamp taken after all data is loaded
    self.start = time.time()
def _read_pairs(seq_file_1, seq_file_2):
    """
    Yields coordinate-matched record pairs from two sequence files

    Arguments:
      seq_file_1 : first sequence file
      seq_file_2 : second sequence file

    Yields:
      tuple : (coordinate key, [record from file 1, record from file 2])

    Raises:
      Exception : if the coordinate keys of a pair of records differ
    """
    # Coordinate key of a record; coord_type and delimiter come from the enclosing scope
    def _coord(rec):
        return getCoordKey(rec.description, coord_type=coord_type,
                           delimiter=delimiter)

    records_1 = readSeqFile(seq_file_1, index=False)
    records_2 = readSeqFile(seq_file_2, index=False)

    # Walk both files in lockstep
    for first, second in zip(records_1, records_2):
        coord_1 = _coord(first)
        coord_2 = _coord(second)
        if coord_1 == coord_2:
            yield (coord_1, [first, second])
            continue

        raise Exception('Coordinates for sequences %s and %s do not match' \
                        % (coord_1, coord_2))
def feedSeqQueue(alive, data_queue, seq_file, index_func=None, index_args=None):
    """
    Feeds the data queue with SeqRecord objects

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      data_queue : multiprocessing.Queue to hold data for processing
      seq_file : Sequence file to read input from
      index_func : Function to use to define sequence sets;
                   if None do not index sets and feed individual records
      index_args : Dictionary of arguments to pass to index_func;
                   None is treated as an empty dictionary

    Returns:
      None

    Raises:
      Any exception raised while reading, indexing or feeding is re-raised
      after alive.value has been set to False.
    """
    # A None sentinel replaces the former shared mutable default dictionary
    if index_args is None:
        index_args = {}

    try:
        # Read input file and index sequence sets if required
        if index_func is None:
            seq_iter = readSeqFile(seq_file)
            data_iter = ((s.id, s) for s in seq_iter)
        else:
            seq_dict = readSeqFile(seq_file, index=True)
            index_dict = index_func(seq_dict, **index_args)
            data_iter = ((k, [seq_dict[i] for i in v]) \
                         for k, v in index_dict.items())
    except:
        # Flag the failure through the shared value before propagating it
        alive.value = False
        raise

    try:
        # Iterate over data_iter and feed data queue
        while alive.value:
            # Poll again while the queue has no free slot
            if data_queue.full():
                continue

            # Exit upon reaching end of iterator
            data = next(data_iter, None)
            if data is None:
                break

            # Feed queue
            data_queue.put(SeqData(*data))
        else:
            # Loop ended because alive.value became False rather than by break
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        raise

    return None
def _read_pairs(seq_file_1, seq_file_2):
    """
    Yields paired records from two sequence files read in lockstep

    A pair is accepted when both coordinate keys are equal, or without
    comparison when coord_type is 'r1'. coord_type and delimiter are taken
    from the enclosing scope.

    Arguments:
      seq_file_1 : first sequence file
      seq_file_2 : second sequence file

    Yields:
      tuple : (coordinate key of the first record, [record 1, record 2])

    Raises:
      Exception : if the coordinate keys differ and coord_type is not 'r1'
    """
    iter_1 = readSeqFile(seq_file_1, index=False)
    iter_2 = readSeqFile(seq_file_2, index=False)

    for seq_1, seq_2 in zip(iter_1, iter_2):
        key_1 = getCoordKey(seq_1.description, coord_type=coord_type,
                            delimiter=delimiter)
        key_2 = getCoordKey(seq_2.description, coord_type=coord_type,
                            delimiter=delimiter)

        if key_1 == key_2 or coord_type == 'r1':
            yield (key_1, [seq_1, seq_2])
        else:
            # Fail immediately; the interactive pdb breakpoint that used to sit
            # here would stall any non-interactive run
            raise Exception('Coordinates for sequences %s and %s do not match' \
                            % (key_1, key_2))
def getSeqDict(seq_file):
    """
    Builds a description-keyed dictionary of the records in a sequence file.

    Arguments:
      seq_file : sequence file.

    Returns:
      dict : Bio.SeqRecords keyed by their sequence description.
    """
    # Key function handed to SeqIO.to_dict
    def _by_description(rec):
        return rec.description

    records = readSeqFile(seq_file)
    return SeqIO.to_dict(records, key_function=_by_description)
def getIDforIMGT(seq_file):
    """
    Create a sequence ID translation using IMGT truncation.

    Descriptions of at most 50 characters are used unchanged as the key.
    Longer descriptions are cut to their first 50 characters and each
    occurrence of |, whitespace, !, &, *, <, > or ? is replaced by an
    underscore.

    Arguments:
      seq_file : a fasta file of sequences input to IMGT.

    Returns:
      dict : a dictionary with the IMGT truncated ID as the key and the
             full sequence description as the value.
    """
    ids = {}
    for rec in readSeqFile(seq_file):
        if len(rec.description) <= 50:
            id_key = rec.description
        else:
            # Raw string: the pattern is unchanged, but the backslashes no longer
            # form invalid string escape sequences
            id_key = re.sub(r'\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
        ids[id_key] = rec.description

    return ids
def groupSeqFile(seq_file, field, threshold=None, out_args=default_out_args):
    """
    Divides a sequence file into segments by description tags

    Arguments:
      seq_file : filename of the sequence file to split
      field : The annotation field to split seq_file by
      threshold : The numerical threshold for group sequences by;
                  if None treat field as textual
      out_args : common output argument dictionary from parseCommonArgs;
                 read only, it is not modified by this function

    Returns:
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'group'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['THRESHOLD'] = threshold
    printLog(log)

    # Open input and resolve the output type locally; writing it back into
    # out_args would alter the shared default dictionary for later calls
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']

    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    # Process sequences
    start_time = time()
    seq_count = 0

    if threshold is None:
        # Sort records into files based on textual field
        # Create set of unique field tags from a second pass over the file
        tag_list = getAnnotationValues(readSeqFile(seq_file), field, unique=True,
                                       delimiter=out_args['delimiter'])

        if sys.platform != 'win32':
            import resource
            # Increase open file handle limit if needed
            file_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            file_count = len(tag_list) + 256
            if file_limit < file_count and file_count <= 8192:
                resource.setrlimit(resource.RLIMIT_NOFILE, (file_count, file_count))
            elif file_count > 8192:
                e = ('OS file limit would need to be set to %i. '
                     'If you are sure you want to do this, then increase the '
                     'file limit in the OS (via ulimit) and rerun this tool. \n') % file_count
                printError(dedent(e))

        # One output file per tag, labelled field-tag
        out_labels = [(tag, '%s-%s' % (field, tag)) for tag in tag_list]

        # Textual grouping: the annotation value is the group key
        def _group(value):
            return value
    else:
        # Sort records into files based on numeric threshold
        threshold = float(threshold)
        out_labels = [('under', 'under-%.1g' % threshold),
                      ('atleast', 'atleast-%.1g' % threshold)]

        # Numeric grouping: compare the annotation value against the threshold
        def _group(value):
            return 'under' if float(value) < threshold else 'atleast'

    handles_dict = {}
    try:
        # Create output handles
        for key, label in out_labels:
            handles_dict[key] = getOutputHandle(seq_file, label,
                                                out_dir=out_args['out_dir'],
                                                out_name=out_args['out_name'],
                                                out_type=out_type)

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequence to the handle of its group
            value = parseAnnotation(seq.description,
                                    delimiter=out_args['delimiter'])[field]
            SeqIO.write(seq, handles_dict[_group(value)], out_type)

        # Print log
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        log = OrderedDict()
        for i, k in enumerate(handles_dict):
            log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
        log['SEQUENCES'] = rec_count
        log['PARTS'] = len(handles_dict)
        log['END'] = 'SplitSeq'
        printLog(log)
    finally:
        # Close output file handles, including after a failure part-way through
        for handle in handles_dict.values():
            handle.close()

    return [handles_dict[k].name for k in handles_dict]
def tableHeaders(seq_file, fields, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Writes a tab-delimited file with one column per requested field and one
    row per record whose parsed annotation is non-empty. Records with an
    empty parsed annotation are counted as failures and not written.

    Arguments:
      seq_file = the sequence file name
      fields = the list of fields to output
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output table file name
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open file handles
    seq_iter = readSeqFile(seq_file)
    out_handle = getOutputHandle(seq_file, out_label='headers',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')

    try:
        # Count records
        result_count = countSeqFile(seq_file)

        # Open csv writer and write header; fields not requested are ignored
        # and missing fields are written as empty strings
        out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='',
                                    delimiter='\t', fieldnames=fields)
        out_writer.writeheader()

        # Iterate over sequences
        start_time = time()
        seq_count = pass_count = fail_count = 0
        for seq in seq_iter:
            # Print progress for previous iteration
            printProgress(seq_count, result_count, 0.05, start_time)

            # Get annotations
            seq_count += 1
            ann = parseAnnotation(seq.description, fields,
                                  delimiter=out_args['delimiter'])

            # Write records
            if ann:
                pass_count += 1
                out_writer.writerow(ann)
            else:
                fail_count += 1

        # Print counts
        printProgress(seq_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(out_handle.name)
        log['SEQUENCES'] = seq_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'ParseHeaders'
        printLog(log)
    finally:
        # Close the output handle even when parsing or writing fails
        out_handle.close()

    return out_handle.name
def selectSeqFile(seq_file, field, value_list=None, value_file=None, negate=False,
                  out_file=None, out_args=default_out_args):
    """
    Select from a sequence file

    A record is written when its annotation for field, split on the third
    element of out_args['delimiter'], shares at least one value with the
    requested values; negate inverts that decision.

    Arguments:
      seq_file : filename of the sequence file to sample from.
      field : the annotation field to check for required values.
      value_list : a list of annotation values that a sample must contain one of.
      value_file : a tab delimited file containing values to select.
      negate : if True select entries that do not contain the specific values.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs;
                 read only, it is not modified by this function.

    Returns:
      str: output file name.
    """
    # Reads the column named field from the tab-delimited value_file
    def _read_file(value_file, field):
        field_list = []
        try:
            with open(value_file, 'rt') as handle:
                reader_dict = csv.DictReader(handle, dialect='excel-tab')
                for row in reader_dict:
                    field_list.append(row[field])
        except IOError:
            printError('File %s cannot be read.' % value_file)
        except Exception:
            # e.g. the requested column is missing from the file
            printError('File %s is invalid.' % value_file)

        return field_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    if value_list is not None:
        log['VALUE_LIST'] = ','.join([str(x) for x in value_list])
    if value_file is not None:
        log['VALUE_FILE'] = os.path.basename(value_file)
    log['NOT'] = negate
    printLog(log)

    # Read value_file
    if value_list is not None and value_file is not None:
        printError('Specify only one of value_list and value_file.')
    elif value_file is not None:
        value_list = _read_file(value_file, field)

    # Read sequence file and resolve the output type locally; writing it back
    # into out_args would alter the shared default dictionary for later calls
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']

    # Open output handle
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file, 'selected',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_type)

    try:
        # Generate subset of records
        start_time = time()
        pass_count, fail_count, rec_count = 0, 0, 0
        value_set = set(value_list)
        for rec in seq_iter:
            printCount(rec_count, 1e5, start_time=start_time)
            rec_count += 1

            # Parse annotations into a list of values
            ann = parseAnnotation(rec.description,
                                  delimiter=out_args['delimiter'])[field]
            ann = ann.split(out_args['delimiter'][2])

            # Write when the match result differs from negate
            if xor(negate, not value_set.isdisjoint(ann)):
                SeqIO.write(rec, out_handle, out_type)
                pass_count += 1
            else:
                fail_count += 1

        printCount(rec_count, 1e5, start_time=start_time, end=True)

        # Print log
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(out_handle.name)
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'SplitSeq'
        printLog(log)
    finally:
        # The handle was previously left open; close it so output is flushed
        out_handle.close()

    return out_handle.name
def samplePairSeqFile(seq_file_1, seq_file_2, max_count, field=None, values=None,
                      coord_type=default_coord, out_args=default_out_args):
    """
    Samples from paired-end sequence files

    Only coordinate keys present in both files are eligible. When field is
    given without values, up to n keys are drawn from every annotation group
    present in both files; otherwise up to n keys are drawn overall.

    Arguments:
      seq_file_1 : filename of the first paired-end sequence file
      seq_file_2 : filename of the second paired-end sequence file
      max_count : a list of the maximum number of sequences to sample
      field : the annotation field to check for required values
      values : a list of annotation values that a sample must contain one of
      coord_type : the sequence header format
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      list: seq_file_1 and seq_file_2 output file names
    """
    # Sequence index key function
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    # Draws at most n keys found in both collections. random.sample needs a
    # sequence (sets raise TypeError on Python 3.11+), hence the list
    def _sample_shared(n, keys_1, keys_2):
        shared = list(set(keys_1).intersection(keys_2))
        return random.sample(shared, min(n, len(shared)))

    # Function to sample from two lists of sequence indices
    def _sample_list(n, index_1, index_2):
        return _sample_shared(n, index_1, index_2)

    # Function to sample from two dictionaries of grouped sequence indices
    def _sample_dict(n, index_1, index_2):
        sample_list = []
        for k in set(index_1.keys()).intersection(index_2.keys()):
            sample_list.extend(_sample_shared(n, index_1[k], index_2[k]))
        return sample_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'samplepair'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['MAX_COUNTS'] = ','.join([str(x) for x in max_count])
    log['FIELD'] = field
    log['VALUES'] = ','.join(values) if values else None
    printLog(log)

    # Define output type
    in_type_1 = getFileType(seq_file_1)
    in_type_2 = getFileType(seq_file_2)
    if out_args['out_type'] is None:
        out_type_1 = in_type_1
        out_type_2 = in_type_2
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Index input files
    start_time = time()
    printMessage('Reading files', start_time=start_time, width=25)

    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    seq_dict_2 = readSeqFile(seq_file_2, index=True, key_func=_key_func)

    # Subset keys to those meeting field/value criteria
    if field is not None and values is not None:
        _sample = _sample_list
        printMessage('Subsetting by annotation', start_time=start_time, width=25)
        seq_index_1 = subsetSeqIndex(seq_dict_1, field, values,
                                     delimiter=out_args['delimiter'])
        seq_index_2 = subsetSeqIndex(seq_dict_2, field, values,
                                     delimiter=out_args['delimiter'])
    elif field is not None and values is None:
        _sample = _sample_dict
        printMessage('Indexing by annotation', start_time=start_time, width=25)
        seq_index_1 = indexSeqSets(seq_dict_1, field,
                                   delimiter=out_args['delimiter'])
        seq_index_2 = indexSeqSets(seq_dict_2, field,
                                   delimiter=out_args['delimiter'])
    else:
        _sample = _sample_list
        seq_index_1 = list(seq_dict_1.keys())
        seq_index_2 = list(seq_dict_2.keys())

    printMessage('Done', start_time=start_time, end=True, width=25)

    # Generate sample set for each value in max_count
    out_files = []
    for i, n in enumerate(max_count):
        start_time = time()
        printMessage('Sampling n=%i' % n, start_time=start_time, width=25)

        # Sample
        sample_keys = _sample(n, seq_index_1, seq_index_2)
        sample_count = len(sample_keys)

        # Open file handles; the label carries the number actually sampled
        out_handle_1 = getOutputHandle(seq_file_1,
                                       'sample%i-n%i' % (i + 1, sample_count),
                                       out_dir=out_args['out_dir'],
                                       out_name=out_name_1,
                                       out_type=out_type_1)
        out_handle_2 = getOutputHandle(seq_file_2,
                                       'sample%i-n%i' % (i + 1, sample_count),
                                       out_dir=out_args['out_dir'],
                                       out_name=out_name_2,
                                       out_type=out_type_2)
        out_files.append((out_handle_1.name, out_handle_2.name))

        # Write both mates of every sampled key
        for k in sample_keys:
            SeqIO.write(seq_dict_1[k], out_handle_1, out_type_1)
            SeqIO.write(seq_dict_2[k], out_handle_2, out_type_2)

        printMessage('Done', start_time=start_time, end=True, width=25)

        # Print log for iteration
        log = OrderedDict()
        log['MAX_COUNT'] = n
        log['SAMPLED'] = sample_count
        log['OUTPUT1'] = os.path.basename(out_files[i][0])
        log['OUTPUT2'] = os.path.basename(out_files[i][1])
        printLog(log)

        # Close file handles
        out_handle_1.close()
        out_handle_2.close()

    # Print log
    log = OrderedDict()
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_files
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None,
                inner=False, keep_missing=False, out_file=None,
                out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments:
      seq_file : filename of the sequence file to sample from.
      max_missing : number of ambiguous characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      copy_actions : the list of collapseAnnotation actions to take on copy_fields.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations and matching.
      keep_missing : if True retain sequences with more ambiguous characters than max_missing as unique.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs;
                 read only, it is not modified by this function.

    Returns:
      str: the collapsed output file name.
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)

    # Read input file and resolve the output type locally; writing it back
    # into out_args would alter the shared default dictionary for later calls
    in_type = getFileType(seq_file)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']
    delimiter = out_args['delimiter']

    # Count total sequences
    rec_count = len(seq_dict)

    # Open unique record output handle
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file, 'collapse-unique',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)

    log_handle = None
    try:
        # Define log handle
        if out_args['log_file'] is not None:
            log_handle = open(out_args['log_file'], 'w')

        # Find sequences with duplicates, allowing 0..max_missing ambiguous characters
        uniq_dict = {}
        search_keys = list(seq_dict.keys())
        dup_keys = []
        for n in range(0, max_missing + 1):
            # Find unique sequences
            uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys,
                                                             seq_dict, n, uniq_fields,
                                                             copy_fields, max_field,
                                                             min_field, inner, delimiter)
            # Update list of duplicates
            dup_keys.extend(dup_list)

            # Break if no keys to search remain
            if len(search_keys) == 0:
                break

        # Write unique sequences
        for val in uniq_dict.values():
            # Define output sequence
            out_seq = val.seq
            out_ann = parseAnnotation(out_seq.description, delimiter=delimiter)
            out_app = OrderedDict()
            if copy_fields is not None and copy_actions is not None:
                for f, a in zip(copy_fields, copy_actions):
                    x = collapseAnnotation(val.annotations, a, f, delimiter=delimiter)
                    out_app[f] = x[f]
                    # Drop the original value so the collapsed one replaces it
                    out_ann.pop(f, None)
            out_app['DUPCOUNT'] = val.count
            out_ann = mergeAnnotation(out_ann, out_app, delimiter=delimiter)
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=delimiter)
            out_seq.description = ''

            # Write unique sequence
            SeqIO.write(out_seq, pass_handle, out_type)

            # Update log with the member IDs followed by the member sequences
            log = OrderedDict()
            log['HEADER'] = out_seq.id
            log['DUPCOUNT'] = val.count
            for i, k in enumerate(val.keys, start=1):
                log['ID%i' % i] = k
            for i, k in enumerate(val.keys, start=1):
                log['SEQ%i' % i] = str(seq_dict[k].seq)
            printLog(log, handle=log_handle)

        # Write sequence with high missing character counts as unique
        if keep_missing:
            for k in search_keys:
                out_seq = seq_dict[k]
                out_ann = parseAnnotation(out_seq.description, delimiter=delimiter)
                out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=delimiter)
                out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=delimiter)
                out_seq.description = ''
                SeqIO.write(out_seq, pass_handle, out_type)

        # Write sequence with high missing character counts to the failed file
        if out_args['failed'] and not keep_missing:
            with getOutputHandle(seq_file, 'collapse-undetermined',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_type) as missing_handle:
                for k in search_keys:
                    SeqIO.write(seq_dict[k], missing_handle, out_type)

        if out_args['failed']:
            # Write duplicate sequences
            with getOutputHandle(seq_file, 'collapse-duplicate',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_type) as dup_handle:
                for k in dup_keys:
                    SeqIO.write(seq_dict[k], dup_handle, out_type)

        # Print log
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['SEQUENCES'] = rec_count
        log['UNIQUE'] = len(uniq_dict)
        log['DUPLICATE'] = len(dup_keys)
        log['UNDETERMINED'] = len(search_keys)
        log['END'] = 'CollapseSeq'
        printLog(log)
    finally:
        # Close file handles, including after a failure part-way through
        pass_handle.close()
        if log_handle is not None:
            log_handle.close()

    return pass_handle.name
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None,
                inner=False, keep_missing=False, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Unique sequences are written to a 'collapse-unique' output file with a
    DUPCOUNT annotation. When out_args['failed'] is set, duplicates are
    written to 'collapse-duplicate' and, unless keep_missing is True,
    unresolved sequences are written to 'collapse-undetermined'.

    Arguments:
      seq_file = filename of the sequence file to sample from
      max_missing = number of ambiguous characters to allow in a unique sequence
      uniq_fields = a list of annotations that define a sequence as unique if they differ
      copy_fields = a list of annotations to copy into unique sequence annotations
      copy_actions = the list of collapseAnnotation actions to take on copy_fields
      max_field = a numeric field whose maximum value determines the retained sequence
      min_field = a numeric field whose minimum value determines the retained sequence
      inner = if True exclude consecutive outer ambiguous characters from iterations and matching
      keep_missing = if True retain sequences with more ambiguous characters than max_missing as unique
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the collapsed output file name
    """
    # Print the parameter log to the console
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)

    # TODO:  storing all sequences in memory is faster
    # Read input file; every record is loaded into an in-memory dictionary
    in_type = getFileType(seq_file)
    #seq_dict = readSeqFile(seq_file, index=True)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    # NOTE(review): this writes the detected type back into out_args, which is
    # the shared default_out_args dictionary when the caller passes nothing
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    # One pass per allowed count of missing characters, from 0 up to max_missing
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n,
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner,
                                                         out_args['delimiter'])

        # Update list of duplicates
        dup_keys.extend(dup_list)

        # Update log with the running totals after this pass
        log = OrderedDict()
        log['ITERATION'] = n + 1
        log['MISSING'] = n
        log['UNIQUE'] = len(uniq_dict)
        log['DUPLICATE'] = len(dup_keys)
        log['UNDETERMINED'] = len(search_keys)
        printLog(log, handle=log_handle)

        # Break if no keys to search remain
        if len(search_keys) == 0:  break

    # Write unique sequences
    with getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'],
                         out_name=out_args['out_name'], out_type=out_args['out_type']) \
            as uniq_handle:
        for val in uniq_dict.values():
            # Define output sequence
            # NOTE(review): val appears to be a sequence whose item 0 is the record,
            # item 1 the duplicate count and items 3+ the copy_fields values;
            # confirm against findUniqueSeq
            out_seq = val[0]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_app = OrderedDict()
            if copy_fields is not None and copy_actions is not None:
                for f, a, s in zip(copy_fields, copy_actions, val[3:]):
                    out_app[f] = s
                    out_app = collapseAnnotation(out_app, a, f, delimiter=out_args['delimiter'])
                    # Remove the field from the original annotation before merging
                    out_ann.pop(f, None)
            out_app['DUPCOUNT'] = val[1]
            out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            # Write unique sequence
            SeqIO.write(out_seq, uniq_handle, out_args['out_type'])

        # Write sequence with high missing character counts
        # (the keys still in search_keys are written with a DUPCOUNT of 1)
        if keep_missing:
            for k in search_keys:
                out_seq = seq_dict[k]
                out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
                out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
                out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
                out_seq.description = ''
                SeqIO.write(out_seq, uniq_handle, out_args['out_type'])

    # Write sequence with high missing character counts
    # (only when they were not already kept as unique above)
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(uniq_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)

    # Close file handles
    if log_handle is not None:  log_handle.close()

    return uniq_handle.name
args_dict['assemble_args'] = {'alpha': args_dict['alpha'], 'max_error': args_dict['max_error'], 'min_len': args_dict['min_len'], 'max_len': args_dict['max_len'], 'scan_reverse': args_dict['scan_reverse'], 'assembly_stats': AssemblyStats(args_dict['max_len'] + 1)} del args_dict['alpha'] del args_dict['max_error'] del args_dict['min_len'] del args_dict['max_len'] del args_dict['scan_reverse'] elif args_dict['assemble_func'] is joinSeqPair: args_dict['assemble_args'] = {'gap':args_dict['gap']} del args_dict['gap'] elif args_dict['assemble_func'] is referenceAssembly: ref_dict = {s.id:s.upper() for s in readSeqFile(args_dict['ref_file'])} #ref_file = makeUsearchDb(args_dict['ref_file'], args_dict['usearch_exec']) args_dict['assemble_args'] = {'ref_file': args_dict['ref_file'], 'ref_dict': ref_dict, 'min_ident': args_dict['min_ident'], 'evalue': args_dict['evalue'], 'max_hits': args_dict['max_hits'], 'fill': args_dict['fill'], 'usearch_exec': args_dict['usearch_exec']} del args_dict['ref_file'] del args_dict['min_ident'] del args_dict['evalue'] del args_dict['max_hits'] del args_dict['fill'] del args_dict['usearch_exec']
def clusterBarcodes(seq_file, ident=default_cluster_ident, length_ratio=default_length_ratio,
                    barcode_field=default_barcode_field, cluster_field=default_cluster_field,
                    cluster_prefix=default_cluster_prefix, cluster_tool=default_cluster_tool,
                    cluster_exec=default_cluster_exec, out_file=None,
                    out_args=default_out_args, nproc=None):
    """
    Performs clustering on sets of sequences

    The barcode annotation of every record is clustered as a sequence, always
    with seq_start=0 and seq_end=None, and each record is written with the
    cluster label added to its header.

    Arguments:
      seq_file : the sample sequence file name.
      ident : the identity threshold for clustering sequences.
      length_ratio : minimum short/long length ratio allowed within a cluster.
      barcode_field : the annotation field containing barcode sequences.
      cluster_field : the name of the output cluster field.
      cluster_prefix : string defining a prefix for the cluster identifier.
      cluster_tool : the clustering tool to use; must be a key of map_cluster_tool.
      cluster_exec : the path to the executable for the clustering tool.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : output arguments; read only, not modified by this function.
      nproc : passed to the clustering tool as its thread count.

    Returns:
      str: the clustered output file name
    """
    # Function to modify SeqRecord header with cluster identifier
    def _header(seq, cluster, field=cluster_field, prefix=cluster_prefix,
                delimiter=out_args['delimiter']):
        label = '%s%i' % (prefix, cluster)
        header = parseAnnotation(seq.description, delimiter=delimiter)
        header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
        seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
        seq.description = ''
        return seq

    # Function to make a SeqRecord object from a barcode annotation
    def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
        header = parseAnnotation(seq.description, delimiter=delimiter)
        return SeqRecord(Seq(header[field]), id=seq.id)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['COMMAND'] = 'barcode'
    log['FILE'] = os.path.basename(seq_file)
    log['IDENTITY'] = ident
    log['BARCODE_FIELD'] = barcode_field
    log['CLUSTER_FIELD'] = cluster_field
    log['CLUSTER_PREFIX'] = cluster_prefix
    log['CLUSTER_TOOL'] = cluster_tool
    log['NPROC'] = nproc
    printLog(log)

    # Set cluster tool. dict.get returns None for an unknown key instead of
    # raising, so the result is tested explicitly
    cluster_func = map_cluster_tool.get(cluster_tool)
    if cluster_func is None:
        printError('Invalid clustering tool %s.' % cluster_tool)

    # Check the minimum identity
    if ident < min_cluster_ident[cluster_tool]:
        printError('Minimum identity %s too low for clustering tool %s.' \
                   % (str(ident), cluster_tool))

    # Count sequence file and lazily convert records to barcode SeqRecords
    result_count = countSeqFile(seq_file)
    barcode_iter = (_barcode(x) for x in readSeqFile(seq_file))

    # Perform clustering
    start_time = time()
    printMessage('Running %s' % cluster_tool, start_time=start_time, width=25)
    cluster_dict = cluster_func(barcode_iter, ident=ident, length_ratio=length_ratio,
                                seq_start=0, seq_end=None,
                                threads=nproc, cluster_exec=cluster_exec)
    printMessage('Done', start_time=start_time, end=True, width=25)

    # Determine file type locally; writing it back into out_args would alter
    # the shared default dictionary for later calls
    out_type = out_args['out_type']
    if out_type is None:
        out_type = getFileType(seq_file)

    # Open output file handles
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file, 'cluster-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)

    try:
        # Open indexed sequence file
        seq_dict = readSeqFile(seq_file, index=True)

        # Iterate over clusters and update member headers with the cluster annotation
        start_time = time()
        rec_count = pass_count = 0
        for cluster, id_list in cluster_dict.items():
            printProgress(rec_count, result_count, 0.05, start_time=start_time)
            rec_count += len(id_list)

            # TODO:  make a generator. Figure out how to get pass_count updated
            # Define output sequences
            seq_output = [_header(seq_dict[x], cluster) for x in id_list]

            # Write output
            pass_count += len(seq_output)
            SeqIO.write(seq_output, pass_handle, out_type)

        # Update progress
        printProgress(rec_count, result_count, 0.05, start_time=start_time)

        # Print log
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLUSTERS'] = len(cluster_dict)
        log['SEQUENCES'] = result_count
        log['PASS'] = pass_count
        log['FAIL'] = rec_count - pass_count
        log['END'] = 'ClusterSets'
        printLog(log)
    finally:
        # Close handles, including after a failure part-way through
        pass_handle.close()

    return pass_handle.name
def sampleSeqFile(seq_file, max_count, field=None, values=None, out_args=default_out_args):
    """
    Draws random subsets of records from a sequence file

    Arguments:
      seq_file : name of the sequence file to sample from.
      max_count : list of sample sizes; one output file is written per entry.
      field : annotation field used to restrict or group the records;
              if None all records form a single sampling pool.
      values : list of annotation values passed to subsetSeqIndex together with field;
               if None while field is given, records are grouped with indexSeqSets
               and each group is sampled separately.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: output file names, one per entry of max_count.
    """
    # Pick at most n entries from a flat list of record keys
    def _pick(n, items):
        total = len(items)
        if n < total:
            positions = random.sample(range(total), n)
        else:
            # Fewer records than requested: keep them all in their original order
            positions = range(total)
        return [items[p] for p in positions]

    # Pick at most n entries from every group of a grouped key index
    def _pick_grouped(n, groups):
        picked = []
        for members in groups.values():
            picked.extend(_pick(n, members))
        return picked

    # Print console log
    log = OrderedDict([('START', 'SplitSeq'),
                       ('COMMAND', 'sample'),
                       ('FILE', os.path.basename(seq_file)),
                       ('MAX_COUNTS', ','.join(str(x) for x in max_count)),
                       ('FIELD', field),
                       ('VALUES', ','.join(values) if values else None)])
    printLog(log)

    # Read input file and resolve the output type
    start_time = time()
    printMessage('Reading files', start_time=start_time, width=25)
    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Build the pool of keys to sample from and choose the matching sampler
    if field is None:
        sampler = _pick
        seq_index = list(seq_dict)
    elif values is None:
        sampler = _pick_grouped
        printMessage('Indexing by annotation', start_time=start_time, width=25)
        seq_index = indexSeqSets(seq_dict, field, delimiter=out_args['delimiter'])
    else:
        sampler = _pick
        printMessage('Subsetting by annotation', start_time=start_time, width=25)
        seq_index = subsetSeqIndex(seq_dict, field, values, delimiter=out_args['delimiter'])
    printMessage('Done', start_time=start_time, end=True, width=25)

    # Write one sampled file per requested size
    out_files = []
    for part, n in enumerate(max_count, start=1):
        start_time = time()
        printMessage('Sampling n=%i' % n, start_time=start_time, width=25)

        # Sample from records
        sample_keys = sampler(n, seq_index)
        sample_count = len(sample_keys)

        # The file label records the number of records actually sampled
        out_label = 'sample%i-n%i' % (part, sample_count)
        with getOutputHandle(seq_file, out_label,
                             out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'],
                             out_type=out_args['out_type']) as out_handle:
            for key in sample_keys:
                SeqIO.write(seq_dict[key], out_handle, out_args['out_type'])
            out_name = out_handle.name
        out_files.append(out_name)

        printMessage('Done', start_time=start_time, end=True, width=25)

        # Print log for iteration
        log = OrderedDict([('MAX_COUNT', n),
                           ('SAMPLED', sample_count),
                           ('OUTPUT', os.path.basename(out_name))])
        printLog(log)

    # Print closing log
    log = OrderedDict([('END', 'SplitSeq')])
    printLog(log)

    return out_files
def downsizeSeqFile(seq_file, max_count, out_args=default_out_args):
    """
    Splits a sequence file into consecutive parts of limited size

    Arguments:
      seq_file : name of the sequence file to split.
      max_count : number of records written to each part before a new part is opened.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: output file names in the order the parts were opened.
    """
    # Open a numbered part file
    def _open_part(number):
        return getOutputHandle(seq_file, 'part%06i' % number,
                               out_dir=out_args['out_dir'],
                               out_name=out_args['out_name'],
                               out_type=out_args['out_type'])

    log = OrderedDict([('START', 'SplitSeq'),
                       ('COMMAND', 'count'),
                       ('FILE', os.path.basename(seq_file)),
                       ('MAX_COUNT', max_count)])
    printLog(log)

    # Open input and resolve the output type
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type
    out_type = out_args['out_type']

    # Determine total number of records
    rec_count = countSeqFile(seq_file)

    # The first part is always opened, even when the input has no records
    start_time = time()
    out_handle = _open_part(1)
    out_files = [out_handle.name]

    seq_count = 0
    for seq_count, seq in enumerate(seq_iter, start=1):
        # Print progress for previous iteration
        printProgress(seq_count - 1, rec_count, 0.05, start_time=start_time)

        # Write record
        SeqIO.write(seq, out_handle, out_type)

        # Stop at the last record so no trailing empty part is created
        if seq_count == rec_count:
            break

        # Roll over to the next part once the current one is full
        if seq_count % max_count == 0:
            out_handle.close()
            out_handle = _open_part(len(out_files) + 1)
            out_files.append(out_handle.name)

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for number, name in enumerate(out_files, start=1):
        log['OUTPUT%i' % number] = os.path.basename(name)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
def estimateBarcode(seq_file, barcode_field=default_barcode_field,
                    distance_types=default_distance_types, out_args=default_out_args):
    """
    Computes a distance table and a distance threshold from barcode annotations

    The barcode value of every record is read from its header annotation, passed
    to calculateDistances, and the resulting table plus a single threshold value
    are written to two tab-delimited files.

    Arguments:
      seq_file : the sample sequence file name.
      barcode_field : the annotation field containing barcode sequences.
      distance_types : distance types to include; used as keys into the result of
                       calculateDistances. The output code reads the 'all' key.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      tuple: names of the output files as (distance table, threshold table).
    """
    # Function to extract the barcode annotation value from a SeqRecord header
    def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
        header = parseAnnotation(seq.description, delimiter=delimiter)
        return header[field]

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['COMMAND'] = 'barcode'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    printLog(log)

    # Count records and build a lazy generator of barcode annotation values
    result_count = countSeqFile(seq_file)
    barcode_iter = (_barcode(x) for x in readSeqFile(seq_file))

    # bin_count is the length of the first record's barcode + 1; the file is
    # reopened here so barcode_iter is not advanced
    bin_count = len(_barcode(next(readSeqFile(seq_file)))) + 1
    mismatch = initializeMismatchDictionary(0, distance_types=distance_types, bin_count=bin_count)

    # Calculate distances and keep only the requested distance types
    distance_mismatch = calculateDistances(barcode_iter, bin_count=bin_count)
    mismatch['dist'] = { header: distance_mismatch[header] for header in distance_types }

    # Generate a df; the index is divided by the number of rows and the 'all'
    # column is cast to int
    dist_df = pd.DataFrame.from_dict(mismatch['dist'])
    dist_df.index = dist_df.index / len(dist_df.index)
    dist_df[['all']] = dist_df[['all']].astype(int)

    # Find the threshold: among positions in the first 75% of dist whose value
    # equals np.min(dist), take the mean position (truncated to int) and look
    # up the corresponding index value of dist_df.
    # NOTE(review): np.min(dist) is taken over the whole of dist, not just the
    # first 75%; if the minimum only occurs in the last quarter the candidate
    # list is empty -- confirm this is intended.
    dist = mismatch['dist']['all']
    thresh_df = pd.DataFrame.from_dict({'thresh': {'ALL': dist_df.index[int(np.mean([index for index in np.argsort(dist[:int(len(dist)*0.75)]) \
                               if dist[index] == np.min(dist)]))]} })

    # Both outputs share the common output location and are always tab-delimited
    file_args = { 'out_dir': out_args['out_dir'], 'out_name': out_args['out_name'], 'out_type': 'tab' }

    # Output as tsv
    with getOutputHandle(seq_file, 'distance-barcode', **file_args) as dist_handle, \
            getOutputHandle(seq_file, 'threshold-barcode', **file_args) as thresh_handle:
        dist_df.to_csv(dist_handle, sep='\t', na_rep='NA', index=True,
                       index_label='DISTANCE', columns=['all'], header=['ALL'],
                       float_format='%.6f')
        thresh_df.to_csv(thresh_handle, sep='\t', na_rep='NA', index=True,
                         index_label='TYPE', columns=['thresh'], header=['THRESHOLD'],
                         float_format='%.6f')

    # Update log; the parameter log from above is extended rather than replaced,
    # so the START entries are printed again with the results
    log['OUTPUT1'] = os.path.basename(dist_handle.name)
    log['OUTPUT2'] = os.path.basename(thresh_handle.name)
    log['SEQUENCES'] = result_count
    log['ALL_THRESHOLD'] = '%.6f' % thresh_df['thresh']['ALL']
    log['END'] = 'EstimateError'
    printLog(log)

    return (dist_handle.name, thresh_handle.name)
def sortSeqFile(seq_file, field, numeric=False, max_count=None, out_args=default_out_args):
    """
    Sorts a sequence file by the value of an annotation field

    Records sharing the same field value are kept together: a group is only
    written once its last member has been seen, and a new part file is opened
    before a group that would push the current part past max_count.

    Arguments:
      seq_file : filename of the sequence file to sort.
      field : annotation field in the sequence description to sort by.
      numeric : if True sort field numerically (empty values count as 0);
                if False sort field alphabetically.
      max_count : maximum number of records in each output file;
                  if None, or not smaller than the number of records,
                  a single output file is written.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: output file names.
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Get annotations and sort seq_dict keys by annotation values
    tag_dict = {k: parseAnnotation(seq_dict[k].description,
                                   delimiter=out_args['delimiter'])[field]
                for k in seq_dict}
    if numeric:
        tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get)

    # Determine total numbers of records. max_count defaults to None, and
    # None cannot be ordered against an int in Python 3, so test it first.
    rec_count = len(seq_dict)
    if max_count is not None and max_count >= rec_count:
        max_count = None

    # Open initial output file handle
    file_count = 1
    out_label = 'sorted' if max_count is None else 'sorted-part%06i' % file_count
    out_handle = getOutputHandle(seq_file, out_label,
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
    out_files = [out_handle.name]
    chunk_count = 0

    # Write one group of same-tag records, opening a new part first if needed
    def _write_group(keys):
        nonlocal out_handle, file_count, chunk_count
        if max_count is not None and chunk_count + len(keys) > max_count:
            # Update partition counts and switch to a new file handle
            out_handle.close()
            file_count += 1
            chunk_count = 0
            out_handle = getOutputHandle(seq_file, 'sorted-part%06i' % file_count,
                                         out_dir=out_args['out_dir'],
                                         out_name=out_args['out_name'],
                                         out_type=out_args['out_type'])
            out_files.append(out_handle.name)
        for k in keys:
            chunk_count += 1
            SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])

    # Loop through sorted sequence dictionary keys
    start_time = time()
    last_tag = None
    saved_keys = []
    seq_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration and update count
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        seq_count += 1

        if last_tag is not None and tag_dict[key] != last_tag:
            # Tag changed: write the finished group and start a new one
            _write_group(saved_keys)
            saved_keys = [key]
        else:
            # Tag unchanged: keep collecting the current group
            saved_keys.append(key)

        # Update tag tracker
        last_tag = tag_dict[key]

    # Write the final group through the same size check as every other group,
    # so the last part cannot exceed max_count either
    if saved_keys:
        _write_group(saved_keys)

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = seq_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Output files are opened lazily, so a pass (or fail) file is only created
    once the first converted (or unconverted) record is written.

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers; a return
                     value of None marks the record as failed.
      convert_args : a dictionary of arguments to pass to convert_func.
                     The dictionary is copied and never modified.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output sequence file name, or None if no record was converted.
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic',
                convert454Header: '454',
                convertGenbankHeader: 'genbank',
                convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt',
                convertMIGECHeader: 'migec',
                convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Wrapper for opening handles; out_file only overrides the pass file name
    def _open(x, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file, 'convert-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle

    # Count records
    result_count = countSeqFile(seq_file)

    # Set additional conversion arguments on a private copy, so neither the
    # caller's dictionary nor the shared default is changed between calls
    convert_args = dict(convert_args)
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter': out_args['delimiter']})

    # Initialize file handles
    pass_handle, fail_handle = None, None

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            if pass_handle is None:
                pass_handle = _open('pass')
            SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if out_args['failed']:
                # Write unconverted sequences
                if fail_handle is None:
                    fail_handle = _open('fail')
                SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles; each handle is checked on its own
    if pass_handle is not None:
        pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()

    return pass_handle.name if pass_handle is not None else None
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args):
    """
    Rewrites every sequence header using a modification function

    Arguments:
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation; called with
                    the parsed annotation, the delimiter and modify_args.
      modify_args : a dictionary of arguments to pass to modify_func.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output file name.
    """
    # Subcommand labels; functions not listed are logged under their own name
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}

    # Print parameter info, with modify_args appended in sorted key order
    log = OrderedDict([('START', 'ParseHeaders'),
                       ('COMMAND', cmd_dict.get(modify_func, modify_func.__name__)),
                       ('FILE', os.path.basename(seq_file))])
    for name in sorted(modify_args):
        value = modify_args[name]
        if isinstance(value, list):
            value = ','.join(value)
        log[name.upper()] = value
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type
    out_handle = getOutputHandle(seq_file, 'reheader',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])

    # Count records
    result_count = countSeqFile(seq_file)

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq_count, seq in enumerate(seq_iter, start=1):
        # Print progress for previous iteration
        printProgress(seq_count - 1, result_count, 0.05, start_time)

        # Parse, modify and flatten the header
        delimiter = out_args['delimiter']
        annotation = parseAnnotation(seq.description, delimiter=delimiter)
        annotation = modify_func(annotation, delimiter=delimiter, **modify_args)
        new_id = flattenAnnotation(annotation, delimiter=delimiter)

        # Write new sequence; the full header now lives in the identifier
        seq.id = new_id
        seq.name = new_id
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('SEQUENCES', seq_count),
                       ('END', 'ParseHeaders')])
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_handle.name
def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args):
    """
    Writes selected header annotations of each record to a tab-delimited table

    Arguments:
      seq_file : the sequence file name.
      fields : the list of fields to output; also used as the table header.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output table file name.
    """
    log = OrderedDict([('START', 'ParseHeaders'),
                       ('COMMAND', 'table'),
                       ('FILE', os.path.basename(seq_file))])
    printLog(log)

    # Open file handles
    seq_iter = readSeqFile(seq_file)
    if out_file is None:
        out_handle = getOutputHandle(seq_file, 'headers',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type='tab')
    else:
        out_handle = open(out_file, 'w')

    # Count records
    result_count = countSeqFile(seq_file)

    # Open csv writer and write header
    out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='',
                                delimiter='\t', fieldnames=fields)
    out_writer.writeheader()

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    pass_count = 0
    fail_count = 0
    for seq_count, seq in enumerate(seq_iter, start=1):
        # Print progress for previous iteration
        printProgress(seq_count - 1, result_count, 0.05, start_time=start_time)

        # Get annotations
        ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

        # Records with an empty annotation result are counted as failures
        if not ann:
            fail_count += 1
            continue

        # Write records
        pass_count += 1
        out_writer.writerow(ann)

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('SEQUENCES', seq_count),
                       ('PASS', pass_count),
                       ('FAIL', fail_count),
                       ('END', 'ParseHeaders')])
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_handle.name
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type, out_args=default_out_args):
    """
    Synchronizes two paired sequence files and copies annotations between them

    File 1 is indexed by coordinate key and file 2 is streamed; a file 2 record
    is paired when its coordinate key is found in the file 1 index.

    Arguments:
      seq_file_1 : the first sequence file; indexed by coordinate key.
      seq_file_2 : the second sequence file; iterated record by record.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: a list of tuples holding successfully paired filenames for
            (seq_file_1, seq_file_2).
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_2, out_type=out_type_2)

    # Fail handles stay None unless failed output was requested, so the
    # cleanup at the end can tell whether there is anything to close
    fail_handle_1 = fail_handle_2 = None
    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
    # Keys of file 1 records that found a mate; only filled for failed output
    pass_keys = set()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items()
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items()
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key set used
        # for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None:
                pass_keys.add(coord_2)
            else:
                SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles, including the fail handles when they were opened
    pass_handle_1.close()
    pass_handle_2.close()
    for handle in (fail_handle_1, fail_handle_2):
        if handle is not None:
            handle.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
    """
    Applies a modification function to the header of every sequence

    Arguments:
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation; it receives
                    the parsed annotation, the delimiter and modify_args.
      modify_args : a dictionary of arguments to pass to modify_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output file name.
    """
    # Subcommand labels; unlisted functions are logged under their own name
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}

    # Render a modify_args value for the log; lists become comma separated
    def _log_value(value):
        return ','.join(value) if isinstance(value, list) else value

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    log.update((name.upper(), _log_value(modify_args[name]))
               for name in sorted(modify_args))
    printLog(log)

    # Open input and resolve the output type
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # An explicit out_file takes precedence over the generated name
    if out_file is None:
        out_handle = getOutputHandle(seq_file, 'reheader',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
    else:
        out_handle = open(out_file, 'w')

    # Count records
    result_count = countSeqFile(seq_file)

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration, then update the count
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        seq_count += 1

        # Modify header
        parsed = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        modified = modify_func(parsed, delimiter=out_args['delimiter'], **modify_args)
        flat = flattenAnnotation(modified, delimiter=out_args['delimiter'])

        # Write new sequence with the whole header stored in the identifier
        seq.id = flat
        seq.name = flat
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_handle.name
def convertHeaders(seq_file, convert_func, convert_args={}, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers; a return
                     value of None marks the record as failed.
      convert_args : a dictionary of arguments to pass to convert_func.
                     The dictionary is copied and never modified.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output sequence file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic',
                convert454Header: '454',
                convertGenbankHeader: 'genbank',
                convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt',
                convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Count records
    result_count = countSeqFile(seq_file)

    # Open output file handles
    pass_handle = getOutputHandle(seq_file, 'convert-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    if out_args['failed']:
        fail_handle = getOutputHandle(seq_file, 'convert-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    else:
        fail_handle = None

    # Set additional conversion arguments on a private copy: updating the
    # shared default dictionary in place would leak 'delimiter' into later
    # calls that use a converter which does not accept it
    convert_args = dict(convert_args)
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter': out_args['delimiter']})

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if fail_handle is not None:
                # Write sequences whose header could not be converted
                SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()

    return pass_handle.name
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None, action=None,
            coord_type=default_coord, out_args=default_out_args):
    """
    Synchronizes paired-end files and copies annotations between them

    File 1 is indexed by coordinate key and file 2 is streamed; a file 2 record
    is paired when its coordinate key is found in the file 1 index.

    Arguments:
      seq_file_1 : the first sequence file; indexed by coordinate key.
      seq_file_2 : the second sequence file; iterated record by record.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      action : the collapse action to take on all copied annotation if they already
               exist in the target header; if None do not collapse.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: a list of tuples holding successfully paired filenames for
            (seq_file_1, seq_file_2).
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_2, out_type=out_type_2)

    # Fail handles stay None unless failed output was requested, so the
    # cleanup at the end can tell whether there is anything to close
    fail_handle_1 = fail_handle_2 = None
    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
    # Keys of file 1 records that found a mate; only filled for failed output
    pass_keys = set()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items()
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(merge_ann, action, fields=fields_1,
                                                       delimiter=out_args['delimiter'])
                    # Flatten
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items()
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(merge_ann, action, fields=fields_2,
                                                       delimiter=out_args['delimiter'])
                    # Flatten
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key set used
        # for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None:
                pass_keys.add(coord_2)
            else:
                SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles, including the fail handles when they were opened
    pass_handle_1.close()
    pass_handle_2.close()
    for handle in (fail_handle_1, fail_handle_2):
        if handle is not None:
            handle.close()

    return [(pass_handle_1.name, pass_handle_2.name)]