def addDbFile(db_file, fields, values, out_args=default_out_args): """ Adds field and value pairs to a database file Arguments: db_file = the database file name fields = a list of fields to add values = a list of values to assign to all rows of each field out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'add' log['FILE'] = os.path.basename(db_file) log['FIELDS'] = ','.join(fields) log['VALUES'] = ','.join(values) printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields) # Count records result_count = countDbFile(db_file) # Define fields and values to append add_dict = {k:v for k,v in zip(fields, values) if k not in db_iter.fieldnames} # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Write updated row rec.update(add_dict) pass_writer.writerow(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
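# Usage sketch (illustrative, not part of the original module): assuming a tab-delimited
# database file 'reads_db-pass.tab' exists, constant-valued columns could be appended to
# every row as below. The file name and field/value pairs are hypothetical; fields that
# already exist in the input are silently skipped by add_dict above.
#
#   out_file = addDbFile('reads_db-pass.tab', fields=['SAMPLE', 'SUBJECT'],
#                        values=['S01', 'P1'])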
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args): """ Adds an index column to a database file Arguments: db_file = the database file name field = the name of the index field to add out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'index' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file, add_fields=field) # Count records result_count = countDbFile(db_file) # Iterate over records start_time = time() rec_count = 0 for rec in db_iter: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Add count and write updated row rec.update({field: rec_count}) pass_writer.writerow(rec) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
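# Usage sketch (illustrative): with the defaults, a sequential 1-based row index is written
# to the column named by default_index_field. The input file name is hypothetical.
#
#   out_file = indexDbFile('reads_db-pass.tab')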
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to drop
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
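# Usage sketch (illustrative): dropping two hypothetical score columns; all other columns
# pass through unchanged to the '*_parse-drop.tab' output.
#
#   out_file = dropDbFile('reads_db-pass.tab', fields=['V_EVALUE', 'J_EVALUE'])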
def collectQueue(alive, result_queue, collect_queue, db_file, out_args,
                 cluster_func=None, cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func

    Returns:
    None (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)

        # Defined successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Defined failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        #print 'START COLLECT', alive.value
        # Iterator over results queue until sentinel object reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break
            #print "COLLECT", alive.value, result['id']

            # Print progress for previous iteration and update record count
            if rec_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
            else:
                for i, rec in enumerate(result.data):
                    if fail_writer is not None:
                        fail_writer.writerow(rec.toDict())
                    fail_count += 1
                    result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates field and value pairs in a database file

    Arguments:
    db_file = the database file name
    field = the field to update
    values = a list of values specifying which rows to update
    updates = a list of values to update each value with
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update value if found
        for x, y in zip(values, updates):
            if rec[field] == x:
                rec[field] = y
                pass_count += 1

        # Write records
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
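# Usage sketch (illustrative): values and updates are paired positionally, so this call
# would rewrite 'IGHA' to 'IgA' and 'IGHG' to 'IgG' in a hypothetical ISOTYPE column,
# leaving non-matching rows untouched.
#
#   out_file = updateDbFile('reads_db-pass.tab', field='ISOTYPE',
#                           values=['IGHA', 'IGHG'], updates=['IgA', 'IgG'])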
def sortDbFile(db_file, field, numeric=False, descend=False, out_args=default_out_args): """ Sorts records by values in an annotation field Arguments: db_file = the database filename field = the field name to sort by numeric = if True sort field numerically; if False sort field alphabetically descend = if True sort in descending order; if False sort in ascending order out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'sort' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field log['NUMERIC'] = numeric printLog(log) # Open file handles db_iter = readDbFile(db_file, ig=False) pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') pass_writer = getDbWriter(pass_handle, db_file) # Store all records in a dictionary start_time = time() printMessage("Indexing: Running", start_time=start_time) db_dict = {i: r for i, r in enumerate(db_iter)} result_count = len(db_dict) # Sort db_dict by field values tag_dict = {k: v[field] for k, v in db_dict.items()} if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()} sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend) printMessage("Indexing: Done", start_time=start_time, end=True) # Iterate over records start_time = time() rec_count = 0 for key in sorted_keys: # Print progress for previous iteration printProgress(rec_count, result_count, 0.05, start_time) rec_count += 1 # Write records pass_writer.writerow(db_dict[key]) # Print counts printProgress(rec_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['END'] = 'ParseDb' printLog(log) # Close file handles pass_handle.close() return pass_handle.name
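# Usage sketch (illustrative): sorting a hypothetical file by duplicate count, largest
# first. Note the whole file is held in memory while the sort keys are built.
#
#   out_file = sortDbFile('reads_db-pass.tab', field='DUPCOUNT',
#                         numeric=True, descend=True)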
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args): """ Divides a tab-delimited database file into segments by description tags Arguments: db_file = filename of the tab-delimited database file to split field = the field name by which to split db_file num_split = the numerical threshold by which to group sequences; if None treat field as textual out_args = common output argument dictionary from parseCommonArgs Returns: a list of output file names """ log = OrderedDict() log['START'] = 'ParseDb' log['COMMAND'] = 'split' log['FILE'] = os.path.basename(db_file) log['FIELD'] = field log['NUM_SPLIT'] = num_split printLog(log) # Open IgRecord reader iter object reader = readDbFile(db_file, ig=False) # Determine total numbers of records rec_count = countDbFile(db_file) start_time = time() count = 0 # Sort records into files based on textual field if num_split is None: # Create set of unique field tags tmp_iter = readDbFile(db_file, ig=False) tag_list = list(set([row[field] for row in tmp_iter])) # Forbidden characters in filename and replacements noGood = { '\/': 'f', '\\': 'b', '?': 'q', '\%': 'p', '*': 's', ':': 'c', '\|': 'pi', '\"': 'dq', '\'': 'sq', '<': 'gt', '>': 'lt', ' ': '_' } # Replace forbidden characters in tag_list tag_dict = {} for tag in tag_list: for c, r in noGood.items(): tag_dict[tag] = (tag_dict.get(tag, tag).replace(c,r) \ if c in tag else tag_dict.get(tag, tag)) # Create output handles handles_dict = { tag: getOutputHandle(db_file, '%s-%s' % (field, label), out_type=out_args['out_type'], out_name=out_args['out_name'], out_dir=out_args['out_dir']) for tag, label in tag_dict.items() } # Create Db writer instances writers_dict = { tag: getDbWriter(handles_dict[tag], db_file) for tag in tag_dict } # Iterate over IgRecords for row in reader: printProgress(count, rec_count, 0.05, start_time) count += 1 # Write row to appropriate file tag = row[field] writers_dict[tag].writerow(row) # Sort records into files based on numeric num_split else: num_split = float(num_split) # Create output handles handles_dict = { 'under': getOutputHandle(db_file, 'under-%.1f' % num_split, out_type=out_args['out_type'], out_name=out_args['out_name'], out_dir=out_args['out_dir']), 'atleast': getOutputHandle(db_file, 'atleast-%.1f' % num_split, out_type=out_args['out_type'], out_name=out_args['out_name'], out_dir=out_args['out_dir']) } # Create Db writer instances writers_dict = { 'under': getDbWriter(handles_dict['under'], db_file), 'atleast': getDbWriter(handles_dict['atleast'], db_file) } # Iterate over IgRecords for row in reader: printProgress(count, rec_count, 0.05, start_time) count += 1 tag = row[field] tag = 'under' if float(tag) < num_split else 'atleast' writers_dict[tag].writerow(row) # Write log printProgress(count, rec_count, 0.05, start_time) log = OrderedDict() for i, k in enumerate(handles_dict): log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name) log['RECORDS'] = rec_count log['PARTS'] = len(handles_dict) log['END'] = 'ParseDb' printLog(log) # Close output file handles for t in handles_dict: handles_dict[t].close() return [handles_dict[t].name for t in handles_dict]
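# Usage sketch (illustrative): with num_split=None each distinct value of the field gets
# its own output file; with a numeric threshold records are divided into 'under' and
# 'atleast' files. File and field names are hypothetical.
#
#   by_sample = splitDbFile('reads_db-pass.tab', field='SAMPLE')
#   by_count = splitDbFile('reads_db-pass.tab', field='DUPCOUNT', num_split=2)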
def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args,
                      cluster_func, cluster_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func

    Returns:
    None (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Iterate over Ig records to count and order by junction length
        result_count = 0
        records = {}
        # print 'Reading file...'
        db_iter = readDbFile(db_file)
        for rec in db_iter:
            records[rec.id] = rec
            result_count += 1
        records = OrderedDict(sorted(list(records.items()),
                                     key=lambda i: i[1].junction_length))

        # Define empty matrix to store assembled results
        dist_mat = np.zeros((result_count, result_count))

        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Defined successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Defined failed cloning output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Open log file
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        row_count = rec_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break

            # Print progress for previous iteration
            if row_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(row_count, result_count, 0.05, start_time)

            # Update counts for iteration
            row_count += 1
            rec_count += len(result)

            # Add result row to distance matrix
            if result:
                dist_mat[list(range(result_count - len(result), result_count)),
                         result_count - len(result)] = result.results
        else:
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Calculate linkage and carry out clustering
        # print dist_mat
        clusters = cluster_func(dist_mat, **cluster_args) if dist_mat is not None else None
        clones = {}
        # print clusters
        for i, c in enumerate(clusters):
            clones.setdefault(c, []).append(records[list(records.keys())[i]])

        # Write passed and failed records
        clone_count = pass_count = fail_count = 0
        if clones:
            for clone in clones.values():
                clone_count += 1
                for i, rec in enumerate(clone):
                    rec.annotations['CLONE'] = clone_count
                    pass_writer.writerow(rec.toDict())
                    pass_count += 1
                    #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
        else:
            for i, rec in enumerate(result.data):
                fail_writer.writerow(rec.toDict())
                fail_count += 1
                #result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)

        # Print final progress
        printProgress(row_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise

    return None
def collectDbQueue(alive, result_queue, collect_queue, db_file, task_label, out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing continues;
              when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                   if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
             'log' defining a log object,
             'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Defined valid alignment output handle
        pass_handle = getOutputHandle(db_file,
                                      '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

        # Defined failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          '%s-fail' % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            if result.log is not None:
                printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                if isinstance(result.results, IgRecord):
                    pass_writer.writerow(result.results.toDict())
                else:
                    for rec in result.results:
                        pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                if fail_handle is not None:
                    if isinstance(result.data, IgRecord):
                        fail_writer.writerow(result.data.toDict())
                    else:
                        for rec in result.data:
                            fail_writer.writerow(rec.toDict())
        else:
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
    except:
        alive.value = False
        raise

    return None
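# Wiring sketch (illustrative only; this is not the module's actual process manager):
# the collector expects worker results on result_queue and a single None sentinel to mark
# the end of input, with the shared `alive` flag letting any process abort the loop.
# A minimal stand-alone arrangement using only the standard library might look like this;
# the file name and task label are hypothetical.
#
#   import multiprocessing as mp
#   alive = mp.Value('b', True)
#   result_queue, collect_queue = mp.Queue(), mp.Queue()
#   collector = mp.Process(target=collectDbQueue,
#                          args=(alive, result_queue, collect_queue,
#                                'reads_db-pass.tab', 'task', default_out_args))
#   collector.start()
#   # ... worker processes put result objects onto result_queue here ...
#   result_queue.put(None)           # sentinel: no more results
#   collector.join()
#   collected = collect_queue.get()  # {'log': ..., 'out_files': [...]}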
def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, score_fields=False, region_fields=False, out_args=default_out_args): """ Writes tab-delimited database file in output directory Arguments: db_gen = a generator of IgRecord objects containing alignment data file_prefix = directory and prefix for CLIP tab-delim file total_count = number of records (for progress bar) id_dict = a dictionary of {IMGT ID: full seq description} no_parse = if ID is to be parsed for pRESTO output with default delimiters score_fields = if True add alignment score fields to output file region_fields = if True add FWR and CDR region fields to output file out_args = common output argument dictionary from parseCommonArgs Returns: None """ pass_file = "%s_db-pass.tab" % file_prefix fail_file = "%s_db-fail.tab" % file_prefix ordered_fields = ['SEQUENCE_ID', 'SEQUENCE_INPUT', 'FUNCTIONAL', 'IN_FRAME', 'STOP', 'MUTATED_INVARIANT', 'INDELS', 'V_CALL', 'D_CALL', 'J_CALL', 'SEQUENCE_VDJ', 'SEQUENCE_IMGT', 'V_SEQ_START', 'V_SEQ_LENGTH', 'V_GERM_START_VDJ', 'V_GERM_LENGTH_VDJ', 'V_GERM_START_IMGT', 'V_GERM_LENGTH_IMGT', 'N1_LENGTH', 'D_SEQ_START', 'D_SEQ_LENGTH', 'D_GERM_START', 'D_GERM_LENGTH', 'N2_LENGTH', 'J_SEQ_START', 'J_SEQ_LENGTH', 'J_GERM_START', 'J_GERM_LENGTH', 'JUNCTION_LENGTH', 'JUNCTION'] if score_fields: ordered_fields.extend(['V_SCORE', 'V_IDENTITY', 'V_EVALUE', 'V_BTOP', 'J_SCORE', 'J_IDENTITY', 'J_EVALUE', 'J_BTOP']) if region_fields: ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) # TODO: This is not the best approach. should pass in output fields. # Initiate passed handle pass_handle = None # Open failed file if out_args['failed']: fail_handle = open(fail_file, 'wt') fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) else: fail_handle = None fail_writer = None # Initialize counters and file pass_writer = None start_time = time() rec_count = pass_count = fail_count = 0 for record in db_gen: #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) printProgress(rec_count, total_count, 0.05, start_time) rec_count += 1 # Count pass or fail if (record.v_call == 'None' and record.j_call == 'None') or \ record.functional is None or \ not record.seq_vdj or \ not record.junction: # print(record.v_call, record.j_call, record.functional, record.junction) fail_count += 1 if fail_writer is not None: fail_writer.writerow(record.toDict()) continue else: pass_count += 1 # Build sample sequence description if record.id in id_dict: record.id = id_dict[record.id] # Parse sequence description into new columns if not no_parse: record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) record.id = record.annotations['ID'] del record.annotations['ID'] # TODO: This is not the best approach. should pass in output fields. 
# If first sequence, use parsed description to create new columns and initialize writer if pass_writer is None: if not no_parse: ordered_fields.extend(list(record.annotations.keys())) pass_handle = open(pass_file, 'wt') pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) # Write row to tab-delim CLIP file pass_writer.writerow(record.toDict()) # Print log #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) printProgress(rec_count, total_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = pass_file log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'MakeDb' printLog(log) if pass_handle is not None: pass_handle.close() if fail_handle is not None: fail_handle.close()
def assembleCloneGermline(db_file, repo, seq_field=default_seq_field, v_field=default_v_field,
                          germ_types=default_germ_types, out_args=default_out_args):
    """
    Assemble one germline sequence for each clone in a tab-delimited database file

    Arguments:
    db_file = input tab-delimited database file
    repo = folder with germline repertoire files
    germ_types = types of germline sequences to be output
                 (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    out_args = arguments for output preferences

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'True'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    references = readRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
    if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']

    add_fields += ['GERMLINE_V_CALL']
    add_fields += ['GERMLINE_D_CALL']
    add_fields += ['GERMLINE_J_CALL']

    # Create output file handle and Db writer
    writers = {}
    pass_handle = getOutputHandle(db_file, 'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        writers['fail'] = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    counts = {}
    clone_count = counts['pass'] = counts['fail'] = 0

    # Iterate over rows
    clone = 'initial'
    clone_dict = OrderedDict()
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)

        # Clone isn't over yet
        if row.get('CLONE', '') == clone:
            clone_dict[i] = row
        # Clone just finished
        elif clone_dict:
            clone_count += 1
            result_log = makeCloneGermline(clone, clone_dict, references, germ_types,
                                           v_field, seq_field, counts, writers, out_args)
            printLog(result_log, handle=log_handle)
            # Now deal with current row (first of next clone)
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])
        # Last case is only for first row of file
        else:
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])

    clone_count += 1
    result_log = makeCloneGermline(clone, clone_dict, references, germ_types, v_field,
                                   seq_field, counts, writers, out_args)
    printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLONES'] = clone_count
    log['RECORDS'] = rec_count
    log['PASS'] = counts['pass']
    log['FAIL'] = counts['fail']
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()
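# Usage sketch (illustrative): assuming the input carries a CLONE column and is grouped by
# clone (e.g. output of a clonal assignment step), one germline per clone could be built
# as below. The repertoire path and field names are hypothetical.
#
#   assembleCloneGermline('clones_clone-pass.tab', repo='germlines/imgt/human/vdj',
#                         seq_field='SEQUENCE_IMGT', v_field='V_CALL',
#                         germ_types=['dmask'])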
def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args): """ Write germline sequences to tab-delimited database file Arguments: db_file = input tab-delimited database file repo = folder with germline repertoire files germ_types = types of germline sequences to be output (full germline, D-region masked, only V-region germline) v_field = field in which to look for V call seq_field = field in which to look for sequence out_args = arguments for output preferences Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'CreateGermlines' log['DB_FILE'] = os.path.basename(db_file) log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types) log['CLONED'] = 'False' log['V_FIELD'] = v_field log['SEQ_FIELD'] = seq_field printLog(log) # Get repertoire and open Db reader references = readRepo(repo) reader = readDbFile(db_file, ig=False) # Exit if V call field does not exist in reader if v_field not in reader.fieldnames: sys.exit('Error: V field does not exist in input database file.') # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') add_fields = [] seq_type = seq_field.split('_')[-1] if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type] if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK'] if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION'] if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS'] # Create output file handle and Db writer pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields) if out_args['failed']: fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields) else: fail_handle = None fail_writer = None # Initialize time and total count for progress bar start_time = time() rec_count = countDbFile(db_file) pass_count = fail_count = 0 # Iterate over rows for i, row in enumerate(reader): # Print progress printProgress(i, rec_count, 0.05, start_time) result_log, germlines = joinGermline(row, references, seq_field=seq_field, v_field=v_field, germ_types=germ_types) # Add germline field(s) to dictionary if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full'] if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask'] if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly'] if 'regions' in germ_types: row['GERMLINE_REGIONS'] = germlines['regions'] # Write row to pass or fail file if 'ERROR' in result_log: fail_count += 1 if fail_writer is not None: fail_writer.writerow(row) else: result_log['SEQUENCE'] = row[seq_field] result_log['GERMLINE'] = germlines['full'] result_log['REGIONS'] = germlines['regions'] pass_count += 1 pass_writer.writerow(row) printLog(result_log, handle=log_handle) # Print log printProgress(i + 1, rec_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'CreateGermlines' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close()
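# Usage sketch (illustrative): building a D-masked germline for every record independently.
# The repertoire path and file name are hypothetical; germ_types accepts any subset of
# 'full', 'dmask', 'vonly' and 'regions'.
#
#   assembleEachGermline('reads_db-pass.tab', repo='germlines/imgt/human/vdj',
#                        germ_types=['dmask'], v_field='V_CALL',
#                        seq_field='SEQUENCE_IMGT')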
def collectDbQueue(alive, result_queue, collect_queue, db_file, task_label, out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing continues;
              when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                   if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
             'log' defining a log object,
             'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Defined valid alignment output handle
        pass_handle = getOutputHandle(db_file,
                                      '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

        # Defined failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          '%s-fail' % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                for rec in result.results:
                    pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                if fail_handle is not None:
                    for rec in result.data:
                        fail_writer.writerow(rec.toDict())
        else:
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
    except:
        alive.value = False
        raise

    return None
def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, out_args=default_out_args): """ Writes tab-delimited database file in output directory. Arguments: db : a iterator of IgRecord objects containing alignment data. fields : a list of ordered field names to write. file_prefix : directory and prefix for CLIP tab-delim file. total_count : number of records (for progress bar). id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. no_parse : if ID is to be parsed for pRESTO output with default delimiters. partial : if True put incomplete alignments in the pass file. out_args : common output argument dictionary from parseCommonArgs. Returns: None """ # Function to check for valid records strictly def _pass_strict(rec): valid = [rec.v_call and rec.v_call != 'None', rec.j_call and rec.j_call != 'None', rec.functional is not None, rec.seq_vdj, rec.junction] return all(valid) # Function to check for valid records loosely def _pass_gentle(rec): valid = [rec.v_call and rec.v_call != 'None', rec.d_call and rec.d_call != 'None', rec.j_call and rec.j_call != 'None'] return any(valid) # Set pass criteria _pass = _pass_gentle if partial else _pass_strict # Define output file names pass_file = '%s_db-pass.tab' % file_prefix fail_file = '%s_db-fail.tab' % file_prefix # Initiate handles, writers and counters pass_handle = None fail_handle = None pass_writer = None fail_writer = None start_time = time() rec_count = pass_count = fail_count = 0 # Validate and write output printProgress(0, total_count, 0.05, start_time) for i, record in enumerate(db, start=1): # Replace sequence description with full string, if required if id_dict is not None and record.id in id_dict: record.id = id_dict[record.id] # Parse sequence description into new columns if not no_parse: try: record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) record.id = record.annotations['ID'] del record.annotations['ID'] # TODO: This is not the best approach. should pass in output fields. # If first record, use parsed description to define extra columns if i == 1: fields.extend(list(record.annotations.keys())) except IndexError: # Could not parse pRESTO-style annotations so fall back to no parse no_parse = True sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') # Count pass or fail and write to appropriate file if _pass(record): # Open pass file if pass_writer is None: pass_handle = open(pass_file, 'wt') pass_writer = getDbWriter(pass_handle, add_fields=fields) # Write row to pass file pass_count += 1 pass_writer.writerow(record.toDict()) else: # Open failed file if out_args['failed'] and fail_writer is None: fail_handle = open(fail_file, 'wt') fail_writer = getDbWriter(fail_handle, add_fields=fields) # Write row to fail file if specified fail_count += 1 if fail_writer is not None: fail_writer.writerow(record.toDict()) # Print progress printProgress(i, total_count, 0.05, start_time) # Print consol log log = OrderedDict() log['OUTPUT'] = pass_file log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'MakeDb' printLog(log) if pass_handle is not None: pass_handle.close() if fail_handle is not None: fail_handle.close()
def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args): """ Write germline sequences to tab-delimited database file Arguments: db_file = input tab-delimited database file repo = folder with germline repertoire files germ_types = types of germline sequences to be output (full germline, D-region masked, only V-region germline) v_field = field in which to look for V call seq_field = field in which to look for sequence out_args = arguments for output preferences Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'CreateGermlines' log['DB_FILE'] = os.path.basename(db_file) log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types) log['CLONED'] = 'False' log['V_FIELD'] = v_field log['SEQ_FIELD'] = seq_field printLog(log) # Get repertoire and open Db reader repo_dict = getRepo(repo) reader = readDbFile(db_file, ig=False) # Exit if V call field does not exist in reader if v_field not in reader.fieldnames: sys.exit('Error: V field does not exist in input database file.') # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') add_fields = [] seq_type = seq_field.split('_')[-1] if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type] if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK'] if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION'] # Create output file handle and Db writer pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields) if out_args['failed']: fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields) else: fail_handle = None fail_writer = None # Initialize time and total count for progress bar start_time = time() rec_count = countDbFile(db_file) pass_count = fail_count = 0 # Iterate over rows for i,row in enumerate(reader): # Print progress printProgress(i, rec_count, 0.05, start_time) result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field) # Add germline field(s) to dictionary if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full'] if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask'] if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly'] # Write row to pass or fail file if 'ERROR' in result_log: fail_count += 1 if fail_writer is not None: fail_writer.writerow(row) else: result_log['SEQUENCE'] = row[seq_field] result_log['GERMLINE'] = germlines['full'] result_log['REGIONS'] = germlines['regions'] pass_count += 1 pass_writer.writerow(row) printLog(result_log, handle=log_handle) # Print log printProgress(i+1, rec_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['RECORDS'] = rec_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'CreateGermlines' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close()
def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args): """ Assemble one germline sequence for each clone in a tab-delimited database file Arguments: db_file = input tab-delimited database file repo = folder with germline repertoire files germ_types = types of germline sequences to be output (full germline, D-region masked, only V-region germline) v_field = field in which to look for V call seq_field = field in which to look for sequence out_args = arguments for output preferences Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'CreateGermlines' log['DB_FILE'] = os.path.basename(db_file) log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types) log['CLONED'] = 'True' log['V_FIELD'] = v_field log['SEQ_FIELD'] = seq_field printLog(log) # Get repertoire and open Db reader repo_dict = getRepo(repo) reader = readDbFile(db_file, ig=False) # Exit if V call field does not exist in reader if v_field not in reader.fieldnames: sys.exit('Error: V field does not exist in input database file.') # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') add_fields = [] seq_type = seq_field.split('_')[-1] if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type] if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK'] if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION'] # Create output file handle and Db writer writers = {} pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields) if out_args['failed']: fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields) else: fail_handle = None writers['fail'] = None # Initialize time and total count for progress bar start_time = time() rec_count = countDbFile(db_file) counts = {} clone_count = counts['pass'] = counts['fail'] = 0 # Iterate over rows clone = 'initial' clone_dict = OrderedDict() for i,row in enumerate(reader): # Print progress printProgress(i, rec_count, 0.05, start_time) # Clone isn't over yet if row.get('CLONE','') == clone: clone_dict[row["SEQUENCE_ID"]] = row # Clone just finished elif clone_dict: clone_count += 1 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field, seq_field, counts, writers, out_args) printLog(result_log, handle=log_handle) # Now deal with current row (first of next clone) clone = row['CLONE'] clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)]) # Last case is only for first row of file else: clone = row['CLONE'] clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)]) clone_count += 1 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field, seq_field, counts, writers, out_args) printLog(result_log, handle=log_handle) # Print log printProgress(i+1, rec_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['CLONES'] = clone_count log['RECORDS'] = rec_count log['PASS'] = counts['pass'] log['FAIL'] = counts['fail'] log['END'] = 'CreateGermlines' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() if log_handle is not None: log_handle.close()
def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to check for selection criteria
    values = a list of values defining selection targets
    logic = one of 'any' or 'all' defining whether one or all fields must have a match.
    regex = if False do exact full string matches; if True allow partial regex matches.
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-select',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for selection values in all fields
        select = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write sequences
        if select:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
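# Illustrative usage sketch (assumed file, field, and values; not original
# code): keep records whose V_CALL contains IGHV1 or IGHV3, using partial
# regex matching and requiring a match in any of the listed fields. The
# helper is defined but never called here.
def _example_select_db_file():
    return selectDbFile('db.tab', fields=['V_CALL'],
                        values=['IGHV1', 'IGHV3'],
                        logic='any', regex=True)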
def collectQueue(alive, result_queue, collect_queue, db_file, out_args,
                 cluster_func=None, cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing
            continues; if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func

    Returns:
    None (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)

        # Define successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        #print 'START COLLECT', alive.value
        # Iterate over results queue until sentinel object reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break
            #print "COLLECT", alive.value, result['id']

            # Print progress for previous iteration and update record count
            if rec_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Write passing sequences
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i)] = str(rec.junction)

                # Write failed sequences from passing sets
                if result.failed:
                    for i, rec in enumerate(result.failed, start=1):
                        fail_count += 1
                        if fail_writer is not None:
                            fail_writer.writerow(rec.toDict())
                        result.log['FAIL%i-%i' % (clone_count, i)] = str(rec.junction)
            else:
                # Write failed sets
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    if fail_writer is not None:
                        fail_writer.writerow(rec.toDict())
                    result.log['CLONE0-%i' % (i)] = str(rec.junction)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
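# Illustrative wiring sketch (an assumption about how the collector is driven,
# not original code): collectQueue is meant to run in its own process, reading
# worker results from result_queue until it receives the None sentinel, then
# pushing its summary dictionary onto collect_queue. The input file 'db.tab'
# is hypothetical and must exist for the collector to open its output handles.
# The helper is defined but never called here.
def _example_collect_queue_wiring():
    import multiprocessing as mp
    alive = mp.Value('b', True)
    result_queue, collect_queue = mp.Queue(), mp.Queue()
    collector = mp.Process(target=collectQueue,
                           args=(alive, result_queue, collect_queue,
                                 'db.tab', default_out_args))
    collector.start()
    result_queue.put(None)   # sentinel: end collection immediately in this sketch
    collector.join()
    return collect_queue.get()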