def convertToGenbank(db_file, inference=None, db_xref=None, molecule=default_molecule,
                     product=default_product, features=None, c_field=None, label=None,
                     count_field=None, index_field=None, allow_stop=False, asis_id=False,
                     asis_calls=False, allele_delim=default_allele_delim, build_asn=False,
                     asn_template=None, tbl2asn_exec=default_tbl2asn_exec, format=default_format,
                     out_file=None, out_args=default_out_args):
    """
    Builds GenBank submission fasta and table files

    Arguments:
      db_file : the database file name.
      inference : reference alignment tool.
      db_xref : reference database link.
      molecule : source molecule (eg, "mRNA", "genomic DNA")
      product : Product (protein) name.
      features : dictionary of sample features (BioSample attributes) to add to the description of each record.
      c_field : column containing the C region gene call.
      label : a string to use as a label for the ID. if None do not add a field label.
      count_field : field name to populate the AIRR_READ_COUNT note.
      index_field : field name to populate the AIRR_CELL_INDEX note.
      allow_stop : if True retain records with junctions having stop codons.
      asis_id : if True use the original sequence ID for the output IDs.
      asis_calls : if True do not parse gene calls for IMGT nomenclature.
      allele_delim : delimiter separating the gene name from the allele number when asis_calls=True.
      build_asn : if True run tbl2asn on the generated .tbl and .fsa files.
      asn_template : template file (.sbt) to pass to tbl2asn.
      tbl2asn_exec : name of or path to the tbl2asn executable.
      format : input and output format.
      out_file : output file name without extension. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      tuple : the output (feature table, fasta) file names.
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'genbank'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Define format operators
    try:
        reader, __, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['sequence_input',
                    'v_call', 'd_call', 'j_call',
                    'v_seq_start', 'd_seq_start', 'j_seq_start']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Open output
    if out_file is not None:
        out_name, __ = os.path.splitext(out_file)
        fsa_handle = open('%s.fsa' % out_name, 'w')
        tbl_handle = open('%s.tbl' % out_name, 'w')
    else:
        fsa_handle = getOutputHandle(db_file, out_label='genbank', out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type='fsa')
        tbl_handle = getOutputHandle(db_file, out_label='genbank', out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type='tbl')

    rec_count, pass_count, fail_count = 0, 0, 0
    try:
        # Count records
        result_count = countDbFile(db_file)

        # Define writer
        writer = csv.writer(tbl_handle, delimiter='\t', quoting=csv.QUOTE_NONE)

        # Iterate over records
        start_time = time()
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time=start_time)
            rec_count += 1

            # Extract table dictionary
            name = None if asis_id else rec_count
            seq = makeGenbankSequence(rec, name=name, label=label, count_field=count_field,
                                      index_field=index_field, molecule=molecule, features=features)
            tbl = makeGenbankFeatures(rec, start=seq['start'], end=seq['end'], product=product,
                                      db_xref=db_xref, inference=inference, c_field=c_field,
                                      allow_stop=allow_stop, asis_calls=asis_calls,
                                      allele_delim=allele_delim)

            if tbl is not None:
                pass_count += 1
                # Write table
                writer.writerow(['>Features', seq['record'].id])
                for feature, qualifiers in tbl.items():
                    writer.writerow(feature)
                    if qualifiers:
                        for x in qualifiers:
                            # Qualifier rows are offset by three empty leading columns
                            writer.writerow(list(chain(['', '', ''], x)))

                # Write sequence
                SeqIO.write(seq['record'], fsa_handle, 'fasta')
            else:
                fail_count += 1

        # Final progress bar
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
    finally:
        # Close (and thereby flush) all handles before anything else reads the
        # output files, and also when record processing raises.
        tbl_handle.close()
        fsa_handle.close()
        db_handle.close()

    # Run tbl2asn; the .fsa/.tbl files are fully written and closed at this point
    if build_asn:
        start_time = time()
        printMessage('Running tbl2asn', start_time=start_time, width=25)
        runASN(fsa_handle.name, template=asn_template, exec=tbl2asn_exec)
        printMessage('Done', start_time=start_time, end=True, width=25)

    # Print ending console log
    log = OrderedDict()
    log['OUTPUT_TBL'] = os.path.basename(tbl_handle.name)
    log['OUTPUT_FSA'] = os.path.basename(fsa_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    return (tbl_handle.name, fsa_handle.name)
def alignRecords(db_file, seq_fields, group_func, align_func, group_args=None, align_args=None,
                 format='changeo', out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments:
      db_file : filename of the input database.
      seq_fields : the sequence fields to multiple align.
      group_func : function to use to group records.
      align_func : function to use to multiple align sequence groups.
      group_args : dictionary of arguments to pass to group_func. Not modified; if None an
                   empty dictionary is used.
      align_args : dictionary of arguments to pass to align_func. Not modified; if None an
                   empty dictionary is used.
      format : output format. One of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs. Not modified.
      nproc : the number of processQueue processes.
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue.
                   if None defaults to 2*nproc.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Work on private copies: the dictionaries are updated below and must not
    # leak changes into the caller's objects or into shared default values.
    group_args = dict(group_args) if group_args is not None else {}
    align_args = dict(align_args) if align_args is not None else {}
    out_args = dict(out_args)

    # Define subcommand label dictionary
    cmd_dict = {alignAcross: 'across', alignWithin: 'within', alignBlocks: 'block'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignRecords'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELDS'] = ','.join(seq_fields)
    if 'group_fields' in group_args:
        log['GROUP_FIELDS'] = ','.join(group_args['group_fields'])
    if 'mode' in group_args:
        log['MODE'] = group_args['mode']
    if 'action' in group_args:
        log['ACTION'] = group_args['action']
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Define feeder function and arguments
    if 'group_fields' in group_args and group_args['group_fields'] is not None:
        group_args['group_fields'] = [schema.toReceptor(f) for f in group_args['group_fields']]
    feed_func = feedDbQueue
    feed_args = {'db_file': db_file,
                 'reader': reader,
                 'group_func': group_func,
                 'group_args': group_args}

    # Define worker function and arguments
    field_map = OrderedDict([(schema.toReceptor(f), '%s_align' % f) for f in seq_fields])
    align_args['field_map'] = field_map
    work_func = processDbQueue
    work_args = {'process_func': align_func,
                 'process_args': align_args}

    # Define collector function and arguments
    out_fields = getDbFields(db_file, add=list(field_map.values()), reader=reader)
    out_args['out_type'] = schema.out_type
    collect_func = collectDbQueue
    collect_args = {'db_file': db_file,
                    'label': 'align',
                    'fields': out_fields,
                    'writer': writer,
                    'out_file': out_file,
                    'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'AlignRecords'
    printLog(result['log'])
    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output
def insertGaps(db_file, references=None, format=default_format, out_file=None, out_args=default_out_args):
    """
    Inserts IMGT numbering into V fields

    Arguments:
      db_file : the database file name.
      references : germline repertoire location; passed to readGermlines.
      format : input format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    start_log = OrderedDict([('START', 'ConvertDb'),
                             ('COMMAND', 'imgt'),
                             ('FILE', os.path.basename(db_file))])
    printLog(start_log)

    # Resolve the reader/writer/schema for the requested format
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Input database
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # The input must carry the IMGT-gapped sequence and germline start columns
    required = ['sequence_imgt', 'v_germ_start_imgt']
    try:
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Germline references
    reference_dict = readGermlines(references)

    # Warn when no reference sequence contains an IMGT spacer ('...')
    if not any('...' in germline for germline in reference_dict.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Output handle and writer
    if out_file is None:
        pass_handle = getOutputHandle(db_file, out_label='gap', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=schema.out_type)
    else:
        pass_handle = open(out_file, 'w')
    pass_writer = writer(pass_handle, fields=db_iter.fields)

    # Total number of records, used for the progress bar
    result_count = countDbFile(db_file)

    start_time = time()
    rec_count = 0
    pass_count = 0
    for rec in db_iter:
        # Progress reflects the records completed before this one
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Records for which no corrected fields are returned are counted as failures
        imgt_dict = correctIMGTFields(rec, reference_dict)
        if imgt_dict is None:
            continue

        pass_count += 1
        rec.setDict(imgt_dict, parse=False)
        pass_writer.writeReceptor(rec)

    # Final progress and summary
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    end_log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                           ('RECORDS', rec_count),
                           ('PASS', pass_count),
                           ('FAIL', rec_count - pass_count),
                           ('END', 'ConvertDb')])
    printLog(end_log)

    # Release file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def convertToAIRR(db_file, format=default_format, out_file=None, out_args=default_out_args):
    """
    Converts a Change-O formatted file into an AIRR formatted file

    Arguments:
      db_file : the database file name.
      format : input format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    start_log = OrderedDict([('START', 'ConvertDb'),
                             ('COMMAND', 'airr'),
                             ('FILE', os.path.basename(db_file))])
    printLog(start_log)

    # Resolve the reader and schema for the input format
    try:
        reader, __, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Input database
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Build the output field list. For every length field whose first associated
    # field (entry 0 in ReceptorData.length_fields) is present in the input, the
    # second associated field (entry 1) is emitted ahead of the length field.
    length_map = ReceptorData.length_fields
    receptor_fields = [schema.toReceptor(name) for name in db_iter.fields]
    expanded = []
    for field in receptor_fields:
        if field in length_map and length_map[field][0] in receptor_fields:
            expanded.append(length_map[field][1])
        expanded.append(field)
    # Drop duplicates while keeping first-seen order, then translate to AIRR names
    out_fields = [AIRRSchema.fromReceptor(field) for field in OrderedDict.fromkeys(expanded)]

    # Output handle and writer
    if out_file is None:
        pass_handle = getOutputHandle(db_file, out_label='airr', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=AIRRSchema.out_type)
    else:
        pass_handle = open(out_file, 'w')
    pass_writer = AIRRWriter(pass_handle, fields=out_fields)

    # Total number of records, used for the progress bar
    result_count = countDbFile(db_file)

    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Progress reflects the records completed before this one
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        pass_writer.writeReceptor(rec)

    # Final progress and summary
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    end_log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                           ('RECORDS', rec_count),
                           ('END', 'ConvertDb')])
    printLog(end_log)

    # Release file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def createGermlines(db_file, references, seq_field=default_seq_field, v_field=default_v_field,
                    d_field=default_d_field, j_field=default_j_field,
                    cloned=False, clone_field=default_clone_field, germ_types=default_germ_types,
                    format=default_format, out_file=None, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file

    Arguments:
      db_file : input tab-delimited database file.
      references : folders and/or files containing germline repertoire data in FASTA format.
      seq_field : field in which to look for sequence.
      v_field : field in which to look for V call.
      d_field : field in which to look for D call.
      j_field : field in which to look for J call.
      cloned : if True build germlines by clone, otherwise build individual germlines.
      clone_field : field containing clone identifiers; ignored if cloned=False.
      germ_types : list of germline sequence types to be output from the set of 'full', 'dmask', 'vonly', 'regions'
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : arguments for output preferences. Not modified.

    Returns:
      dict: names of the 'pass' and 'fail' output files.
    """
    # Work on a private copy so setting 'out_type' below does not alter the
    # caller's dictionary or the shared default.
    out_args = dict(out_args)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = ','.join(germ_types)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['D_FIELD'] = d_field
    log['J_FIELD'] = j_field
    log['CLONED'] = cloned
    if cloned:
        log['CLONE_FIELD'] = clone_field
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s' % format)
    out_args['out_type'] = schema.out_type

    # TODO: this won't work for AIRR necessarily
    # Define output germline fields
    germline_fields = OrderedDict()
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:
        germline_fields['full'] = 'germline_' + seq_type
    if 'dmask' in germ_types:
        germline_fields['dmask'] = 'germline_' + seq_type + '_d_mask'
    if 'vonly' in germ_types:
        germline_fields['vonly'] = 'germline_' + seq_type + '_v_region'
    if 'regions' in germ_types:
        germline_fields['regions'] = 'germline_regions'
    if cloned:
        germline_fields['v'] = 'germline_v_call'
        germline_fields['d'] = 'germline_d_call'
        germline_fields['j'] = 'germline_j_call'
    out_fields = getDbFields(db_file,
                             add=[schema.fromReceptor(f) for f in germline_fields.values()],
                             reader=reader)

    # Get repertoire and open Db reader
    reference_dict = readGermlines(references)
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['v_germ_start_imgt', 'd_germ_start', 'j_germ_start',
                    'np1_length', 'np2_length']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Check for IMGT-gaps in germlines
    if all('...' not in x for x in reference_dict.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Count input
    total_count = countDbFile(db_file)

    # Check for existence of fields
    for f in [v_field, d_field, j_field, seq_field]:
        if f not in db_iter.fields:
            printError('%s field does not exist in input database file.' % f)

    # Translate to Receptor attribute names
    v_field = schema.toReceptor(v_field)
    d_field = schema.toReceptor(d_field)
    j_field = schema.toReceptor(j_field)
    seq_field = schema.toReceptor(seq_field)
    clone_field = schema.toReceptor(clone_field)

    # Define Receptor iterator
    if cloned:
        start_time = time()
        printMessage('Sorting by clone', start_time=start_time, width=20)
        # groupby only merges adjacent items, so records are sorted by clone first
        sorted_records = sorted(db_iter, key=lambda x: x.getField(clone_field))
        printMessage('Done', start_time=start_time, end=True, width=20)
        receptor_iter = groupby(sorted_records, lambda x: x.getField(clone_field))
    else:
        receptor_iter = ((x.sequence_id, [x]) for x in db_iter)

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Initialize handles, writers and counters; output files are created lazily
    # so that no empty pass/fail file is produced when nothing is written to it
    pass_handle, pass_writer = None, None
    fail_handle, fail_writer = None, None
    rec_count, pass_count, fail_count = 0, 0, 0
    start_time = time()

    # Iterate over rows
    for key, records in receptor_iter:
        # Print progress
        printProgress(rec_count, total_count, 0.05, start_time=start_time)

        # Define iteration variables
        records = list(records)
        rec_log = OrderedDict([('ID', key)])
        rec_count += len(records)

        # Build germline for records
        if len(records) == 1:
            germ_log, germlines, genes = buildGermline(records[0], reference_dict,
                                                       seq_field=seq_field, v_field=v_field,
                                                       d_field=d_field, j_field=j_field)
        else:
            germ_log, germlines, genes = buildClonalGermline(records, reference_dict,
                                                             seq_field=seq_field, v_field=v_field,
                                                             d_field=d_field, j_field=j_field)
        rec_log.update(germ_log)

        # Write row to pass or fail file
        if germlines is not None:
            pass_count += len(records)

            # Add germlines to Receptor record
            annotations = {germline_fields[t]: germlines[t]
                           for t in ('full', 'dmask', 'vonly', 'regions')
                           if t in germ_types}
            if cloned:
                for g in ('v', 'd', 'j'):
                    annotations[germline_fields[g]] = genes[g]

            # Create output file handle and writer on first use
            if pass_writer is None:
                if out_file is not None:
                    pass_handle = open(out_file, 'w')
                else:
                    pass_handle = getOutputHandle(db_file,
                                                  out_label='germ-pass',
                                                  out_dir=out_args['out_dir'],
                                                  out_name=out_args['out_name'],
                                                  out_type=out_args['out_type'])
                pass_writer = writer(pass_handle, fields=out_fields)

            # Write records
            for r in records:
                r.setDict(annotations)
                pass_writer.writeReceptor(r)
        else:
            fail_count += len(records)
            if out_args['failed']:
                # Create output file handle and writer on first use
                if fail_writer is None:
                    fail_handle = getOutputHandle(db_file,
                                                  out_label='germ-fail',
                                                  out_dir=out_args['out_dir'],
                                                  out_name=out_args['out_name'],
                                                  out_type=out_args['out_type'])
                    fail_writer = writer(fail_handle, fields=out_fields)

                # Write one record per call, as the pass branch does; records is
                # a list and writeReceptor is used with single records elsewhere
                for r in records:
                    fail_writer.writeReceptor(r)

        # Write log
        printLog(rec_log, handle=log_handle)

    # Print log
    printProgress(rec_count, total_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    db_handle.close()
    output = {'pass': None, 'fail': None}
    if pass_handle is not None:
        output['pass'] = pass_handle.name
        pass_handle.close()
    if fail_handle is not None:
        output['fail'] = fail_handle.name
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()

    return output
def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
              extended=False, format=default_format, out_file=None, out_args=default_out_args):
    """
    Main for iHMMuneAlign aligned sample sequences.

    Arguments:
      aligner_file : iHMMune-Align output file to process.
      seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
      repo : folder with germline repertoire files.
      cellranger_file : supplementary annotation file read with readCellRanger;
                        no annotations are loaded if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id : sequence ID handling flag forwarded to writeDb.
      extended : if True parse alignment scores, FWR and CDR region fields.
      format : output format. One of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                 Its 'out_type' entry is set from the output schema.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Report run parameters
    log = OrderedDict([('START', 'MakeDB'),
                       ('COMMAND', 'ihmm'),
                       ('ALIGNER_FILE', os.path.basename(aligner_file)),
                       ('SEQ_FILE', os.path.basename(seq_file)),
                       ('ASIS_ID', asis_id),
                       ('PARTIAL', partial),
                       ('EXTENDED', extended)])
    printLog(log)

    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Record count, input sequences and germline references
    total_count = countSeqFile(seq_file)
    seq_dict = getSeqDict(seq_file)
    references = readGermlines(repo)

    # Optional supplementary annotation table
    annotations = None
    if cellranger_file is not None:
        cr_fields = cellranger_extended if extended else cellranger_base
        annotations = readCellRanger(cellranger_file, fields=cr_fields)

    printMessage('Done', start_time=start_time, end=True, width=20)

    # Warn when no reference sequence contains an IMGT spacer ('...')
    if not any('...' in germline for germline in references.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Resolve the writer and schema for the output format
    try:
        __, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)
    out_args['out_type'] = schema.out_type

    # Output columns: the schema's required fields plus optional extended fields
    fields = list(schema.required)
    if extended:
        fields.extend(IHMMuneReader.customFields(scores=True, regions=True, schema=schema))

    # Parse the aligner output lazily, attach germlines and write the database
    with open(aligner_file, 'r') as aligner_handle:
        parse_iter = IHMMuneReader(aligner_handle, seq_dict, references)
        germ_iter = (addGermline(rec, references) for rec in parse_iter)
        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file,
                         total_count=total_count, annotations=annotations,
                         asis_id=asis_id, partial=partial,
                         writer=writer, out_file=out_file, out_args=out_args)

    return output
def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
                 asis_id=True, asis_calls=False, extended=False, regions='default',
                 format='changeo', out_file=None, out_args=default_out_args):
    """
    Main for IgBLAST aligned sample sequences.

    Arguments:
      aligner_file (str): IgBLAST output file to process.
      seq_file (str): fasta file input to IgBlast (from which to get sequence).
      repo (str): folder with germline repertoire files.
      amino_acid (bool): if True then the IgBLAST output files are results from igblastp.
                         igblastn is assumed if False.
      cellranger_file (str): supplementary annotation file read with readCellRanger;
                             no annotations are loaded if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id (bool): sequence ID handling flag forwarded to writeDb.
      asis_calls (bool): if True do not parse gene calls for allele names.
      extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
      regions (str): name of the IMGT FWR/CDR region definitions to use.
      format (str): output format. one of 'changeo' or 'airr'.
      out_file (str): output file name. Automatically generated from the input file if None.
      out_args (dict): common output argument dictionary from parseCommonArgs.
                       Its 'out_type' entry is set from the output schema.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Report run parameters
    log = OrderedDict([('START', 'MakeDB'),
                       ('COMMAND', 'igblast-aa' if amino_acid else 'igblast'),
                       ('ALIGNER_FILE', os.path.basename(aligner_file)),
                       ('SEQ_FILE', os.path.basename(seq_file)),
                       ('ASIS_ID', asis_id),
                       ('ASIS_CALLS', asis_calls),
                       ('PARTIAL', partial),
                       ('EXTENDED', extended)])
    printLog(log)

    # Amino acid input selects the '-aa' format variant and the AA reader class
    parser = IgBLASTReaderAA if amino_acid else IgBLASTReader
    if amino_acid:
        format = '%s-aa' % format

    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Record count, input sequences and germline references
    total_count = countSeqFile(seq_file)
    seq_dict = getSeqDict(seq_file)
    references = readGermlines(repo, asis=asis_calls)

    # Optional supplementary annotation table
    annotations = None
    if cellranger_file is not None:
        cr_fields = cellranger_extended if extended else cellranger_base
        annotations = readCellRanger(cellranger_file, fields=cr_fields)

    printMessage('Done', start_time=start_time, end=True, width=20)

    # Warn when no reference sequence contains an IMGT spacer ('...')
    if not any('...' in germline for germline in references.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Resolve the writer and schema for the output format
    try:
        __, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)
    out_args['out_type'] = schema.out_type

    # Output columns: the schema's required fields plus optional extended fields
    fields = list(schema.required)
    if extended:
        fields.extend(parser.customFields(schema=schema))

    # Parse the aligner output lazily, attach germlines and write the database
    with open(aligner_file, 'r') as aligner_handle:
        parse_iter = parser(aligner_handle, seq_dict, references,
                            regions=regions, asis_calls=asis_calls)
        germ_iter = (addGermline(rec, references, amino_acid=amino_acid) for rec in parse_iter)
        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file,
                         total_count=total_count, annotations=annotations,
                         amino_acid=amino_acid, partial=partial, asis_id=asis_id,
                         regions=regions, writer=writer, out_file=out_file, out_args=out_args)

    return output
def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
              extended=False, format=default_format, out_file=None, out_args=default_out_args):
    """
    Main for IMGT aligned sample sequences.

    Arguments:
      aligner_file : zipped file or unzipped folder output by IMGT.
      seq_file : FASTA file input to IMGT (from which to get seqID).
      repo : folder with germline repertoire files. If None, germlines are not added.
      cellranger_file : supplementary annotation file read with readCellRanger;
                        no annotations are loaded if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id : sequence ID handling flag forwarded to writeDb.
      extended : if True add alignment score, FWR, CDR and junction fields to output file.
      format : output format. one of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs. Not modified.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Work on a private copy so setting 'out_type' below does not alter the
    # caller's dictionary or the shared default.
    out_args = dict(out_args)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['COMMAND'] = 'imgt'
    log['ALIGNER_FILE'] = aligner_file
    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
    log['ASIS_ID'] = asis_id
    log['PARTIAL'] = partial
    log['EXTENDED'] = extended
    printLog(log)

    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Extract IMGT files
    temp_dir, imgt_files = extractIMGT(aligner_file)

    # Everything below uses the extracted files; the temp directory must be
    # removed even when parsing or writing fails.
    try:
        # Count records in IMGT files
        total_count = countDbFile(imgt_files['summary'])

        # Get (parsed) IDs from fasta file submitted to IMGT
        id_dict = getIDforIMGT(seq_file) if seq_file else {}

        # Load supplementary annotation table
        if cellranger_file is not None:
            f = cellranger_extended if extended else cellranger_base
            annotations = readCellRanger(cellranger_file, fields=f)
        else:
            annotations = None

        printMessage('Done', start_time=start_time, end=True, width=20)

        # Define format operators
        try:
            __, writer, schema = getFormatOperators(format)
        except ValueError:
            printError('Invalid format %s.' % format)
        out_args['out_type'] = schema.out_type

        # Define output fields
        fields = list(schema.required)
        if extended:
            custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema)
            fields.extend(custom)

        # Parse IMGT output and write db
        with open(imgt_files['summary'], 'r') as summary_handle, \
                open(imgt_files['gapped'], 'r') as gapped_handle, \
                open(imgt_files['ntseq'], 'r') as ntseq_handle, \
                open(imgt_files['junction'], 'r') as junction_handle:

            # Open parser
            parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle)

            # Add germline sequence
            if repo is None:
                germ_iter = parse_iter
            else:
                references = readGermlines(repo)
                # Check for IMGT-gaps in germlines
                if all('...' not in x for x in references.values()):
                    printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
                germ_iter = (addGermline(x, references) for x in parse_iter)

            # Write db
            output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file,
                             total_count=total_count, annotations=annotations,
                             id_dict=id_dict, asis_id=asis_id, partial=partial,
                             writer=writer, out_file=out_file, out_args=out_args)
    finally:
        # Cleanup temp directory
        temp_dir.cleanup()

    return output
def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field,
                 j_field=default_j_field, max_missing=default_max_missing,
                 group_fields=None, group_func=groupByGene, group_args=None,
                 clone_func=distanceClones, clone_args=None,
                 format=default_format, out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
      db_file : filename of input database.
      seq_field : sequence field used to determine clones.
      v_field : field containing the V call.
      j_field : field containing the J call.
      max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
      group_fields : additional annotation fields to use to group preclones;
                     if None use only V and J.
      group_func : the function to use for assigning preclones.
      group_args : a dictionary of arguments to pass to group_func. Not modified; if None
                   an empty dictionary is used.
      clone_func : the function to use for determining clones within preclonal groups.
      clone_args : a dictionary of arguments to pass to clone_func. Not modified; if None
                   an empty dictionary is used.
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs. Not modified.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.

    Returns:
      dict: dictionary of output pass and fail files.
    """
    # Work on private copies: the dictionaries are updated below and must not
    # leak changes into the caller's objects or into shared default values.
    group_args = dict(group_args) if group_args is not None else {}
    clone_args = dict(clone_args) if clone_args is not None else {}
    out_args = dict(out_args)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['J_FIELD'] = j_field
    log['MAX_MISSING'] = max_missing
    log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
    for k in sorted(group_args):
        log[k.upper()] = group_args[k]
    for k in sorted(clone_args):
        # The distance matrix is excluded from the printed parameters
        if k != 'dist_mat':
            log[k.upper()] = clone_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Translate to Receptor attribute names
    seq_field = schema.toReceptor(seq_field)
    v_field = schema.toReceptor(v_field)
    j_field = schema.toReceptor(j_field)
    if group_fields is not None:
        group_fields = [schema.toReceptor(f) for f in group_fields]

    # Define feeder function and arguments
    group_args['group_fields'] = group_fields
    group_args['v_field'] = v_field
    group_args['j_field'] = j_field
    feed_args = {'db_file': db_file,
                 'reader': reader,
                 'group_func': group_func,
                 'group_args': group_args}

    # Define worker function and arguments
    filter_args = {'seq_field': seq_field,
                   'v_field': v_field,
                   'j_field': j_field,
                   'max_missing': max_missing}
    clone_args['seq_field'] = seq_field
    work_args = {'process_func': clone_func,
                 'process_args': clone_args,
                 'filter_func': filterMissing,
                 'filter_args': filter_args}

    # Define collector function and arguments
    out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader)
    out_args['out_type'] = schema.out_type
    collect_args = {'db_file': db_file,
                    'fields': out_fields,
                    'writer': writer,
                    'out_file': out_file,
                    'out_args': out_args}

    # Check for required columns
    try:
        required = ['junction']
        checkFields(required, out_fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Call process manager
    result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue,
                             collect_func=collectQueue,
                             feed_args=feed_args, work_args=work_args,
                             collect_args=collect_args,
                             nproc=nproc, queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])
    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output