def __init__(self, sequence, reference=HXB2):
    """Create an Env from a query sequence and a reference sequence.

    Args:
        sequence: Any input accepted by ``Sequence``; stored wrapped as
            ``self._sequence``.
        reference: Any input accepted by ``Sequence``; stored wrapped as
            ``self._reference``. Defaults to ``HXB2``.
    """
    super(Env, self).__init__()

    # Wrap both inputs so the rest of the class deals only in Sequence objects.
    self._sequence = Sequence(sequence)
    self._reference = Sequence(reference)

    # Everything else starts out unset.
    self._aligned = self._aligned_reference = None
    self._id = self._reference_name = self._clade = None
def heavy(self):
    """Return the heavy chain, resolving and caching it on first use.

    If ``self._heavy`` is already set it is returned unchanged. Otherwise a
    chain is picked from ``self._heavies`` -- via ``self._select_heavy`` when
    that callable is provided, else the first entry -- wrapped in
    ``Sequence`` and cached on ``self._heavy``.

    Returns:
        The cached ``Sequence``, or ``None`` when ``self._heavies`` is empty.
    """
    if self._heavy is not None:
        return self._heavy

    if len(self._heavies) > 0:
        chooser = self._select_heavy
        picked = self._heavies[0] if chooser is None else chooser(self._heavies)
        self._heavy = Sequence(picked)
    else:
        self._heavy = None
    return self._heavy
def get_jquery_sequence(seq, vbr):
    """Return the part of ``seq`` that remains for the J-gene search.

    Args:
        seq: Sliceable sequence object with an ``id`` attribute.
        vbr: V-gene BLAST record; only ``alignments[0].hsps[0]`` is read.

    Returns:
        A new ``Sequence`` carrying ``seq.id`` and the selected slice.
    """
    top_hsp = vbr.alignments[0].hsps[0]

    # A subject start past the subject end means the raw input was
    # reverse-complemented.
    raw_input_was_flipped = top_hsp.sbjct_start > top_hsp.sbjct_end

    if raw_input_was_flipped:
        # The BLASTn coordinates refer to the raw input, which has since been
        # reverse-complemented. What sat 5' of the alignment in the raw input
        # is therefore the 3' end of the correctly oriented sequence.
        remainder = seq[-top_hsp.query_start:]
    else:
        remainder = seq[top_hsp.query_end:]
    return Sequence(remainder, id=seq.id)
def light(self):
    """Return the light chain, resolving and caching it on first use.

    If ``self._light`` is already set it is returned unchanged. Otherwise a
    chain is picked from ``self._lights`` -- via ``self._select_light`` when
    that callable is provided, else the first entry -- wrapped in
    ``Sequence`` and cached on ``self._light``.

    Returns:
        The cached ``Sequence``, or ``None`` when ``self._lights`` is empty.
    """
    if self._light is not None:
        return self._light

    if len(self._lights) > 0:
        chooser = self._select_light
        picked = self._lights[0] if chooser is None else chooser(self._lights)
        self._light = Sequence(picked)
    else:
        self._light = None
    return self._light
def main(args):
    """Run the annotation pipeline on in-memory sequences or on input files.

    Args:
        args: Options object. ``args.sequences`` selects the mode; the file
            branch also reads ``debug``, ``use_test_data``, ``basespace``,
            ``merge``, ``isotype`` and ``species``.

    Returns:
        When ``args.sequences`` is not None: a single ``Sequence`` if exactly
        one sequence was processed, otherwise a list of ``Sequence`` objects.
        Otherwise: the list of output files returned by ``concat_outputs``
        for every processed input file.
    """
    # In-memory mode: no directories, logging or temp files are involved.
    if args.sequences is not None:
        processed = process_sequences(args.sequences, args)
        if len(processed) == 1:
            return Sequence(dict(processed[0]))
        return [Sequence(dict(p)) for p in processed]

    # File mode.
    input_dir, output_dir, temp_dir, log_dir = make_directories(args)
    setup_logging(log_dir, args.debug)
    log_options(input_dir, output_dir, temp_dir, args)

    if args.use_test_data:
        mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        input_files = [os.path.join(mod_dir, 'test_data/test_1k.fasta'), ]
    else:
        if args.basespace:
            # BaseSpace downloads are paired reads, so merging is forced on.
            args.merge = True
            download_files(input_dir)
        if args.merge:
            input_dir = merge_reads(input_dir, args)
        if args.isotype:
            args.isotype = args.species
        # Empty files are skipped up front.
        input_files = [f for f in list_files(input_dir, log=True)
                       if os.stat(f).st_size > 0]

    output_files = []
    for f, fmt in zip(input_files, format_check(input_files)):
        # skip the non-FASTA/Q files
        if fmt is None:
            continue
        start_time = time.time()
        print_input_file_info(f, fmt)
        subfiles, seq_count = split_file(f, fmt, temp_dir, args)
        run_info = run_jobs(subfiles, temp_dir, log_dir, fmt, args)

        # Jobs may report None; drop those once instead of per column.
        completed = [r for r in run_info if r is not None]
        temp_output_files = [r[0] for r in completed]
        processed_seq_counts = [r[1] for r in completed]
        annotated_log_files = [r[2] for r in completed]
        failed_log_files = [r[3] for r in completed]
        unassigned_log_files = [r[4] for r in completed]
        vdj_end_time = time.time()

        _output_files = concat_outputs(f, temp_output_files, output_dir, args)
        concat_logs(f, unassigned_log_files, log_dir, 'unassigned')
        concat_logs(f, failed_log_files, log_dir, 'failed')
        if args.debug:
            concat_logs(f, annotated_log_files, log_dir, 'annotated')

        # Collect this file's outputs. Extending the list with itself (as the
        # code previously did) left the returned list permanently empty.
        output_files.extend(_output_files)

        if not args.debug:
            # Temp files are kept in debug mode for inspection.
            flat_temp_files = [tmp for sublist in temp_output_files for tmp in sublist]
            clear_temp_files(subfiles + flat_temp_files + annotated_log_files +
                             failed_log_files + unassigned_log_files)
        print_job_stats(seq_count, processed_seq_counts, start_time, vdj_end_time)
    return output_files
def orient_query(vdj, vbr):
    """Replace ``vdj.oriented`` with the reverse complement when needed.

    BLASTn always reverse complements the Subject sequence, never the query.
    So a Subject start coordinate greater than its end coordinate shows that
    the input sequence is itself the reverse complement.

    Args:
        vdj: Object exposing ``sequence`` and a writable ``oriented``.
        vbr: V-gene BLAST record; only ``alignments[0].hsps[0]`` is read.
    """
    best_hsp = vbr.alignments[0].hsps[0]
    if best_hsp.sbjct_start <= best_hsp.sbjct_end:
        # Input already in the forward orientation; leave vdj untouched.
        return
    vdj.oriented = Sequence(vdj.sequence.reverse_complement,
                            id=vdj.sequence.id)
def assign_dgene(self, seq, species):
    """Pick the best-scoring germline D-gene for ``seq`` by local alignment.

    Germlines are read from ``ungapped/d.fasta`` under
    ``self.germline_directory`` and searched in both orientations.

    Args:
        seq: Query passed to ``local_alignment``.
        species: Forwarded to every ``GermlineSegment`` that is built.

    Returns:
        A ``GermlineSegment`` for the top hit, with up to five runners-up in
        ``others``, or ``None`` when no alignments are produced.
    """
    d_fasta = os.path.join(self.germline_directory, 'ungapped/d.fasta')
    with open(d_fasta, 'r') as fasta_handle:
        forward = [Sequence(rec) for rec in SeqIO.parse(fasta_handle, 'fasta')]

    # Search both strands: forward germlines followed by their reverse
    # complements (same IDs).
    targets = forward + [Sequence(g.reverse_complement, id=g.id) for g in forward]

    hits = sorted(
        local_alignment(seq, targets=targets, gap_open=-20, gap_extend=-2),
        key=lambda hit: hit.score,
        reverse=True,
    )
    ranked = [(hit.target.id, hit.score) for hit in hits]
    if not ranked:
        return None

    best_id, best_score = ranked[0]
    runners_up = [GermlineSegment(gl_id, species, score=gl_score)
                  for gl_id, gl_score in ranked[1:6]]
    return GermlineSegment(best_id, species, score=best_score,
                           others=runners_up, assigner_name=self.name)
def get_isotype(antibody):
    """Build an ``Isotype`` for ``antibody`` from the species' isotype FASTA.

    The isotype reference sequences are read from ``isotypes/isotypes.fasta``
    inside the germline database directory for ``antibody.species``.

    Args:
        antibody: Object exposing ``species`` and an ``exception(label, tb)``
            method used for error reporting.

    Returns:
        The ``Isotype`` instance, or ``None`` if anything failed. Failures
        are reported through ``antibody.exception`` rather than raised.
    """
    try:
        germ_dir = get_germline_database_directory(antibody.species)
        isotype_file = os.path.join(germ_dir, 'isotypes/isotypes.fasta')
        # Context manager so the handle is closed even if parsing fails.
        with open(isotype_file, 'r') as isotype_handle:
            isotype_seqs = [Sequence(s) for s in SeqIO.parse(isotype_handle, 'fasta')]
        return Isotype(antibody, isotype_seqs)
    except Exception:
        # Best-effort: record the traceback on the antibody and carry on.
        # KeyboardInterrupt/SystemExit are deliberately left to propagate.
        antibody.exception('ISOTYPING ERROR', traceback.format_exc())
        return None
def __init__(self, sequence, v=None, d=None, j=None):
    """Create a VDJ record around an input sequence.

    Args:
        sequence: Any input accepted by ``Sequence``.
        v: Optional V-gene assignment.
        d: Optional D-gene assignment.
        j: Optional J-gene assignment.
    """
    super(VDJ, self).__init__()
    LoggingMixin.__init__(self)

    wrapped = Sequence(sequence)
    self.sequence = wrapped
    self.id = wrapped.id
    # Until proven otherwise the input is taken to be correctly oriented.
    self.oriented = wrapped

    self.v, self.d, self.j = v, d, j
    self.initialize_log()
def test_isotype_bcr():
    """Isotyping the bundled test antibody must yield IgG1."""
    fasta_path = os.path.abspath('abstar/test_data/test_isotype.fasta')
    with open(fasta_path, 'r') as handle:
        records = [Sequence(rec) for rec in SeqIO.parse(handle, 'fasta')]

    def sequence_for(record_id):
        # First record with a matching ID; IndexError if there is none.
        return [rec for rec in records if rec.id == record_id][0].sequence

    vdj_nt = sequence_for('vdj_nt')
    oriented_input = sequence_for('oriented_input')

    antibody = MockAntibody()
    antibody.species = 'human'
    antibody.vdj_nt = vdj_nt
    antibody.oriented_input = oriented_input

    result = get_isotype(antibody)
    assert result.isotype == 'IgG1'
def run_partis(self, sequence_file, file_format, locus):
    """Run ``partis run-viterbi`` on a sequence file and parse its output.

    Args:
        sequence_file: Path to the input sequences; also passed to partis
            via ``--infname``.
        file_format: Format name handed to ``SeqIO.parse``.
        locus: ``'ig'`` is mapped to the partis locus ``'igh'``; any other
            value is mapped to ``'tra'``.

    Returns:
        Whatever ``self.parse_partis_output`` returns for the partis output
        file and the ID-to-sequence lookup.
    """
    with open(sequence_file, 'r') as sequence_handle:
        seqs = [Sequence(s) for s in SeqIO.parse(sequence_handle, file_format)]
    # Lookup keyed by sequence ID with ':' replaced by 'c'.
    # NOTE(review): presumably mirrors how partis rewrites IDs -- confirm.
    seq_dict = {s.id.replace(':', 'c'): s for s in seqs}

    germline_dir = os.path.join(self.germline_directory, 'partis/')
    locus = 'igh' if locus == 'ig' else 'tra'

    # Only the path is needed, so close our handle right away and let partis
    # write to the file by name.
    partis_out = NamedTemporaryFile(delete=False)
    partis_out.close()
    try:
        partis_cmd = ['partis', 'run-viterbi',
                      '--infname', sequence_file,
                      '--outfname', partis_out.name,
                      '--locus', locus,
                      '--initial-germline-dir', germline_dir]
        p = sp.Popen(partis_cmd, stdout=sp.PIPE, stderr=sp.PIPE)
        # Drain both pipes and wait for exit; the captured text is not used.
        p.communicate()
        return self.parse_partis_output(partis_out.name, seq_dict)
    finally:
        # Remove the temp file even if launching partis or parsing failed.
        os.unlink(partis_out.name)
def get_env_sequences(clade=None):
    """Load the bundled Env sequences for one or more clades.

    Sequences are read from ``<CLADE>.fasta`` files in the ``env_sequences``
    directory next to this module, where ``<CLADE>`` is the upper-cased
    clade name.

    Args:
        clade: A single clade name (string), an iterable of clade names, or
            ``None`` to load every clade in ``CLADES``.

    Returns:
        A list of ``Sequence`` objects built from ``[id, SEQUENCE]`` pairs,
        with the sequence upper-cased, in clade order.
    """
    if clade is None:
        clade = CLADES
    elif isinstance(clade, str):
        # A bare string is one clade, not an iterable of one-letter clades.
        clade = [clade]

    seq_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'env_sequences')
    envs = []
    for c in clade:
        seq_file = os.path.join(seq_dir, '{}.fasta'.format(c.upper()))
        # Context manager so each FASTA handle is closed after parsing.
        with open(seq_file, 'r') as seq_handle:
            envs.extend(Sequence([s.id, str(s.seq).upper()])
                        for s in SeqIO.parse(seq_handle, 'fasta'))
    return envs
def __init__(self, sequence, species=None):
    """Store a germline record in raw, gapped and ungapped forms.

    Args:
        sequence: Record exposing ``seq`` and ``description``; the
            description becomes the ID of ``self.raw_sequence``.
        species: Optional species, stored as ``self._species``.
    """
    self.raw_sequence = Sequence(str(sequence.seq), id=sequence.description)
    self._species = species

    gapped = self.raw_sequence.sequence
    self.gapped_nt_sequence = gapped
    # '.' characters are stripped to obtain the ungapped sequence.
    self.ungapped_nt_sequence = gapped.replace('.', '')
def get_dquery_sequence(seq, jbr):
    """Return the part of ``seq`` that precedes the top J-gene alignment.

    Args:
        seq: Sliceable sequence object with an ``id`` attribute.
        jbr: J-gene BLAST record; only ``alignments[0].hsps[0]`` is read.

    Returns:
        A new ``Sequence`` carrying ``seq.id`` and the first
        ``query_start - 1`` positions of ``seq``.
    """
    top_hsp = jbr.alignments[0].hsps[0]
    upstream_length = top_hsp.query_start - 1
    return Sequence(seq[:upstream_length], id=seq.id)
def run(*args, **kwargs):
    '''
    Runs AbStar.

    Input sequences can be provided in several different formats:

        1) individual sequences as positional arguments:
           ``run(seq1, seq2, temp=temp, output=output)``
        2) a list of sequences, as an argument:
           ``run([seq1, seq2], temp=temp, output=output)``
        3) a single FASTA/Q-formatted input file, passed via ``input``
        4) a directory of FASTA/Q-formatted files, passed via ``input``

    When passing sequences (not FASTA/Q files), the sequences can be in any
    format recognized by ``abtools.sequence.Sequence``, including:

        - a raw nucleotide sequence, as a string (a random sequence ID will
          be assigned)
        - a list/tuple of the format ``[sequence_id, sequence]``
        - a BioPython SeqRecord object
        - an AbTools Sequence object

    Either sequences, ``project_dir``, or all of ``input``, ``output`` and
    ``temp`` are required.

    Examples:

        If processing a single sequence, you can pass the raw sequence, as a
        string::

            import abstar

            result = abstar.run('ATGC')

        or a list/tuple of the format ``[sequence_id, sequence]``::

            result = abstar.run(['seq1', 'ATGC'])

        If you pass just the raw sequence, a random sequence ID will be
        generated with ``uuid.uuid4()``. In either case, when given a single
        sequence, ``abstar.run()`` will return a single AbTools ``Sequence``
        object. If running multiple sequences, you can either pass each
        sequence as a positional argument::

            result_list = run(['seq1', 'ATGC'], ['seq2', 'CGTA'])

        or you can pass a list of sequences as the first argument, in this
        case using sequences parsed from a FASTA file using Biopython::

            from Bio import SeqIO

            fasta = open('my_sequences.fasta', 'r')
            seqs = [s for s in SeqIO.parse(fasta, 'fasta')]
            result_list = abstar.run(seqs)

        When given multiple sequences, ``abstar.run()`` will return a list of
        AbTools ``Sequence`` objects, one per input sequence.

        If you'd prefer not to parse the FASTQ/A file into a list (for
        example, if the input file is extremely large), you can pass the
        input file path directly, along with a temp directory and output
        directory::

            result_files = abstar.run(input='/path/to/my_sequences.fasta',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        Given a file path, ``abstar.run()`` returns a list of output file
        paths. In the above case, ``result_files`` will be a list containing
        a single output file path:
        ``/path/to/output/my_sequences.json``.

        If you have a directory containing multiple FASTQ/A files, you can
        pass the directory path using ``input``::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        As before, ``result_files`` will contain a list of output file paths.

        If your input directory contains paired FASTQ files (gzip compressed
        or uncompressed) that need to be merged prior to processing with
        AbStar::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      merge=True)

        The paired read files in ``input`` will be merged with PANDAseq prior
        to processing with AbStar. By default, PANDAseq's 'simple bayesian'
        read merging algorithm is used, although alternate algorithms can be
        selected with ``pandaseq_algo``.

        AbStar also provides an alternate CSV-formatted output type that
        mimics the `IMGT Summary file`_. This option is provided to minimize
        the effort needed to convert existing IMGT-based pipelines to AbStar.
        Alternate output is only available when passing an input file or
        directory; passing individual sequences or a list of sequences will
        always return Sequence objects. To produce IMGT-formatted output::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      output_type='imgt')

        .. _IMGT Summary file: http://www.imgt.org/IMGT_vquest/share/textes/imgtvquest.html#Esummary

    Args:

        project_dir (str): Path to the project directory. Most useful when
            directly downloading files from BaseSpace, and all subdirectories
            will be created by AbStar.

        input (str): Path to input directory, containing FASTA/Q files. If
            performing read merging with PANDAseq, paired FASTQ files may be
            gzip compressed.

        output (str): Path to output directory.

        temp (str): Path to temp directory, where intermediate job files will
            be stored.

        log (str): Path to log file. If not provided and ``project_dir`` is
            provided, the log will be written to
            ``/path/to/project_dir/abstar.log``. If output is provided, log
            will be written to ``/path/to/output/abstar.log``.

        species (str): Species of the antibody sequences. Choices are
            'human', 'macaque', 'mouse' and 'rabbit'. Default is 'human'.

        isotype (bool): If True, the isotype will be inferred by aligning the
            sequence region downstream of the J-gene. If False, the isotype
            will not be determined. Default is True.

        uid (int): Length (in nucleotides) of the Unique Molecular ID used to
            barcode input RNA. A positive integer results in the UMID being
            parsed from the start of the read (or merged read), a negative
            integer results in parsing from the end of the read. Default is
            0, which results in no UMID parsing.

        gzip (bool): If True, compresses output files with gzip. Default is
            False.

        pretty (bool): If True, formats JSON output files to be more
            human-readable. If False, JSON output files contain one record
            per line. Default is False.

        output_type (str): Options are 'json' or 'imgt'. IMGT output mimics
            the Summary table produced by IMGT High-V/Quest, to maintain a
            level of compatibility with existing IMGT-based pipelines. JSON
            output is much more detailed. Default is 'json'.

        merge (bool): If True, input must be paired-read FASTQ files (gzip
            compressed or uncompressed) which will be merged with PANDAseq
            prior to processing with AbStar. If ``basespace`` is True,
            ``merge`` is automatically set to True. Default is False.

        pandaseq_algo (str): Define merging algorithm to be used by PANDAseq.
            Options are 'simple_bayesian', 'ea_util', 'flash', 'pear',
            'rdp_mle', 'stitch', or 'uparse'. Default is 'simple_bayesian',
            which is the default PANDAseq algorithm.

        debug (bool): If ``True``, ``abstar.run()`` runs in single-threaded
            mode, the log is much more verbose, and temporary files are not
            removed. Default is ``False``.

    Returns:

        If the input is a single sequence, ``run`` returns a single AbTools
        ``Sequence`` object. If the input is a list of sequences, ``run``
        returns a list of AbTools ``Sequence`` objects. If the input is a
        file or a directory of files, ``run`` returns a list of output files.
    '''
    global logger
    warnings.filterwarnings("ignore")

    # Stays None when no positional sequences are given (file/directory
    # input), so that ``input=``/``project_dir=`` calls do not trip over an
    # unbound local.
    sequences = None
    if args:
        try:
            if len(args) == 1:
                # a single arg is either a single sequence...
                try:
                    sequences = [Sequence(args[0]), ]
                except Exception:
                    # ...or a list of sequences
                    sequences = [Sequence(s) for s in args[0]]
            else:
                # if multiple args, assume each is a sequence
                sequences = [Sequence(s) for s in args]
        except Exception:
            print('ERROR: invalid format for sequence input:')
            for a in args:
                print(a)
            sys.exit(1)

    # Only override 'sequences' when positional input was actually supplied.
    if sequences is not None:
        kwargs['sequences'] = sequences

    args = Args(**kwargs)
    validate_args(args)
    logger = log.get_logger('abstar')
    return main(args)
def test_api_integration_bcr_hiv_bnab_lcs():
    """Running the bundled bnAb light chains through the API yields 207 results."""
    fasta_path = os.path.abspath('abstar/test_data/test_hiv_bnab_lcs.fasta')
    with open(fasta_path, 'r') as handle:
        inputs = [Sequence(record) for record in SeqIO.parse(handle, 'fasta')]

    # Each parsed sequence is passed as its own positional argument.
    results = abstar.run(*inputs)
    assert len(results) == 207
def __call__(self, sequence_file, file_format):
    """Assign V, J and (for heavy chains) D germline genes to a sequence file.

    Results are stored on the instance rather than returned:
    ``self.assigned`` receives the VDJ objects that made it through every
    stage, and ``self.unassigned`` collects those that failed along the way.
    If BLAST finds no V-gene hits at all, every sequence goes to
    ``self.unassigned`` and ``self.assigned`` is left untouched.

    Args:
        sequence_file: Path to the input sequences. When ``file_format`` is
            ``'fastq'`` the file is overwritten in place with FASTA.
        file_format: Format name handed to ``SeqIO.parse``.
    """
    with open(sequence_file, 'r') as sequence_handle:
        seqs = [Sequence(s) for s in SeqIO.parse(sequence_handle, file_format)]
    vdjs = []

    # if the input file is FASTQ-formatted, need to convert it to FASTA for BLASTn to work
    if file_format == 'fastq':
        with open(sequence_file, 'w') as handle:
            handle.write('\n'.join(s.fasta for s in seqs))

    # assign V-genes
    vblast_records = self.blast(sequence_file, self.species, 'V')

    # if there aren't any vblast_records, that means that none of the
    # sequences in the input file contained sequences with a significant
    # match to any germline V-gene. These are likely all non-antibody sequences.
    if not vblast_records:
        vdjs = [VDJ(seq) for seq in seqs]
        for vdj in vdjs:
            vdj.log('V-GENE ASSIGNMENT ERROR:',
                    'No variable gene was found.',
                    'Query sequence does not appear to contain a rearranged antibody.')
        self.unassigned = vdjs
        return

    jquery_seqs = []
    for seq, vbr in zip(seqs, vblast_records):
        try:
            germ = self.process_blast_record(vbr, self.species)
            vdj = VDJ(seq, v=germ)
            self.orient_query(vdj, vbr)
            jquery = self.get_jquery_sequence(vdj.oriented, vbr)
            # only try to find J-genes if there's a minimum of 10 nucleotides
            # remaining after removal of the V-gene alignment
            if len(jquery) >= 10:
                vdjs.append(vdj)
                jquery_seqs.append(jquery)
            # abort VDJ assignment if the J-gene query sequence is too short
            else:
                # A fresh VDJ (without the V assignment) carries the log.
                vdj = VDJ(seq)
                vdj.log('J-GENE QUERY ERROR:', 'Query sequence for J-gene assignment is too short.')
                vdj.log('')
                query = vbr.alignments[0].hsps[0].query
                subject = vbr.alignments[0].hsps[0].sbjct
                vdj.log(' QUERY :', query)
                vdj.log(' ', ''.join(['|' if q == s else ' ' for q, s in zip(query, subject)]))
                vdj.log('SUBJECT:', subject)
                vdj.log('')
                vdj.log('J-QUERY SEQUENCE:', jquery.sequence)
                self.unassigned.append(vdj)
        except Exception:
            # One bad sequence must not stop the batch, but interrupts
            # (KeyboardInterrupt/SystemExit) still propagate.
            vdj = VDJ(seq)
            vdj.exception('V-GENE ASSIGNMENT ERROR', traceback.format_exc())
            self.unassigned.append(vdj)

    # assign J-genes
    _vdjs = []
    dquery_seqs = []
    jblast_infile = self.build_jblast_input(jquery_seqs)
    try:
        jblast_records = self.blast(jblast_infile, self.species, 'J')
        for vdj, jquery, jbr in zip(vdjs, jquery_seqs, jblast_records):
            try:
                germ = self.process_blast_record(jbr, self.species)
                vdj.j = germ
                # sanity check to make sure there's not an obvious problem with the V/J
                # assignments (likely due to poor germline matches to a non-antibody sequence)
                if vdj.v.chain != vdj.j.chain:
                    vdj.log('GERMLINE ASSIGNMENT ERROR:',
                            'V-gene ({}) and J-gene ({}) chains do not match'.format(vdj.v.chain, vdj.j.chain))
                    self.unassigned.append(vdj)
                    continue
                dquery = self.get_dquery_sequence(jquery, jbr)
                dquery_seqs.append(dquery)
                _vdjs.append(vdj)
            except Exception:
                # This stage assigns J-genes, so label the failure as such
                # (it was previously logged under the V-gene label).
                vdj.exception('J-GENE ASSIGNMENT ERROR', traceback.format_exc())
                self.unassigned.append(vdj)
    finally:
        # Remove the temporary J-BLAST input even if BLAST itself failed.
        os.unlink(jblast_infile)
    vdjs = _vdjs

    # assign D-genes
    _vdjs = []
    for vdj, dquery in zip(vdjs, dquery_seqs):
        # D-genes are only sought for heavy chains with a non-empty D query;
        # everything else passes straight through as assigned.
        if all([vdj.v.chain == 'heavy', dquery]):
            try:
                germ = self.assign_dgene(dquery, self.species)
                vdj.d = germ
            except Exception:
                vdj.exception('D-GENE ASSIGNMENT ERROR:', traceback.format_exc())
                self.unassigned.append(vdj)
                continue
        _vdjs.append(vdj)
    self.assigned = _vdjs