예제 #1
0
 def __init__(self, sequence, reference=HXB2):
     """Create an Env record from a query sequence and a reference.

     Args:
         sequence: Input handed to ``Sequence`` to build the query.
         reference: Input handed to ``Sequence`` to build the reference.
             Defaults to the module-level ``HXB2``.
     """
     super(Env, self).__init__()
     self._sequence = Sequence(sequence)
     self._reference = Sequence(reference)
     # The remaining private attributes all start out unset.
     self._aligned = self._aligned_reference = None
     self._id = self._reference_name = self._clade = None
예제 #2
0
 def heavy(self):
     """Return the heavy chain, resolving it from ``self._heavies`` if needed.

     A non-None value already stored on ``self._heavy`` is returned as-is.
     Otherwise the chain is chosen with ``self._select_heavy`` when that
     callable is set, or taken as the first entry of ``self._heavies``,
     wrapped in ``Sequence`` and stored on ``self._heavy``.

     Returns:
         The stored heavy chain, or None when ``self._heavies`` is empty.
     """
     if self._heavy is not None:
         return self._heavy
     if len(self._heavies) == 0:
         self._heavy = None
         return self._heavy
     chooser = self._select_heavy
     if chooser is None:
         picked = self._heavies[0]
     else:
         picked = chooser(self._heavies)
     self._heavy = Sequence(picked)
     return self._heavy
예제 #3
0
파일: blastn.py 프로젝트: menis/abstar
 def get_jquery_sequence(seq, vbr):
     """Build the J-gene query from the part of ``seq`` past the V alignment.

     Args:
         seq: Sliceable sequence object exposing an ``id`` attribute.
         vbr: V-gene BLAST record; only the first HSP of the first
             alignment is consulted.

     Returns:
         A ``Sequence`` holding the remaining region, with the same id
         as ``seq``.
     """
     top_hsp = vbr.alignments[0].hsps[0]
     # A subject start beyond the subject end means the raw input was
     # reverse-complemented.
     raw_was_reversed = top_hsp.sbjct_start > top_hsp.sbjct_end
     if raw_was_reversed:
         # BLASTn aligned the raw input, which has since been reverse
         # complemented. The region 5' of the alignment in the raw input is
         # therefore the 3' end of the correctly oriented sequence.
         remainder = seq[-top_hsp.query_start:]
     else:
         remainder = seq[top_hsp.query_end:]
     return Sequence(remainder, id=seq.id)
예제 #4
0
 def light(self):
     """Return the light chain, resolving it from ``self._lights`` if needed.

     A non-None value already stored on ``self._light`` is returned as-is.
     Otherwise the chain is chosen with ``self._select_light`` when that
     callable is set, or taken as the first entry of ``self._lights``,
     wrapped in ``Sequence`` and stored on ``self._light``.

     Returns:
         The stored light chain, or None when ``self._lights`` is empty.
     """
     if self._light is not None:
         return self._light
     if len(self._lights) == 0:
         self._light = None
         return self._light
     chooser = self._select_light
     if chooser is None:
         picked = self._lights[0]
     else:
         picked = chooser(self._lights)
     self._light = Sequence(picked)
     return self._light
예제 #5
0
def main(args):
    """Run AbStar on in-memory sequences or on FASTA/Q input files.

    Args:
        args: Parsed run options. The attributes read here are
            ``sequences``, ``debug``, ``use_test_data``, ``basespace``,
            ``merge``, ``isotype`` and ``species``.

    Returns:
        When ``args.sequences`` is set: a single ``Sequence`` if exactly one
        record was processed, otherwise a list of ``Sequence`` objects.
        Otherwise: the list of output files collected from
        ``concat_outputs`` across all processed input files.
    """
    if args.sequences is not None:
        processed = process_sequences(args.sequences, args)
        if len(processed) == 1:
            return Sequence(dict(processed[0]))
        return [Sequence(dict(p)) for p in processed]

    input_dir, output_dir, temp_dir, log_dir = make_directories(args)
    setup_logging(log_dir, args.debug)
    log_options(input_dir, output_dir, temp_dir, args)
    if args.use_test_data:
        mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        input_files = [os.path.join(mod_dir, 'test_data/test_1k.fasta'), ]
    else:
        if args.basespace:
            # BaseSpace downloads are always merged.
            args.merge = True
            download_files(input_dir)
        if args.merge:
            input_dir = merge_reads(input_dir, args)
        if args.isotype:
            # NOTE(review): a truthy isotype flag is replaced by the species
            # name; presumably downstream code uses it to pick the isotype
            # database -- confirm.
            args.isotype = args.species
        # Empty files are ignored.
        input_files = [f for f in list_files(input_dir, log=True) if os.stat(f).st_size > 0]

    output_files = []
    for f, fmt in zip(input_files, format_check(input_files)):
        # skip the non-FASTA/Q files
        if fmt is None:
            continue
        start_time = time.time()
        print_input_file_info(f, fmt)
        subfiles, seq_count = split_file(f, fmt, temp_dir, args)
        run_info = run_jobs(subfiles, temp_dir, log_dir, fmt, args)
        # Jobs that returned None are dropped; each remaining entry is
        # indexed as (outputs, seq count, annotated log, failed log,
        # unassigned log).
        finished = [r for r in run_info if r is not None]
        temp_output_files = [r[0] for r in finished]
        processed_seq_counts = [r[1] for r in finished]
        annotated_log_files = [r[2] for r in finished]
        failed_log_files = [r[3] for r in finished]
        unassigned_log_files = [r[4] for r in finished]
        vdj_end_time = time.time()
        _output_files = concat_outputs(f, temp_output_files, output_dir, args)
        # The concat_logs return values are not needed here.
        concat_logs(f, unassigned_log_files, log_dir, 'unassigned')
        concat_logs(f, failed_log_files, log_dir, 'failed')
        if args.debug:
            concat_logs(f, annotated_log_files, log_dir, 'annotated')
        # Collect this input file's outputs. The previous code extended the
        # list with itself, so the function always returned an empty list.
        output_files.extend(_output_files)
        if not args.debug:
            # A distinct comprehension variable avoids shadowing the loop's f.
            flat_temp_files = [tmp for subl in temp_output_files for tmp in subl]
            clear_temp_files(subfiles + flat_temp_files + annotated_log_files + failed_log_files + unassigned_log_files)
        print_job_stats(seq_count, processed_seq_counts, start_time, vdj_end_time)
    return output_files
예제 #6
0
파일: blastn.py 프로젝트: menis/abstar
 def orient_query(vdj, vbr):
     """Reverse-complement ``vdj``'s sequence when BLASTn flipped the subject.

     Args:
         vdj: Object with ``sequence`` and ``oriented`` attributes;
             ``oriented`` is overwritten only when a flip is detected.
         vbr: V-gene BLAST record; only the first HSP of the first
             alignment is consulted.
     """
     top_hsp = vbr.alignments[0].hsps[0]
     # BLASTn always reverse complements the Subject sequence, never the
     # query. A subject start beyond the subject end therefore shows that
     # the input sequence is the reverse complement.
     if not top_hsp.sbjct_start > top_hsp.sbjct_end:
         return
     source = vdj.sequence
     vdj.oriented = Sequence(source.reverse_complement, id=vdj.sequence.id)
예제 #7
0
파일: blastn.py 프로젝트: menis/abstar
 def assign_dgene(self, seq, species):
     """Assign a D-gene by local alignment against the ungapped D germlines.

     Germlines are read from ``ungapped/d.fasta`` under
     ``self.germline_directory`` and aligned in both orientations.

     Args:
         seq: Query passed to ``local_alignment``.
         species: Species value forwarded to each ``GermlineSegment``.

     Returns:
         A ``GermlineSegment`` for the best-scoring germline, carrying up
         to five runner-up segments in ``others``, or None when there are
         no alignments.
     """
     d_fasta = os.path.join(self.germline_directory, 'ungapped/d.fasta')
     with open(d_fasta, 'r') as handle:
         forward = [Sequence(rec) for rec in SeqIO.parse(handle, 'fasta')]
     # Each germline is also searched as its reverse complement.
     reverse = [Sequence(g.reverse_complement, id=g.id) for g in forward]
     targets = forward + reverse
     hits = local_alignment(seq, targets=targets,
                            gap_open=-20, gap_extend=-2)
     # Best score first.
     hits.sort(key=lambda hit: hit.score, reverse=True)
     ranked_ids = [hit.target.id for hit in hits]
     ranked_scores = [hit.score for hit in hits]
     if not (ranked_ids and ranked_scores):
         return None
     runners_up = [
         GermlineSegment(gl, species, score=sc)
         for gl, sc in zip(ranked_ids[1:6], ranked_scores[1:6])
     ]
     return GermlineSegment(ranked_ids[0], species, score=ranked_scores[0],
                            others=runners_up, assigner_name=self.name)
예제 #8
0
파일: isotype.py 프로젝트: menis/abstar
def get_isotype(antibody):
    """Determine the isotype of ``antibody`` from the germline isotype database.

    Isotype sequences are read from ``isotypes/isotypes.fasta`` under the
    germline database directory for ``antibody.species``.

    Args:
        antibody: Object exposing ``species`` and an ``exception`` method
            used to record failures.

    Returns:
        An ``Isotype`` built from the antibody and the isotype sequences,
        or None if any step fails. Failures are reported through
        ``antibody.exception`` and are not raised.
    """
    try:
        germ_dir = get_germline_database_directory(antibody.species)
        isotype_file = os.path.join(germ_dir, 'isotypes/isotypes.fasta')
        # The context manager closes the FASTA handle once parsing is done.
        with open(isotype_file, 'r') as isotype_handle:
            isotype_seqs = [Sequence(s) for s in SeqIO.parse(isotype_handle, 'fasta')]
        return Isotype(antibody, isotype_seqs)
    except Exception:
        # Best-effort: record the failure on the antibody. Catching
        # Exception instead of a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        antibody.exception('ISOTYPING ERROR', traceback.format_exc())
        return None
예제 #9
0
 def __init__(self, sequence, v=None, d=None, j=None):
     """Create a VDJ record for one input sequence.

     Args:
         sequence: Input handed to ``Sequence``.
         v: Optional V-gene assignment.
         d: Optional D-gene assignment.
         j: Optional J-gene assignment.
     """
     super(VDJ, self).__init__()
     LoggingMixin.__init__(self)
     self.sequence = Sequence(sequence)
     self.id = self.sequence.id
     # The oriented sequence starts out as the input sequence itself.
     self.oriented = self.sequence
     self.v, self.d, self.j = v, d, j
     self.initialize_log()
예제 #10
0
def test_isotype_bcr():
    """Check that ``get_isotype`` reports IgG1 for the BCR isotype fixture."""
    fasta_path = os.path.abspath('abstar/test_data/test_isotype.fasta')
    with open(fasta_path, 'r') as handle:
        records = [Sequence(rec) for rec in SeqIO.parse(handle, 'fasta')]

    def sequence_for(seq_id):
        # First fixture record with the given id.
        matches = [rec for rec in records if rec.id == seq_id]
        return matches[0].sequence

    vdj_nt = sequence_for('vdj_nt')
    oriented_input = sequence_for('oriented_input')

    antibody = MockAntibody()
    antibody.species = 'human'
    antibody.vdj_nt = vdj_nt
    antibody.oriented_input = oriented_input

    result = get_isotype(antibody)
    assert result.isotype == 'IgG1'
예제 #11
0
 def run_partis(self, sequence_file, file_format, locus):
     """Run ``partis run-viterbi`` on a sequence file and parse its output.

     Args:
         sequence_file: Path to the input sequence file.
         file_format: Format name passed to ``SeqIO.parse``.
         locus: ``'ig'`` selects the partis locus ``'igh'``; any other
             value selects ``'tra'``.

     Returns:
         Whatever ``self.parse_partis_output`` returns for the partis
         output file.
     """
     with open(sequence_file, 'r') as sequence_handle:
         seqs = [Sequence(s) for s in SeqIO.parse(sequence_handle, file_format)]
     # Sequences are keyed by id with ':' replaced by 'c'.
     # NOTE(review): presumably this matches how partis rewrites ids -- confirm.
     seq_dict = {s.id.replace(':', 'c'): s for s in seqs}
     germline_dir = os.path.join(self.germline_directory, 'partis/')
     locus = 'igh' if locus == 'ig' else 'tra'
     # Only the path is needed, so close our handle right away instead of
     # leaking it; delete=False keeps the file for partis to write to.
     partis_out = NamedTemporaryFile(delete=False)
     partis_out.close()
     try:
         partis_cmd = ['partis', 'run-viterbi',
                       '--infname', sequence_file,
                       '--outfname', partis_out.name,
                       '--locus', locus,
                       '--initial-germline-dir', germline_dir]
         p = sp.Popen(partis_cmd, stdout=sp.PIPE, stderr=sp.PIPE)
         # Wait for partis to finish; its stdout/stderr are not used.
         p.communicate()
         return self.parse_partis_output(partis_out.name, seq_dict)
     finally:
         # Remove the temp file even if partis or the parser failed.
         os.unlink(partis_out.name)
예제 #12
0
def get_env_sequences(clade=None):
    """Load Env sequences for one or more clades.

    Sequences are read from ``<CLADE>.fasta`` files in the ``env_sequences``
    directory next to this module, where ``<CLADE>`` is the upper-cased
    clade name.

    Args:
        clade: A clade name, an iterable of clade names, or None to load
            every clade in the module-level ``CLADES``.

    Returns:
        A list of ``Sequence`` objects, each built from
        ``[record id, upper-cased sequence]``.
    """
    if clade is None:
        clade = CLADES
    elif isinstance(clade, str):
        # A single clade name is wrapped so the loop below sees a list;
        # isinstance also accepts str subclasses.
        clade = [clade]

    seq_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'env_sequences')

    envs = []
    for c in clade:
        seq_file = os.path.join(seq_dir, '{}.fasta'.format(c.upper()))
        # The context manager closes each FASTA file after it is parsed.
        with open(seq_file, 'r') as seq_handle:
            envs.extend([
                Sequence([s.id, str(s.seq).upper()])
                for s in SeqIO.parse(seq_handle, 'fasta')
            ])
    return envs
예제 #13
0
 def __init__(self, sequence, species=None):
     """Wrap a parsed record and derive its gapped and ungapped sequences.

     Args:
         sequence: Record exposing ``seq`` and ``description``; the
             description becomes the id of the stored ``Sequence``.
         species: Optional species value, stored unchanged.
     """
     raw = Sequence(str(sequence.seq), id=sequence.description)
     self.raw_sequence = raw
     self._species = species
     self.gapped_nt_sequence = self.raw_sequence.sequence
     # '.' characters are stripped to produce the ungapped form.
     gapped = self.gapped_nt_sequence
     self.ungapped_nt_sequence = gapped.replace('.', '')
예제 #14
0
파일: blastn.py 프로젝트: menis/abstar
 def get_dquery_sequence(seq, jbr):
     """Build the D-gene query from the part of ``seq`` before the J alignment.

     Args:
         seq: Sliceable sequence object exposing an ``id`` attribute.
         jbr: J-gene BLAST record; only the first HSP of the first
             alignment is consulted.

     Returns:
         A ``Sequence`` holding ``seq`` up to the start of the J-gene
         alignment, with the same id as ``seq``.
     """
     top_hsp = jbr.alignments[0].hsps[0]
     # NOTE(review): the -1 suggests query_start is 1-based -- confirm.
     upstream_end = top_hsp.query_start - 1
     upstream = seq[:upstream_end]
     return Sequence(upstream, id=seq.id)
예제 #15
0
def _exit_invalid_sequence_input(args):
    # Report positional arguments that could not be parsed, then exit.
    print('ERROR: invalid format for sequence input:')
    for a in args:
        print(a)
    sys.exit(1)


def run(*args, **kwargs):
    '''
    Runs AbStar.

    Input sequences can be provided in several different formats:

        1) individual sequences as positional arguments: ``run(seq1, seq2, temp=temp, output=output)``
        2) a list of sequences, as an argument: ``run([seq1, seq2], temp=temp, output=output)``
        3) a single FASTA/Q-formatted input file, passed via ``input``
        4) a directory of FASTA/Q-formatted files, passed via ``input``

    When passing sequences (not FASTA/Q files), the sequences can be in any format recognized
    by ``abtools.sequence.Sequence``, including:

        - a raw nucleotide sequence, as a string (a random sequence ID will be assigned)
        - a list/tuple of the format ``[sequence_id, sequence]``
        - a BioPython SeqRecord object
        - an AbTools Sequence object

    Either sequences, ``project_dir``, or all of ``input``, ``output`` and ``temp`` are required.

    If positional sequences are given but cannot be parsed, an error is printed and the
    process exits with status 1.


    Examples:

        If processing a single sequence, you can pass the raw sequence, as a string::

            import abstar

            result = abstar.run('ATGC')

        or a list/tuple of the format ``[sequence_id, sequence]``::

            result = abstar.run(['seq1', 'ATGC'])

        If you pass just the raw sequence, a random sequence ID will be generated with ``uuid.uuid4()``.
        In either case, when given a single sequence, ``abstar.run()`` will return a single AbTools ``Sequence``
        object. If running multiple sequences, you can either pass each sequence as a positional argument::

            result_list = run(['seq1', 'ATGC'], ['seq2', 'CGTA'])

        or you can pass a list of sequences as the first argument, in this case using sequences parsed from a
        FASTA file using Biopython::

            from Bio import SeqIO

            fasta = open('my_sequences.fasta', 'r')
            seqs = [s for s in SeqIO.parse(fasta, 'fasta')]
            result_list = abstar.run(seqs)

        When given multiple sequences, ``abstar.run()`` will return a list of AbTools ``Sequence`` objects,
        one per input sequence.

        If you'd prefer not to parse the FASTQ/A file into a list (for example, if the input file is
        extremely large), you can pass the input file path directly, along with a temp directory and output
        directory::

            result_files = abstar.run(input='/path/to/my_sequences.fasta',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        Given a file path, ``abstar.run()`` returns a list of output file paths. In the above case,
        ``result_files`` will be a list containing a single output file path:
        ``/path/to/output/my_sequences.json``.

        If you have a directory containing multiple FASTQ/A files, you can pass the directory path
        using ``input``::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        As before, ``result_files`` will contain a list of output file paths.

        If your input directory contains paired FASTQ files (gzip compressed or uncompressed)
        that need to be merged prior to processing with AbStar::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      merge=True)

        The paired read files in ``input`` will be merged with PANDAseq prior to processing with AbStar.
        By default, PANDAseq's 'simple bayesian' read merging algorithm is used, although alternate
        algorithms can be selected with ``pandaseq_algo``.

        AbStar also provides an alternate CSV-formatted output type that mimics the `IMGT Summary file`_.
        This option is provided to minimize the effort needed to convert existing
        IMGT-based pipelines to AbStar. Alternate output is only available when passing an input file or
        directory; passing individual sequences or a list of sequences will always return Sequence objects.
        To produce IMGT-formatted output::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      output_type='imgt')

        .. _IMGT Summary file: http://www.imgt.org/IMGT_vquest/share/textes/imgtvquest.html#Esummary


    Args:

        project_dir (str): Path to the project directory. Most useful when directly downloading
            files from BaseSpace, and all subdirectories will be created by AbStar.

        input (str): Path to input directory, containing FASTA/Q files. If performing
            read merging with PANDAseq, paired FASTQ files may be gzip compressed.

        output (str): Path to output directory.

        temp (str): Path to temp directory, where intermediate job files will be stored.

        log (str): Path to log file. If not provided and ``project_dir`` is provided,
            the log will be written to ``/path/to/project_dir/abstar.log``. If output is
            provided, log will be written to ``/path/to/output/abstar.log``.

        species (str): Species of the antibody sequences. Choices are 'human', 'macaque',
            'mouse' and 'rabbit'. Default is 'human'.

        isotype (bool): If True, the isotype will be inferred by aligning the sequence region
            downstream of the J-gene. If False, the isotype will not be determined.
            Default is True.

        uid (int): Length (in nucleotides) of the Unique Molecular ID used to barcode input RNA.
            A positive integer results in the UMID being parsed from the start of the read (or merged
            read), a negative integer results in parsing from the end of the read. Default is 0,
            which results in no UMID parsing.

        gzip (bool): If True, compresses output files with gzip. Default is False.

        pretty (bool): If True, formats JSON output files to be more human-readable. If False,
            JSON output files contain one record per line. Default is False.

        output_type (str): Options are 'json' or 'imgt'. IMGT output mimics the Summary
            table produced by IMGT High-V/Quest, to maintain a level of compatibility with
            existing IMGT-based pipelines. JSON output is much more detailed. Default is 'json'.

        merge (bool): If True, input must be paired-read FASTA files (gzip compressed or uncompressed)
            which will be merged with PANDAseq prior to processing with AbStar. If ``basespace`` is True,
            ``merge`` is automatically set to True. Default is False.

        pandaseq_algo (str): Define merging algorithm to be used by PANDAseq. Options are
            'simple_bayesian', 'ea_util', 'flash', 'pear', 'rdp_mle', 'stitch', or 'uparse'. Default is
            'simple_bayesian', which is the default PANDAseq algorithm.

        debug (bool): If ``True``, ``abstar.run()`` runs in single-threaded mode, the log is much more verbose,
            and temporary files are not removed. Default is ``False``.


    Returns:

        If the input is a single sequence, ``run`` returns a single AbTools ``Sequence`` object.

        If the input is a list of sequences, ``run`` returns a list of AbTools ``Sequence`` objects.

        If the input is a file or a directory of files, ``run`` returns a list of output files.
    '''

    warnings.filterwarnings("ignore")
    # With no positional arguments (file/directory input) there are no
    # in-memory sequences. Previously the name was left unbound in that
    # case, so the documented ``run(input=..., ...)`` usage raised
    # UnboundLocalError.
    sequences = None
    if len(args) == 1:
        # if there's a single arg, need to check if it's a single sequence...
        try:
            sequences = [
                Sequence(args[0]),
            ]
        except Exception:
            # ...or a list of sequences
            try:
                sequences = [Sequence(s) for s in args[0]]
            except Exception:
                _exit_invalid_sequence_input(args)
    # if multiple args, assume each is a sequence
    elif len(args) > 1:
        try:
            sequences = [Sequence(s) for s in args]
        except Exception:
            _exit_invalid_sequence_input(args)
    kwargs['sequences'] = sequences
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('abstar')
    # main() already returns Sequence objects (or output file paths), so
    # its result is passed through unchanged.
    return main(args)
예제 #16
0
def test_api_integration_bcr_hiv_bnab_lcs():
    """Run the API on the HIV bnAb light-chain fixture and check the count."""
    fasta_path = os.path.abspath('abstar/test_data/test_hiv_bnab_lcs.fasta')
    with open(fasta_path, 'r') as handle:
        inputs = [Sequence(rec) for rec in SeqIO.parse(handle, 'fasta')]
    # Each fixture sequence is passed as its own positional argument.
    results = abstar.run(*inputs)
    assert len(results) == 207
예제 #17
0
파일: blastn.py 프로젝트: menis/abstar
    def __call__(self, sequence_file, file_format):
        """Assign V-, J- and D-genes to every sequence in ``sequence_file``.

        Successfully processed records are stored on ``self.assigned``;
        records that fail at any stage are collected on ``self.unassigned``.
        If BLASTn returns no V-gene records at all, every sequence is marked
        unassigned and the method returns early without setting
        ``self.assigned``.

        Args:
            sequence_file: Path to the input file. A FASTQ input is
                overwritten in place with its FASTA equivalent.
            file_format: Format name passed to ``SeqIO.parse``; ``'fastq'``
                triggers the FASTA conversion.
        """
        with open(sequence_file, 'r') as sequence_handle:
            seqs = [Sequence(s) for s in SeqIO.parse(sequence_handle, file_format)]
        vdjs = []

        # if the input file is FASTQ-formatted, need to convert it to FASTA for BLASTn to work
        if file_format == 'fastq':
            with open(sequence_file, 'w') as handle:
                handle.write('\n'.join(s.fasta for s in seqs))

        # assign V-genes
        vblast_records = self.blast(sequence_file, self.species, 'V')
        # if there aren't any vblast_records, that means that none of the
        # sequences in the input file contained sequences with a significant
        # match to any germline V-gene. These are likely all non-antibody sequences.
        if not vblast_records:
            vdjs = [VDJ(seq) for seq in seqs]
            for vdj in vdjs:
                vdj.log('V-GENE ASSIGNMENT ERROR:',
                        'No variable gene was found.',
                        'Query sequence does not appear to contain a rearranged antibody.')
            self.unassigned = vdjs
            return
        jquery_seqs = []
        for seq, vbr in zip(seqs, vblast_records):
            try:
                germ = self.process_blast_record(vbr, self.species)
                vdj = VDJ(seq, v=germ)
                self.orient_query(vdj, vbr)
                jquery = self.get_jquery_sequence(vdj.oriented, vbr)
                # only try to find J-genes if there's a minimum of 10 nucleotides
                # remaining after removal of the V-gene alignment
                if len(jquery) >= 10:
                    vdjs.append(vdj)
                    jquery_seqs.append(jquery)
                # abort VDJ assignment if the J-gene query sequence is too short
                else:
                    # A fresh VDJ (without the V assignment) carries the
                    # diagnostic log for this sequence.
                    vdj = VDJ(seq)
                    vdj.log('J-GENE QUERY ERROR:', 'Query sequence for J-gene assignment is too short.')
                    vdj.log('')
                    query = vbr.alignments[0].hsps[0].query
                    subject = vbr.alignments[0].hsps[0].sbjct
                    vdj.log(' QUERY :', query)
                    vdj.log('        ', ''.join(['|' if q == s else ' ' for q, s in zip(query, subject)]))
                    vdj.log('SUBJECT:', subject)
                    vdj.log('')
                    vdj.log('J-QUERY SEQUENCE:', jquery.sequence)
                    self.unassigned.append(vdj)
            except Exception:
                # Per-sequence failures are recorded, not raised, so one bad
                # record does not abort the whole file.
                vdj = VDJ(seq)
                vdj.exception('V-GENE ASSIGNMENT ERROR', traceback.format_exc())
                self.unassigned.append(vdj)

        # assign J-genes
        _vdjs = []
        dquery_seqs = []
        jblast_infile = self.build_jblast_input(jquery_seqs)
        jblast_records = self.blast(jblast_infile, self.species, 'J')
        for vdj, jquery, jbr in zip(vdjs, jquery_seqs, jblast_records):
            try:
                germ = self.process_blast_record(jbr, self.species)
                vdj.j = germ
                # sanity check to make sure there's not an obvious problem with the V/J
                # assignments (likely due to poor germline matches to a non-antibody sequence)
                if vdj.v.chain != vdj.j.chain:
                    vdj.log('GERMLINE ASSIGNMENT ERROR:',
                            'V-gene ({}) and J-gene ({}) chains do not match'.format(vdj.v.chain, vdj.j.chain))
                    self.unassigned.append(vdj)
                    continue
                dquery = self.get_dquery_sequence(jquery, jbr)
                dquery_seqs.append(dquery)
                _vdjs.append(vdj)
            except Exception:
                # This failure happens during J-gene assignment; it was
                # previously mislabelled as a V-gene error.
                vdj.exception('J-GENE ASSIGNMENT ERROR', traceback.format_exc())
                self.unassigned.append(vdj)
        os.unlink(jblast_infile)
        vdjs = _vdjs

        # assign D-genes
        _vdjs = []
        for vdj, dquery in zip(vdjs, dquery_seqs):
            # D-genes are only assigned to heavy chains with a non-empty D query.
            if all([vdj.v.chain == 'heavy', dquery]):
                try:
                    germ = self.assign_dgene(dquery, self.species)
                    vdj.d = germ
                except Exception:
                    vdj.exception('D-GENE ASSIGNMENT ERROR:', traceback.format_exc())
                    self.unassigned.append(vdj)
                    continue
            _vdjs.append(vdj)
        self.assigned = _vdjs