Ejemplo n.º 1
0
    def quantify_with_kallisto(self, kallisto, cell, output_dir, cell_name, kallisto_base_transcriptome, fastq1, fastq2,
                               ncores, should_resume, single_end, fragment_length, fragment_sd):
        """Quantify expression with Kallisto, including this cell's TCR sequences.

        Writes ``cell.get_fasta_string()`` to
        ``<output_dir>/unfiltered_TCR_seqs/<cell_name>_TCRseqs.fa``, concatenates the
        base transcriptome with that FASTA, builds a Kallisto index from the combined
        file and runs ``kallisto quant`` with output in
        ``<output_dir>/expression_quantification``. The temporary ``kallisto_index``
        directory (index plus combined transcriptome) is deleted afterwards.

        Args:
            kallisto: Path to the kallisto executable.
            cell: Object providing ``get_fasta_string()``.
            output_dir: Per-cell output directory.
            cell_name: Cell name used in file names.
            kallisto_base_transcriptome: Path to the base transcriptome FASTA.
            fastq1: First FASTQ file.
            fastq2: Second FASTQ file (only passed to kallisto for paired-end data).
            ncores: Thread count passed to ``-t``; int or str.
            should_resume: If true and ``abundance.tsv`` already exists, do nothing.
            single_end: True for single-end reads.
            fragment_length: Value for ``-l``; falsy to omit it for paired-end data.
            fragment_sd: Value for ``-s`` (used for single-end data only).

        Raises:
            ValueError: If ``single_end`` is set without both ``fragment_length``
                and ``fragment_sd``.
            subprocess.CalledProcessError: If a kallisto command exits non-zero.
        """
        print("##Running Kallisto##")
        quant_dir = "{}/expression_quantification".format(output_dir)
        if should_resume and os.path.isfile("{}/abundance.tsv".format(quant_dir)):
            print("Resuming with existing Kallisto output")
            return

        # Fail before the (expensive) index build rather than with an opaque
        # TypeError from subprocess when a falsy value reaches the argument list.
        if single_end and not (fragment_length and fragment_sd):
            raise ValueError("fragment_length and fragment_sd are required to quantify single-end data")

        print("##Making Kallisto indices##")
        index_dir = "{}/kallisto_index".format(quant_dir)
        io.makeOutputDir(index_dir)

        fasta_filename = "{output_dir}/unfiltered_TCR_seqs/{cell_name}_TCRseqs.fa".format(output_dir=output_dir,
                                                                                          cell_name=cell_name)
        with open(fasta_filename, 'w') as fasta_file:
            fasta_file.write(cell.get_fasta_string())

        output_transcriptome = "{}/{}_transcriptome.fa".format(index_dir, cell_name)
        with open(output_transcriptome, 'w') as outfile:
            for fname in [kallisto_base_transcriptome, fasta_filename]:
                last_line = "\n"
                with open(fname) as infile:
                    for last_line in infile:
                        outfile.write(last_line)
                # Make sure the next file's first header starts on its own line.
                if not last_line.endswith("\n"):
                    outfile.write("\n")

        idx_file = "{}/{}_transcriptome.idx".format(index_dir, cell_name)
        subprocess.check_call([kallisto, 'index', '-i', idx_file, output_transcriptome])

        print("##Quantifying with Kallisto##")
        # subprocess only accepts strings, so coerce numeric options explicitly.
        kallisto_command = [kallisto, 'quant', '-i', idx_file, '-t', str(ncores)]
        if single_end:
            kallisto_command += ['--single', '-l', str(fragment_length), '-s', str(fragment_sd),
                                 '-o', quant_dir, fastq1]
        else:
            if fragment_length:
                kallisto_command += ['-l', str(fragment_length)]
            kallisto_command += ['-o', quant_dir, fastq1, fastq2]
        subprocess.check_call(kallisto_command)

        # The index and the combined transcriptome are huge and not needed again.
        shutil.rmtree(index_dir)
Ejemplo n.º 2
0
    def summarise(self, **kwargs):
        """Summarise a directory of cells with reconstructed TCR sequences.

        With no keyword arguments the options are parsed from ``sys.argv[2:]``.
        Otherwise they are read from ``kwargs`` using the keys ``config_file``,
        ``use_unfiltered``, ``keep_inkt``, ``graph_format``, ``root_dir`` and
        ``no_networks``.

        For every subdirectory ``<d>`` of the root directory that contains
        ``<d>/<filtered|unfiltered>_TCR_seqs/<d>.pkl`` the pickled cell is loaded.
        The following are then written to ``<root>/<filtered|unfiltered>_TCR_summary``:
        ``TCR_summary.txt``, reconstructed-length plots and lists per locus,
        clonotype networks (via ``tracer_func.draw_network_from_cells``),
        ``clonotype_sizes.pdf``/``.txt`` and ``recombinants.txt``.

        Exits with status 1 if graphs are requested and the configured ``dot`` or
        ``neato`` tools are not executable.

        NOTE(review): the cell pickles are loaded with ``pickle.load``; only run
        this on directories produced by a trusted source.
        """
        if not kwargs:
            parser = argparse.ArgumentParser(description="Summarise set of cells with reconstructed TCR sequences")
            parser.add_argument('--config_file', '-c', metavar="<CONFIG_FILE>", help='config file to use [tracer.conf]',
                                default='tracer.conf')
            parser.add_argument('--use_unfiltered', '-u', help='use unfiltered recombinants', action="store_true")
            # The flag KEEPS iNKT cells (they are dropped below unless it is set),
            # so the help text must not say "ignore".
            parser.add_argument('--keep_inkt', '-i',
                                help='keep iNKT cells when constructing networks (they are excluded by default)',
                                action="store_true")
            parser.add_argument('--graph_format', '-f', metavar="<GRAPH_FORMAT>", help='graphviz output format [pdf]',
                                default='pdf')
            parser.add_argument('--no_networks', help='skip attempts to draw network graphs', action="store_true")
            parser.add_argument('dir', metavar="<DIR>",
                                help='directory containing subdirectories for each cell to be summarised')
            args = parser.parse_args(sys.argv[2:])

            root_dir = os.path.abspath(args.dir)
            graph_format = args.graph_format
            config_file = args.config_file
            keep_inkt = args.keep_inkt
            use_unfiltered = args.use_unfiltered
            draw_graphs = not args.no_networks
        else:
            config_file = kwargs.get('config_file')
            use_unfiltered = kwargs.get('use_unfiltered')
            keep_inkt = kwargs.get('keep_inkt')
            graph_format = kwargs.get('graph_format')
            root_dir = os.path.abspath(kwargs.get('root_dir'))
            draw_graphs = not kwargs.get('no_networks')

        # Read config file
        tracer_func.check_config_file(config_file)
        config = ConfigParser()
        config.read(config_file)

        if draw_graphs:
            dot = self.resolve_relative_path(config.get('tool_locations', 'dot_path'))
            neato = self.resolve_relative_path(config.get('tool_locations', 'neato_path'))

            # check that executables from config file can be used
            not_executable = [(name, path) for name, path in six.iteritems({"dot": dot, "neato": neato})
                              if not io.is_exe(path)]
            if not_executable:
                print()
                print("Could not execute the following required tools. Check your configuration file.")
                for name, path in not_executable:
                    print(name, path)
                print()
                sys.exit(1)
        else:
            dot = ""
            neato = ""

        if use_unfiltered:
            pkl_dir = "unfiltered_TCR_seqs"
            outdir = "{}/unfiltered_TCR_summary".format(root_dir)
        else:
            pkl_dir = "filtered_TCR_seqs"
            outdir = "{}/filtered_TCR_summary".format(root_dir)
        io.makeOutputDir(outdir)
        length_filename_root = "{}/reconstructed_lengths_TCR".format(outdir)

        # Load one pickled cell per immediate subdirectory of root_dir.
        cells = {}
        empty_cells = []
        NKT_cells = {}
        for d in next(os.walk(root_dir))[1]:
            cell_pkl = "{root_dir}/{d}/{pkl_dir}/{d}.pkl".format(pkl_dir=pkl_dir, d=d, root_dir=root_dir)
            if not os.path.isfile(cell_pkl):
                continue
            with open(cell_pkl, 'rb') as pkl:
                cl = pickle.load(pkl)
            cells[d] = cl
            if cl.is_empty:
                empty_cells.append(d)
            if cl.is_inkt:
                NKT_cells[d] = (cl.is_inkt, cl.getMainRecombinantIdentifiersForLocus('B'))

        # Count cells with at least one productive recombinant per locus.
        count_of_cells_with_alpha_recovered = 0
        count_of_cells_with_beta_recovered = 0
        count_of_cells_with_paired_recovered = 0
        for cell in cells.values():
            has_prod_a = cell.count_productive_recombinants('A') > 0
            has_prod_b = cell.count_productive_recombinants('B') > 0
            if has_prod_a:
                count_of_cells_with_alpha_recovered += 1
            if has_prod_b:
                count_of_cells_with_beta_recovered += 1
            if has_prod_a and has_prod_b:
                count_of_cells_with_paired_recovered += 1

        total_cells = len(cells)

        def percent_of_total(count):
            # With no cells there is nothing to divide by; report N/A (the same
            # marker the recombinant table uses) instead of raising ZeroDivisionError.
            if total_cells == 0:
                return "N/A"
            return round((count / float(total_cells)) * 100, 1)

        with open("{}/TCR_summary.txt".format(outdir), 'w') as outfile:
            outfile.write(
                "TCRA reconstruction:\t{count_of_cells_with_alpha_recovered} / {total_cells} ({alpha_percent}%)\n"
                "TCRB reconstruction:\t{count_of_cells_with_beta_recovered} / {total_cells} ({beta_percent}%)\n"
                "Paired productive chains\t{count_of_cells_with_paired_recovered} / {total_cells} "
                "({paired_percent}%)\n\n".format(
                    paired_percent=percent_of_total(count_of_cells_with_paired_recovered),
                    total_cells=total_cells,
                    alpha_percent=percent_of_total(count_of_cells_with_alpha_recovered),
                    beta_percent=percent_of_total(count_of_cells_with_beta_recovered),
                    count_of_cells_with_beta_recovered=count_of_cells_with_beta_recovered,
                    count_of_cells_with_paired_recovered=count_of_cells_with_paired_recovered,
                    count_of_cells_with_alpha_recovered=count_of_cells_with_alpha_recovered))

            # Each counter maps "number of recombinants" -> "number of cells with that many".
            counters = {'all_alpha': Counter(), 'all_beta': Counter(), 'prod_alpha': Counter(), 'prod_beta': Counter()}
            for cell in cells.values():
                counters['all_alpha'].update({cell.count_total_recombinants('A'): 1})
                counters['all_beta'].update({cell.count_total_recombinants('B'): 1})
                counters['prod_alpha'].update({cell.count_productive_recombinants('A'): 1})
                counters['prod_beta'].update({cell.count_productive_recombinants('B'): 1})

            observed_counts = list(counters['all_alpha'].keys()) + list(counters['all_beta'].keys())
            # max() of an empty list raises, so fall back to 0 when no cells were loaded.
            max_recombinant_count = max(observed_counts) if observed_counts else 0
            table_header = ['', '0 recombinants', '1 recombinant', '2 recombinants']
            recomb_range = range(0, 3)
            if max_recombinant_count > 2:
                table_header += [str(x) + " recombinants" for x in range(3, max_recombinant_count + 1)]
                recomb_range = range(0, max_recombinant_count + 1)

            table = PrettyTable(table_header)
            table.padding_width = 1
            table.align = "l"
            for label in ['all_alpha', 'all_beta', 'prod_alpha', 'prod_beta']:
                counter = counters[label]
                count_array = [counter[x] for x in recomb_range]
                # Percentages are relative to cells with at least one recombinant,
                # so the zero-recombinant column gets no percentage.
                total_with_at_least_one = sum(count_array[1:])
                if total_with_at_least_one > 0:
                    percentages = [''] + [" (" + str(round((float(x) / total_with_at_least_one) * 100)) + "%)"
                                          for x in count_array[1:]]
                else:
                    percentages = [''] + [" (N/A%)" for _ in count_array[1:]]
                row = [str(count) + pct for count, pct in zip(count_array, percentages)]
                table.add_row([label] + row)
            outfile.write(table.get_string())

            # If using unfiltered, name cells with more than two recombinants#
            if use_unfiltered:
                outfile.write("\n\n#Cells with more than two recombinants for a locus#\n")
                found_multi = False
                for cell in cells.values():
                    total_a = cell.count_total_recombinants('A')
                    total_b = cell.count_total_recombinants('B')
                    if total_a > 2 or total_b > 2:
                        outfile.write("###{}###\n".format(cell.name))
                        outfile.write("TCRA:\t{}\nTCRB:\t{}\n\n".format(total_a, total_b))
                        found_multi = True
                if not found_multi:
                    outfile.write("None\n\n")

            # Reporting iNKT cells
            iNKT_count = len(NKT_cells)
            cell_word = 'cell' if iNKT_count == 1 else 'cells'
            outfile.write("\n\n#iNKT cells#\nFound {iNKT_count} iNKT {cell_word}\n".format(iNKT_count=iNKT_count,
                                                                                           cell_word=cell_word))
            for cell_name, ids in six.iteritems(NKT_cells):
                outfile.write("###{cell_name}###\n".format(cell_name=cell_name))
                outfile.write("TCRA:\t{}\nTCRB\t{}\n\n".format(ids[0], ids[1]))

            # Collect lengths of reconstructed sequences per locus.
            lengths = {'A': [], 'B': []}
            for cell in cells.values():
                for locus in lengths:
                    lengths[locus].extend(cell.get_trinity_lengths(locus))

            # Plot and list the length distribution for each locus; the dashed
            # vertical lines are drawn at the two reference positions per locus.
            for locus, xlabel, guide_lines in (('A', "TCRa reconstructed length (bp)", (334, 344)),
                                               ('B', "TCRb reconstructed length (bp)", (339, 345))):
                locus_lengths = lengths[locus]
                if len(locus_lengths) > 1:
                    plt.figure()
                    for position in guide_lines:
                        plt.axvline(position, linestyle="--", color='k')
                    sns.distplot(locus_lengths)
                    sns.despine()
                    plt.xlabel(xlabel)
                    plt.ylabel("Density")
                    plt.savefig("{}{}.pdf".format(length_filename_root, locus))
                if len(locus_lengths) > 0:
                    with open("{}{}.txt".format(length_filename_root, locus), 'w') as f:
                        for length in sorted(locus_lengths):
                            f.write("{}\n".format(length))

            # Empty cells (and iNKT cells unless kept) are excluded from the
            # networks. pop() is used because a cell may be in both groups.
            for cell_name in empty_cells:
                cells.pop(cell_name, None)
            if not keep_inkt:
                for cell_name in NKT_cells:
                    cells.pop(cell_name, None)

            # make clonotype networks
            component_groups = tracer_func.draw_network_from_cells(cells, outdir, graph_format, dot, neato,
                                                                   draw_graphs)

            # Print component groups to the summary#
            outfile.write(
                "\n###Clonotype groups###\nThis is a text representation of the groups shown in clonotype_network_with_identifiers.pdf. It does not exclude cells that only share beta and not alpha.\n\n")
            for g in component_groups:
                outfile.write(", ".join(g))
                outfile.write("\n\n")

        # plot clonotype sizes
        plt.figure()
        clonotype_sizes = tracer_func.get_component_groups_sizes(cells)
        w = 0.85
        x_range = range(1, len(clonotype_sizes) + 1)
        plt.bar(x_range, height=clonotype_sizes, width=w, color='black', align='center')
        plt.gca().set_xticks(x_range)
        plt.xlabel("Clonotype size")
        plt.ylabel("Clonotype count")
        plt.savefig("{}/clonotype_sizes.pdf".format(outdir))

        # write clonotype sizes to text file
        with open("{}/clonotype_sizes.txt".format(outdir), 'w') as f:
            f.write("clonotype_size\tclonotype_count\n")
            for size, count in zip(x_range, clonotype_sizes):
                f.write("{}\t{}\n".format(size, count))

        # Write out recombinant details for each cell
        with open("{}/recombinants.txt".format(outdir), 'w') as f:
            f.write("cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\n")
            for cell_name in sorted(cells):
                cell = cells[cell_name]
                for locus in "AB":
                    recombinants = cell.all_recombinants[locus]
                    if recombinants is not None:
                        for r in recombinants:
                            f.write(
                                "{name}\t{locus}\t{ident}\t{productive}\t{length}\n".format(
                                    name=cell_name, locus=locus, ident=r.identifier,
                                    productive=r.productive, length=len(r.trinity_seq)))
                f.write("\n")
            f.write("\n\n")
            for cell_name in empty_cells:
                f.write("{}\tNo TCRs found\n".format(cell_name))
Ejemplo n.º 3
0
    def assemble(self, **kwargs):
        """Reconstruct TCR sequences from RNA-seq reads for a single cell.

        With no keyword arguments the options are parsed from ``sys.argv[2:]``.
        Otherwise they are read from ``kwargs`` using the keys ``cell_name``,
        ``fastq1``, ``fastq2``, ``ncores``, ``config_file``, ``species``,
        ``seq_method``, ``resume_with_existing_files``, ``output_dir``,
        ``single_end``, ``fragment_length`` and ``fragment_sd``.

        Steps, in order: validate inputs, read the config file, check the
        configured tools are executable, create ``<output_dir>/<cell_name>`` and
        its data subdirectories, then run bowtie2 alignment, Trinity assembly,
        IgBLAST, IgBLAST parsing and Kallisto quantification. TPM values from
        Kallisto are attached to each recombinant. The cell is written as a text
        summary and a pickle under ``unfiltered_TCR_seqs``, then (after
        ``cell.filter_recombinants()``) as FASTA, summary and pickle under
        ``filtered_TCR_seqs``.

        Raises:
            AssertionError: If paired-end data has no second FASTQ, or single-end
                data lacks ``fragment_length``/``fragment_sd``.
            OSError: If a FASTQ file does not exist.

        Exits with status 1 if any configured tool is not executable.
        """
        import errno  # local import: the file's top-level import block is outside this method

        if not kwargs:
            parser = argparse.ArgumentParser(
                description="Reconstruct TCR sequences from RNAseq reads for a single cell")
            parser.add_argument('--ncores', '-p', metavar="<CORES>", help='number of processor cores to use', type=int,
                                default=1)
            parser.add_argument('--config_file', '-c', metavar="<CONFIG_FILE>", help='config file to use [tracer.conf]',
                                default='tracer.conf')
            parser.add_argument('--resume_with_existing_files', '-r',
                                help='look for existing intermediate files and use those instead of starting from scratch',
                                action="store_true")
            parser.add_argument('--species', '-s',
                                help='species from which T cells were isolated - important to determination of iNKT cells',
                                choices=['Mmus', 'Hsap'], default='Mmus')
            parser.add_argument('--seq_method', '-m',
                                help='Method for constructing sequence to assess productivity, \
                                quantify expression and for output reporting. See README for details.',
                                choices=['imgt', 'assembly'], default='imgt')
            parser.add_argument('--single_end', help='set this if your sequencing data are single-end reads',
                                action="store_true")
            parser.add_argument('--fragment_length',
                                help='Estimated average fragment length in the sequencing library.'
                                     ' Used for Kallisto quantification. REQUIRED for single-end data.',
                                default=False)
            parser.add_argument('--fragment_sd',
                                help='Estimated standard deviation of average fragment length in the sequencing library.'
                                     ' Used for Kallisto quantification. REQUIRED for single-end data.',
                                default=False)
            parser.add_argument('fastq1', metavar="<FASTQ1>", help='first fastq file')
            parser.add_argument('fastq2', metavar="<FASTQ2>", help='second fastq file', nargs='?')
            parser.add_argument('cell_name', metavar="<CELL_NAME>", help='name of cell for file labels')
            parser.add_argument('output_dir', metavar="<OUTPUT_DIR>",
                                help='directory for output as <output_dir>/<cell_name>')

            args = parser.parse_args(sys.argv[2:])

            cell_name = args.cell_name
            fastq1 = args.fastq1
            single_end = args.single_end
            fastq2 = args.fastq2

            ncores = str(args.ncores)
            config_file = args.config_file
            species = args.species
            seq_method = args.seq_method
            resume_with_existing_files = args.resume_with_existing_files
            fragment_length = args.fragment_length
            fragment_sd = args.fragment_sd
            output_dir = args.output_dir

        else:
            cell_name = kwargs.get('cell_name')
            fastq1 = kwargs.get('fastq1')
            fastq2 = kwargs.get('fastq2')
            # Downstream steps put ncores into subprocess argument lists, so it must
            # be a string exactly as on the command-line path (default 1 there too).
            ncores = str(kwargs.get('ncores') or 1)
            config_file = kwargs.get('config_file')
            species = kwargs.get('species')
            seq_method = kwargs.get('seq_method')
            resume_with_existing_files = kwargs.get('resume_with_existing_files')
            output_dir = kwargs.get('output_dir')
            single_end = kwargs.get('single_end')
            fragment_length = kwargs.get('fragment_length')
            fragment_sd = kwargs.get('fragment_sd')

        if not single_end:
            assert fastq2, "Only one fastq file specified. Either set --single_end or provide second fastq."
        else:
            # Warn BEFORE discarding the second file; resetting first made the
            # warning unreachable.
            if fastq2:
                print("Two fastq files given with --single-end option. Ignoring second file.")
            fastq2 = None
            assert fragment_length and fragment_sd, \
                'Must specify estimated average fragment length (--fragment_length)' \
                ' and standard deviation (--fragment_sd) for use with single-end data'

        # Check FASTQ files exist
        if not os.path.isfile(fastq1):
            raise OSError(errno.ENOENT, 'FASTQ file not found', fastq1)
        if fastq2 and not os.path.isfile(fastq2):
            raise OSError(errno.ENOENT, 'FASTQ file not found', fastq2)

        # Read config file
        tracer_func.check_config_file(config_file)
        config = ConfigParser()
        config.read(config_file)

        bowtie2 = self.resolve_relative_path(config.get('tool_locations', 'bowtie2_path'))
        igblast = self.resolve_relative_path(config.get('tool_locations', 'igblast_path'))
        kallisto = self.resolve_relative_path(config.get('tool_locations', 'kallisto_path'))
        trinity = self.resolve_relative_path(config.get('tool_locations', 'trinity_path'))

        # Check that executables from the config file can be used. This runs before
        # Trinity is invoked for version detection so that a bad path produces this
        # message rather than an OSError from subprocess.
        tools = {"bowtie2": bowtie2, "igblast": igblast, "kallisto": kallisto, "trinity": trinity}
        not_executable = [(name, path) for name, path in six.iteritems(tools) if not io.is_exe(path)]
        if not_executable:
            print()
            print("Could not execute the following required tools. Check your configuration file.")
            for name, path in not_executable:
                print(name, path)
            print()
            sys.exit(1)

        if config.has_option('trinity_options', 'trinity_grid_conf'):
            trinity_grid_conf = self.resolve_relative_path(config.get('trinity_options', 'trinity_grid_conf'))
        else:
            trinity_grid_conf = False

        # Trinity version: inspect the output of `--version` whether or not the
        # command exits with a non-zero status, so trinity_version is always set.
        if not config.has_option('trinity_options', 'trinity_version'):
            try:
                version_output = subprocess.check_output([trinity, '--version'])
            except subprocess.CalledProcessError as err:
                version_output = err.output
            if re.search('v2', (version_output or b'').decode('utf-8')):
                config.set('trinity_options', 'trinity_version', '2')
            else:
                config.set('trinity_options', 'trinity_version', '1')

        synthetic_genome_path = self.resolve_relative_path(config.get('bowtie2_options', 'synthetic_genome_index_path'))
        igblast_index_location = self.resolve_relative_path(config.get('IgBlast_options', 'igblast_index_location'))
        igblast_seqtype = config.get('IgBlast_options', 'igblast_seqtype')
        imgt_seq_location = self.resolve_relative_path(config.get('IgBlast_options', 'imgt_seq_location'))

        kallisto_base_transcriptome = self.resolve_relative_path(config.get('kallisto_options', 'base_transcriptome'))

        # set-up output directories
        root_output_dir = os.path.abspath(output_dir)
        io.makeOutputDir(root_output_dir)
        output_dir = root_output_dir + "/" + cell_name

        io.makeOutputDir(output_dir)

        data_dirs = ['aligned_reads', 'Trinity_output', 'IgBLAST_output', 'unfiltered_TCR_seqs',
                     'expression_quantification', 'filtered_TCR_seqs']
        for d in data_dirs:
            io.makeOutputDir("{}/{}".format(output_dir, d))

        locus_names = ["TCRA", "TCRB"]

        should_resume = resume_with_existing_files

        self.bowtie2_alignment(bowtie2, ncores, locus_names, output_dir, cell_name, synthetic_genome_path, fastq1,
                               fastq2, should_resume, single_end)
        print()
        trinity_JM = config.get('trinity_options', 'max_jellyfish_memory')
        trinity_version = config.get('trinity_options', 'trinity_version')
        self.assemble_with_trinity(trinity, locus_names, output_dir, cell_name, ncores, trinity_grid_conf, trinity_JM,
                                   trinity_version, should_resume, single_end, species)
        print()
        self.run_IgBlast(igblast, locus_names, output_dir, cell_name, igblast_index_location, igblast_seqtype, species,
                         should_resume)
        print()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cell = io.parse_IgBLAST(locus_names, output_dir, cell_name, imgt_seq_location, species, seq_method)
            if cell.is_empty:
                self.die_with_empty_cell(cell_name, output_dir, species)

        self.quantify_with_kallisto(kallisto, cell, output_dir, cell_name, kallisto_base_transcriptome, fastq1, fastq2,
                                    ncores, should_resume, single_end, fragment_length, fragment_sd)

        print()

        counts = tracer_func.load_kallisto_counts("{}/expression_quantification/abundance.tsv".format(output_dir))

        # Attach the Kallisto TPM to every recombinant, looked up by locus and contig name.
        for locus, recombinants in six.iteritems(cell.all_recombinants):
            if recombinants is not None:
                for rec in recombinants:
                    rec.TPM = counts[locus][rec.contig_name]

        self.print_cell_summary(cell,
                                "{output_dir}/unfiltered_TCR_seqs/unfiltered_TCRs.txt".format(output_dir=output_dir))
        with open("{output_dir}/unfiltered_TCR_seqs/{cell_name}.pkl".format(output_dir=output_dir,
                                                                            cell_name=cell.name), 'wb') as pf:
            pickle.dump(cell, pf, protocol=0)

        print("##Filtering by read count##")
        cell.filter_recombinants()
        fasta_filename = "{output_dir}/filtered_TCR_seqs/{cell_name}_TCRseqs.fa".format(output_dir=output_dir,
                                                                                        cell_name=cell_name)
        with open(fasta_filename, 'w') as fasta_file:
            fasta_file.write(cell.get_fasta_string())
        self.print_cell_summary(cell, "{output_dir}/filtered_TCR_seqs/filtered_TCRs.txt".format(output_dir=output_dir))
        with open("{output_dir}/filtered_TCR_seqs/{cell_name}.pkl".format(output_dir=output_dir,
                                                                          cell_name=cell.name), 'wb') as pf:
            pickle.dump(cell, pf, protocol=0)