Example #1
 def report(self):
     t = reporter(
         name='analysis_snp',
         assay=self.assay,
         sample=self.sample,
         outdir=self.outdir + '/..')
     t.get_report()
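Every example constructs a `reporter` and calls `get_report()`. The class itself is not part of this listing; a minimal stand-in inferred from the call sites (names and behavior are assumptions, not the real implementation) might look like:

# Hypothetical stand-in for the reporter used throughout these examples.
# The real class presumably renders an HTML step report; this sketch only
# captures the interface the call sites rely on.
class reporter:
    def __init__(self, name, assay, sample, outdir,
                 stat_file=None, plot=None, table_file=None,
                 table_header=None, parameters=None, html_flag=True):
        self.name = name            # step name, e.g. 'barcode', 'cutadapt'
        self.assay = assay
        self.sample = sample
        self.outdir = outdir        # call sites pass the step outdir + '/..'
        self.stat_file = stat_file  # optional 'item:value' metrics file
        self.plot = plot
        self.html_flag = html_flag

    def get_report(self):
        # Assumption: merges this step's metrics and plots into
        # {outdir}/{sample}_report.html; here it only logs the call.
        print(f'[{self.name}] updating report for sample {self.sample}')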
Example #2
def analysis_capture_virus(args):

    # check dir
    outdir = args.outdir
    sample = args.sample
    virus_file = args.virus_file
    match_dir = args.match_dir

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # report
    tsne_df_file = glob.glob(f'{match_dir}/*analysis*/*tsne_coord.tsv')[0]
    marker_df_file = glob.glob(f'{match_dir}/*analysis*/*markers.tsv')[0]
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    virus_df = pd.read_csv(virus_file, sep="\t")

    report_prepare(outdir, tsne_df, marker_df, virus_df)

    t = reporter(name='analysis_capture_virus',
                 assay=args.assay,
                 sample=args.sample,
                 outdir=args.outdir + '/..')
    t.get_report()
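Note that `glob.glob(...)[0]` raises a bare IndexError when match_dir contains no analysis output. A hedged helper (hypothetical, not from the original code) that fails with a clearer message:

import glob

def find_one(pattern):
    """Return the single file matching pattern, or fail loudly."""
    hits = glob.glob(pattern)
    if not hits:
        raise FileNotFoundError(f'no file matches {pattern}')
    return hits[0]

# usage, mirroring the calls above:
# tsne_df_file = find_one(f'{match_dir}/*analysis*/*tsne_coord.tsv')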
Example #3
def cutadapt(args):
    # check dir
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # run cutadapt
    adapt = []
    for a in args.adapt:
        adapt.append('-a')
        adapt.append(a)

    out_fq2 = args.outdir + '/' + args.sample + '_clean_2.fq.gz'
    cmd = ['cutadapt'] + adapt + [
        '-n', str(len(args.adapt)),
        '-j', str(args.thread),
        '-m', str(args.minimum_length),
        '--nextseq-trim=' + str(args.nextseq_trim),
        '--overlap', str(args.overlap),
        '-l', str(args.insert),
        '-o', out_fq2,
        args.fq,
    ]
    cutadapt.logger.info('%s' % (' '.join(cmd)))
    res = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    with open(args.outdir + '/cutadapt.log', 'wb') as fh:
        fh.write(res.stdout)

    format_stat(args.outdir + '/cutadapt.log', args.sample)
    t = reporter(name='cutadapt',
                 assay=args.assay,
                 sample=args.sample,
                 stat_file=args.outdir + '/stat.txt',
                 outdir=args.outdir + '/..')
    t.get_report()
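The interleaved `['-a', seq, ...]` list above can also be built in one comprehension, and for concreteness this is roughly what the logged command looks like (adapter and parameter values below are illustrative, not from the source):

# Equivalent, more compact construction of the adapter arguments:
adapt = [token for a in args.adapt for token in ('-a', a)]

# With illustrative values args.adapt=['A{18}', 'AGATCGGAAGAGC'], thread=4,
# minimum_length=20, nextseq_trim=20, overlap=10, insert=150, the logger
# would print roughly:
#   cutadapt -a A{18} -a AGATCGGAAGAGC -n 2 -j 4 -m 20 --nextseq-trim=20 \
#       --overlap 10 -l 150 -o sample_clean_2.fq.gz sample.fq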
Example #4
def analysis_smk(args):
    logger1.info('smk analysis ...!')

    # check dir
    outdir = args.outdir
    sample = args.sample
    tsne_tag_file = args.tsne_tag_file
    match_dir = args.match_dir

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # report
    tsne_df_file = glob.glob(f'{match_dir}/*analysis*/*tsne_coord.tsv')[0]
    marker_df_file = glob.glob(f'{match_dir}/*analysis*/*markers.tsv')[0]
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    tsne_tag_df = pd.read_csv(tsne_tag_file, sep="\t", index_col=0)

    report_prepare(outdir, tsne_df, marker_df, tsne_tag_df)

    t = reporter(
        name='analysis_smk',
        assay=args.assay,
        sample=sample,
        outdir=args.outdir + '/..')
    t.get_report()
Example #5
def sample_info(args):

    sample = args.sample
    ASSAY = ASSAY_DICT[args.assay]
    version = __VERSION__
    outdir = args.outdir
    chemistry = args.chemistry
    if not chemistry:
        chemistry = "Customized"
    #transcriptome = args.genomeDir.split("/")[-1]

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    stat = pd.DataFrame(
        {
            "item": ["Sample ID", "Assay", "Chemistry", "Software Version"],
            "count": [sample, ASSAY, chemistry, version],
        },
        columns=["item", "count"])
    stat_file = outdir + "/stat.txt"
    stat.to_csv(stat_file, sep=":", header=None, index=False)

    t = reporter(name='sample',
                 assay=args.assay,
                 sample=args.sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()
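The stat file written here is two colon-separated columns that later feed the report. With illustrative values (only "Customized" is taken from the code above) it looks like:

# stat.txt (':' is the separator set in to_csv above; values illustrative):
#   Sample ID:test1
#   Assay:Single Cell RNA-Seq
#   Chemistry:Customized
#   Software Version:1.1.8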
Example #6
def analysis_rna_virus(args):

    # check dir
    outdir = args.outdir
    sample = args.sample
    matrix_file = args.matrix_file
    virus_file = args.virus_file

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # run_R
    seurat(sample, outdir, matrix_file)

    # report
    tsne_df_file = glob.glob(
        "{outdir}/*tsne_coord.tsv".format(outdir=outdir))[0]
    marker_df_file = glob.glob(
        "{outdir}/*markers.tsv".format(outdir=outdir))[0]
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    virus_df = pd.read_csv(virus_file, sep="\t")

    report_prepare(outdir, tsne_df, marker_df, virus_df)

    t = reporter(name='analysis_rna_virus',
                 assay=args.assay,
                 sample=args.sample,
                 outdir=args.outdir + '/..')
    t.get_report()
Example #7
 def report(self):
     t = reporter(name=self.step_name,
                  assay=self.assay,
                  sample=self.sample,
                  stat_file=self.stats_file,
                  outdir=self.outdir + '/..',
                  plot=self.plot)
     t.get_report()
Example #8
 def run(self):
     self.read_to_dic()
     self.tag_count()
     t = reporter(name='mapping_smk',
                  assay="smk",
                  sample=self.sample,
                  stat_file=self.stat_file,
                  outdir=self.outdir + '/..')
     t.get_report()
Example #9
def STAR(args):
    # check
    refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check dir
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # run STAR
    outPrefix = args.outdir + '/' + args.sample + '_'
    # cmd = ['STAR', '--runThreadN', str(args.thread), '--genomeDir', args.genomeDir, '--readFilesIn', args.fq, '--readFilesCommand', 'zcat', '--outFilterMultimapNmax', '1', '--outReadsUnmapped', 'Fastx', '--outFileNamePrefix', outPrefix, '--outSAMtype', 'BAM', 'SortedByCoordinate']
    cmd = ['STAR', '--runThreadN', str(args.thread), '--genomeDir', args.genomeDir,
           '--readFilesIn', args.fq, '--readFilesCommand', 'zcat', '--outFilterMultimapNmax',
           '1', '--outFileNamePrefix', outPrefix, '--outSAMtype', 'BAM', 'SortedByCoordinate']
    if args.out_unmapped:
        cmd += ['--outReadsUnmapped', 'Fastx']
    STAR.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)

    STAR.logger.info('picard start...')
    outBam = outPrefix + 'Aligned.sortedByCoord.out.bam'
    region_txt = args.outdir + '/' + args.sample + '_region.log'
    cmd = [
        'picard',
        '-Xmx4G',
        '-XX:ParallelGCThreads=4',
        'CollectRnaSeqMetrics',
        'I=%s' % outBam,
        'O=%s' % region_txt,
        'REF_FLAT=%s' % refFlat,
        'STRAND=NONE',
        'VALIDATION_STRINGENCY=SILENT']
    STAR.logger.info('%s' % (' '.join(cmd)))
    res = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    STAR.logger.info(res.stdout)
    STAR.logger.info('picard done.')

    plot = format_stat(
        args.outdir + '/' + args.sample + '_Log.final.out',
        region_txt,
        args.sample)
    t = reporter(
        name='STAR',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..',
        plot=plot)
    t.get_report()
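`glob_genomeDir` is not shown in this listing; judging from its call sites it returns the refFlat and GTF paths inside `genomeDir`. A hypothetical sketch, with file-name patterns assumed rather than taken from the source:

import glob

def glob_genomeDir(genomeDir):
    """Hypothetical sketch: locate the refFlat and GTF annotation files."""
    refFlat = next(iter(glob.glob(f'{genomeDir}/*.refFlat')), None)
    gtf = next(iter(glob.glob(f'{genomeDir}/*.gtf')), None)
    if refFlat is None or gtf is None:
        raise FileNotFoundError(f'refFlat/gtf not found in {genomeDir}')
    return refFlat, gtf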
Example #10
    def report(self):

        self.stat_file = f'{self.outdir}/stat.txt'
        self.stats.to_csv(self.stat_file, sep=':', header=False)
        t = reporter(name='count_cite',
                     assay=self.assay,
                     sample=self.sample,
                     stat_file=self.stat_file,
                     outdir=self.outdir + '/..')
        t.get_report()
Example #11
 def report(self):
     t = reporter(
         name='demultiplex',
         assay='single-vdj',
         sample=self.samplename,
         outdir=self.outdir + '/..',
         stat_file=f'{self.outdir}/stat.txt',
         html_flag=False,
     )
     t.get_report()
Example #12
def count_capture_rna(args):

    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # check and create the output directory
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # UMI correction; writes a table with header: Barcode, geneID, UMI, count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(f'{args.outdir}/{args.sample}_probe_gene_count.tsv',
                    sep='\t',
                    index=False)

    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, threshold, cell_num,
     CB_describe) = call_cells(df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # output matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str,
     match_UMI_median) = expression_matrix(df, validated_barcodes, args.outdir,
                                           args.sample, id_name,
                                           sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
                match_UMI_median, stat_file, args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(assay=args.assay,
                 name='count_capture_rna',
                 sample=args.sample,
                 stat_file=args.outdir + '/stat.txt',
                 outdir=args.outdir + '/..')
    t.get_report()
Example #13
 def report(self, stat=True):
     if stat:
         stat_file = self.outdir + "/stat.txt"
     else:
         stat_file = ''
     t = reporter(name=self.step,
                  assay=self.assay,
                  sample=self.sample,
                  outdir=self.outdir + '/..',
                  stat_file=stat_file)
     t.get_report()
Example #14
def featureCounts(args):

    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check dir
    if not os.path.exists(args.outdir):
        os.mkdir(args.outdir)

    # run featureCounts
    outPrefix = args.outdir + '/' + args.sample
    cmd = ['featureCounts', '-a', gtf, '-o', outPrefix, '-R', 'BAM',
           '-T', str(args.thread), '-t', args.gtf_type, args.input]
    featureCounts.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)

    subprocess.check_call(['which', 'samtools'])

    # sort by name:BC and umi
    featureCounts.logger.info('samtools sort ...!')
    bam_basename = os.path.basename(args.input)
    cmd = [
        'samtools', 'sort',
        '-n',
        '-@', '3',
        '-o', outPrefix + '_name_sorted.bam',
        args.outdir + '/' + bam_basename + '.featureCounts.bam']
    featureCounts.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)
    featureCounts.logger.info('samtools sort done.')

    format_stat(args.outdir + '/' + args.sample + '.summary', args.sample)
    t = reporter(
        name='featureCounts',
        assay=args.assay,
        sample=args.sample,
        stat_file=args.outdir + '/stat.txt',
        outdir=args.outdir + '/..')
    t.get_report()
Example #15
def count(args):

    # check
    refFlat, gtf = glob_genomeDir(args.genomeDir)

    # check and create the output directory
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # UMI correction; writes a table with header: Barcode, geneID, UMI, count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    bam2table(args.bam, count_detail_file)

    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, threshold, cell_num,
     CB_describe) = call_cells(df, args.cells, pdf, marked_counts_file)

    # 输出matrix
    (CB_total_Genes, CB_reads_count,
     reads_mapped_to_transcriptome) = expression_matrix(
         df, validated_barcodes, args.outdir, args.sample, gtf)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, stat_file,
                args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(assay=args.assay,
                 name='count',
                 sample=args.sample,
                 stat_file=args.outdir + '/stat.txt',
                 outdir=args.outdir + '/..')
    t.get_report()
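`downsample` returns a sequencing-saturation estimate used in the summary. The helper itself is not shown; the conventional definition (an assumption here, not confirmed by the source) is one minus the fraction of sampled reads that carry a distinct (barcode, gene, UMI) combination:

# Assumed saturation formula (the downsample helper is not shown):
def saturation(n_unique, n_reads):
    """1 - distinct (barcode, gene, UMI) combinations / reads sampled."""
    return 1 - n_unique / n_reads if n_reads else 0.0

# e.g. 8_000_000 distinct combinations from 10_000_000 sampled reads -> 0.2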
Example #16
def analysis(args):

    # check dir
    outdir = args.outdir
    sample = args.sample
    matrix_file = args.matrix_file
    save_rds = args.save_rds
    type_marker_tsv = args.type_marker_tsv
    auto_assign_bool = False
    if type_marker_tsv and type_marker_tsv != 'None':
        auto_assign_bool = True
    if auto_assign_bool:
        save_rds = True

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # run_R
    seurat(sample, outdir, matrix_file, save_rds)

    # auto_assign
    if auto_assign_bool:
        auto_assign(sample, outdir, type_marker_tsv)

    # report
    tsne_df_file = f'{outdir}/{sample}_tsne_coord.tsv'
    marker_df_file = f'{outdir}/{sample}_markers.tsv'
    tsne_df = pd.read_csv(tsne_df_file, sep="\t")
    marker_df = pd.read_csv(marker_df_file, sep="\t")
    report_prepare(outdir, tsne_df, marker_df)

    stat_file = outdir + "/stat.txt"
    assay = __ASSAY__
    t = reporter(name='analysis',
                 assay=assay,
                 sample=args.sample,
                 outdir=args.outdir + '/..',
                 stat_file=stat_file)
    t.get_report()
Example #17
def sample_info(args):

    sample = args.sample
    ASSAY = ASSAY_DICT[args.assay]
    version = __VERSION__
    outdir = args.outdir
    chemistry = args.chemistry

    # get chemistry
    if chemistry == 'auto':
        ch = Chemistry(args.fq1)
        chemistry = ch.check_chemistry()

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    stat = pd.DataFrame({
        "item": ["Sample ID", "Assay", "Chemistry", "Software Version"],
        "count": [sample, ASSAY, chemistry, version],
        },
        columns=["item", "count"]
    )
    stat_file = outdir + "/stat.txt"
    stat.to_csv(stat_file, sep=":", header=None, index=False)

    t = reporter(
        name='sample',
        assay=args.assay,
        sample=args.sample,
        stat_file=stat_file,
        outdir=outdir + '/..')
    t.get_report()
    return chemistry
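`Chemistry` is not included in this listing; from this call site it inspects the R1 fastq and returns a chemistry name. A hypothetical minimal interface (the returned value is a placeholder, not real detection logic):

class Chemistry:
    """Hypothetical stand-in: detect the chemistry from an R1 fastq."""

    def __init__(self, fq1):
        self.fq1 = fq1  # path to the R1 fastq file

    def check_chemistry(self):
        # Assumption: the real method matches barcode/linker layouts in the
        # first reads; this placeholder just returns a fixed name.
        return 'Customized'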
Example #18
 def test_report(self):
     t = reporter(assay=self.assay,
                  name='count_capture_rna',
                  sample=self.sample,
                  stat_file=self.outdir + '/stat.txt',
                  outdir=self.outdir + '/..')
     t.get_report()
Example #19
def summary(index_file, count_file, outdir, sample):
    # init
    number = 0
    Number_of_Match_Cells_with_SNP = 0
    SNP_count_dict = defaultdict(int)
    coord_gene_dict = defaultdict(dict)

    # read index
    df_index, df_valid = read_index(index_file)

    # out vcf
    out_vcf = open(f'{outdir}/{sample}.vcf', 'wt')
    for index in df_valid.index:
        vcf_coords_dict = {}
        number += 1
        cell_vcf_file = f'{outdir}/cells/cell{index}/cell{index}_norm.vcf'
        # vcf coords
        with open(cell_vcf_file, 'rt') as f:
            for line in f:
                if line.startswith("#"):
                    # add vcf and bam header
                    if number == 1:
                        new_line = process_vcf_header(line, sample)
                        if new_line:
                            out_vcf.write(new_line)
                    continue
                if line:
                    items = line.split('\t')
                    items[7] += f';CELL={index}'
                    new_line = '\t'.join(items)
                    out_vcf.write(new_line)
                    chrom = str(items[0])
                    pos = int(items[1])
                    if chrom not in vcf_coords_dict:
                        vcf_coords_dict[chrom] = set([pos])
                    else:
                        vcf_coords_dict[chrom].add(pos)
                    SNP_count_dict[index] += 1

        # add bam header
        if number == 1:
            cell_bam_file = f'{outdir}/cells/cell{index}/cell{index}_sorted.bam'
            cell_bam = pysam.AlignmentFile(cell_bam_file, "rb")
            header = cell_bam.header
            out_bam = pysam.AlignmentFile(f'{outdir}/{sample}.bam',
                                          "wb",
                                          header=header)

        # add bam
        if len(vcf_coords_dict) > 0:
            Number_of_Match_Cells_with_SNP += 1
            cell_bam_file = f'{outdir}/cells/cell{index}/cell{index}_sorted.bam'
            cell_bam = pysam.AlignmentFile(cell_bam_file, "rb")
            for read in cell_bam:
                bam_ref = str(read.reference_name)
                gene_name = read.get_tag('GN')
                aligned_pairs = read.get_aligned_pairs()
                align_dict = {}
                for pair in aligned_pairs:
                    read_pos, ref_pos = pair
                    # ref_pos is None for insertions; position 0 is a valid
                    # coordinate, so test against None instead of truthiness
                    if ref_pos is not None:
                        align_dict[ref_pos] = read_pos
                if bam_ref in vcf_coords_dict.keys():
                    read_flag = False
                    for pos in vcf_coords_dict[bam_ref]:
                        if pos in align_dict:
                            read_flag = True
                            coord_gene_dict[bam_ref][pos] = gene_name
                    if read_flag:
                        out_bam.write(read)

    out_vcf.close()
    out_bam.close()
    pysam.sort("-o", f'{outdir}/{sample}_sorted.bam', f'{outdir}/{sample}.bam')
    cmd = f'samtools index {outdir}/{sample}_sorted.bam'
    os.system(cmd)

    # annotate vcf
    anno_vcf = open(f'{outdir}/{sample}_anno.vcf', 'wt')
    with open(f'{outdir}/{sample}.vcf', 'rt') as vcf:
        for line in vcf:
            if line.startswith('#'):
                anno_vcf.write(line)
                continue
            items = line.split('\t')
            chrom = str(items[0])
            pos = int(items[1])
            gene_name = coord_gene_dict[chrom][pos]
            items[7] += f';GENE={gene_name}'
            new_line = '\t'.join(items)
            anno_vcf.write(new_line)
    anno_vcf.close()

    # rm
    #os.remove(f'{outdir}/{sample}.vcf')
    #os.remove(f'{outdir}/{sample}.bam')

    # stat
    stats = pd.Series()
    n_match_cell = len(df_index.index)

    df_count = pd.read_csv(count_file, sep='\t')
    df_count_read = df_count.groupby('barcode').agg({'read_count': sum})
    read_total = sum(df_count_read['read_count'])
    Mean_Reads_per_Cell = round((read_total / n_match_cell), 2)
    stats = stats.append(
        pd.Series(Mean_Reads_per_Cell, index=['Mean Reads per Cell']))
    df_count_UMI = df_count.groupby('barcode').agg({'UMI': 'count'})
    UMI_total = sum(df_count_UMI['UMI'])
    Mean_UMIs_per_Cell = round((UMI_total / n_match_cell), 2)
    stats = stats.append(
        pd.Series(Mean_UMIs_per_Cell, index=['Mean UMIs per Cell']))

    stats = stats.append(
        pd.Series(format_stat(Number_of_Match_Cells_with_SNP, n_match_cell),
                  index=['Number of Cells with Variants']))

    SNP_counts = list(SNP_count_dict.values())
    Mean_SNP_per_Cell = round(np.mean(SNP_counts), 3)
    stats = stats.append(
        pd.Series(Mean_SNP_per_Cell,
                  index=['Mean Variants per Cell with Variants']))

    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='snpCalling',
                 assay='snp',
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()
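The `format_stat` called here takes a count and a total, unlike the single-file variants used by earlier steps. A plausible (assumed) definition matching how "Number of Cells with Variants" is rendered:

# Assumed helper: render a count together with its percentage of a total.
def format_stat(count, total):
    return f'{count}({round(count / total * 100, 2)}%)'

# format_stat(50, 200) -> '50(25.0%)'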
Example #20
def count(args):
    # args
    outdir = args.outdir
    sample = args.sample
    assay = args.assay
    cells = args.cells
    rescue = args.rescue

    # check
    refFlat, gtf_file = glob_genomeDir(args.genomeDir)

    # check and create the output directory
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # UMI correction; writes a table with header: Barcode, geneID, UMI, count
    count_detail_file = outdir + '/' + sample + '_count_detail.txt.gz'
    bam2table(args.bam, count_detail_file)

    df = pd.read_table(count_detail_file, header=0)

    # export all matrix
    dir_name = 'all_matrix'
    matrix_10X(df, outdir, sample, gtf_file, dir_name=dir_name)

    # call cells
    pdf = outdir + '/barcode_filter_magnitude.pdf'
    df_sum, threshold = call_cells(df, cells, pdf)

    # rescue low UMI cells
    if rescue:
        matrix_dir = f"{outdir}/{sample}_{dir_name}/"
        threshold = rescue_cells(outdir, sample, matrix_dir, threshold)

    # get cell stats
    marked_counts_file = outdir + '/' + sample + '_counts.txt'
    validated_barcodes, CB_describe = get_cell_stats(df_sum, threshold,
                                                     marked_counts_file)

    # export cell matrix
    matrix_10X(df,
               outdir,
               sample,
               gtf_file,
               dir_name='matrix_10X',
               validated_barcodes=validated_barcodes)
    (CB_total_Genes, CB_reads_count,
     reads_mapped_to_transcriptome) = expression_matrix(
         df, validated_barcodes, outdir, sample, gtf_file)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = outdir + '/stat.txt'
    get_summary(df, sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, stat_file,
                outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, outdir + '/..')

    t = reporter(assay=assay,
                 name='count',
                 sample=args.sample,
                 stat_file=outdir + '/stat.txt',
                 outdir=outdir + '/..')
    t.get_report()
Example #21
def barcode(args):

    # check dir
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % args.outdir)

    bc_pattern = __PATTERN_DICT__[args.chemistry]
    if (bc_pattern):
        (linker, whitelist) = get_scope_bc(args.chemistry)
    else:
        bc_pattern = args.pattern
        linker = args.linker
        whitelist = args.whitelist
    if (not linker) or (not whitelist) or (not bc_pattern):
        barcode.logger.error("invalid chemistry or [pattern,linker,whitelist]")
        sys.exit()

    # parse pattern to dict, C8L10C8L10C8U8
    # defaultdict(<type 'list'>, {'C': [[0, 8], [18, 26], [36, 44]], 'U':
    # [[44, 52]], 'L': [[8, 18], [26, 36]]})
    pattern_dict = parse_pattern(bc_pattern)

    # check linker
    check_seq(linker, pattern_dict, "L")

    bool_T = 'T' in pattern_dict
    bool_L = 'L' in pattern_dict

    C_len = sum([item[1] - item[0] for item in pattern_dict['C']])

    barcode_qual_Counter = Counter()
    umi_qual_Counter = Counter()
    C_U_base_Counter = Counter()
    args.lowQual = ord2chr(args.lowQual)

    # generate list with mismatch 1, substitute one base in raw sequence with
    # A,T,C,G
    barcode_dict = generate_seq_dict(whitelist, n=1)
    linker_dict = generate_seq_dict(linker, n=2)

    fq1_list = args.fq1.split(",")
    fq2_list = args.fq2.split(",")
    # merge multiple fastq files
    if len(fq1_list) > 1:
        barcode.logger.info("merge fastq with same sample name...")
        fastq_dir = args.outdir + "/../merge_fastq"
        if not os.path.exists(fastq_dir):
            os.system('mkdir -p %s' % fastq_dir)
        fastq1_file = f"{fastq_dir}/{args.sample}_1.fq.gz"
        fastq2_file = f"{fastq_dir}/{args.sample}_2.fq.gz"
        fq1_files = " ".join(fq1_list)
        fq2_files = " ".join(fq2_list)
        fq1_cmd = f"cat {fq1_files} > {fastq1_file}"
        fq2_cmd = f"cat {fq2_files} > {fastq2_file}"
        barcode.logger.info(fq1_cmd)
        os.system(fq1_cmd)
        barcode.logger.info(fq2_cmd)
        os.system(fq2_cmd)
        barcode.logger.info("merge fastq done.")
    else:
        fastq1_file = args.fq1
        fastq2_file = args.fq2

    fh1 = xopen(fastq1_file)
    fh2 = xopen(fastq2_file)
    out_fq2 = args.outdir + '/' + args.sample + '_2.fq.gz'
    fh3 = xopen(out_fq2, 'w')

    (total_num, clean_num, no_polyT_num, lowQual_num,
     no_linker_num, no_barcode_num) = (0, 0, 0, 0, 0, 0)
    Barcode_dict = defaultdict(int)

    if args.nopolyT:
        fh1_without_polyT = xopen(args.outdir + '/noPolyT_1.fq', 'w')
        fh2_without_polyT = xopen(args.outdir + '/noPolyT_2.fq', 'w')

    if args.noLinker:
        fh1_without_linker = xopen(args.outdir + '/noLinker_1.fq', 'w')
        fh2_without_linker = xopen(args.outdir + '/noLinker_2.fq', 'w')

    bool_probe = False
    if args.probe_file and args.probe_file != 'None':
        bool_probe = True
        count_dic = genDict(dim=3)
        valid_count_dic = genDict(dim=2)
        probe_dic = read_fasta(args.probe_file)
        reads_without_probe = 0

    g1 = read_fastq(fh1)
    g2 = read_fastq(fh2)

    while True:
        try:
            (header1, seq1, qual1) = next(g1)
            (header2, seq2, qual2) = next(g2)
        except StopIteration:
            break
        if total_num > 0 and total_num % 1000000 == 0:
            barcode.logger.info(
                f'processed reads: {format_number(total_num)}. '
                f'valid reads: {format_number(clean_num)}.'
            )

        total_num += 1

        # polyT filter
        if bool_T:
            polyT = seq_ranges(seq1, pattern_dict['T'])
            if no_polyT(polyT):
                no_polyT_num += 1
                if args.nopolyT:
                    fh1_without_polyT.write(
                        '@%s\n%s\n+\n%s\n' % (header1, seq1, qual1))
                    fh2_without_polyT.write(
                        '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2))
                continue

        # lowQual filter
        C_U_quals_ascii = seq_ranges(
            qual1, pattern_dict['C'] + pattern_dict['U'])
        # C_U_quals_ord = [ord(q) - 33 for q in C_U_quals_ascii]
        if low_qual(C_U_quals_ascii, args.lowQual, args.lowNum):
            lowQual_num += 1
            continue

        # linker filter
        barcode_arr = [seq_ranges(seq1, [i]) for i in pattern_dict['C']]
        raw_cb = ''.join(barcode_arr)
        if bool_L:
            linker = seq_ranges(seq1, pattern_dict['L'])
            if (no_linker(linker, linker_dict)):
                no_linker_num += 1

                if args.noLinker:
                    fh1_without_linker.write(
                        '@%s\n%s\n+\n%s\n' % (header1, seq1, qual1))
                    fh2_without_linker.write(
                        '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2))
                continue

        # barcode filter
        # barcode_arr = [seq_ranges(seq1, [i]) for i in pattern_dict['C']]
        # raw_cb = ''.join(barcode_arr)
        res = no_barcode(barcode_arr, barcode_dict)
        if res is True:
            no_barcode_num += 1
            continue
        elif res == "correct":
            cb = raw_cb
        else:
            cb = res

        umi = seq_ranges(seq1, pattern_dict['U'])
        Barcode_dict[cb] += 1
        clean_num += 1
        read_name_probe = 'None'

        if bool_probe:
            # valid count
            valid_count_dic[cb][umi] += 1

            # output probe UMi and read count
            find_probe = False
            for probe_name in probe_dic:
                probe_seq = probe_dic[probe_name]
                probe_seq = probe_seq.upper()
                if seq1.find(probe_seq) != -1:
                    count_dic[probe_name][cb][umi] += 1
                    read_name_probe = probe_name
                    find_probe = True
                    break

            if not find_probe:
                reads_without_probe += 1

        barcode_qual_Counter.update(C_U_quals_ascii[:C_len])
        umi_qual_Counter.update(C_U_quals_ascii[C_len:])
        C_U_base_Counter.update(raw_cb + umi)

        # new readID: @barcode_umi_old readID
        fh3.write(f'@{cb}_{umi}_{read_name_probe}_{total_num}\n{seq2}\n+\n{qual2}\n')

    fh3.close()

    # logging
    if total_num % 1000000 != 0:
        barcode.logger.info(
            f'processed reads: {format_number(total_num)}. '
            f'valid reads: {format_number(clean_num)}. '
        )

    if clean_num == 0:
        raise Exception(
            'no valid reads found! please check the --chemistry parameter.')

    if bool_probe:
        # total probe summary
        total_umi = 0
        total_valid_read = 0
        for cb in valid_count_dic:
            total_umi += len(valid_count_dic[cb])
            total_valid_read += sum(valid_count_dic[cb].values())
        barcode.logger.info("total umi:"+str(total_umi))
        barcode.logger.info("total valid read:"+str(total_valid_read))
        barcode.logger.info("reads without probe:"+str(reads_without_probe))

        # probe summary
        count_list = []
        for probe_name in probe_dic:
            UMI_count = 0
            read_count = 0
            if probe_name in count_dic:
                for cb in count_dic[probe_name]:
                    UMI_count += len(count_dic[probe_name][cb])
                    read_count += sum(count_dic[probe_name][cb].values())
            count_list.append(
                {"probe_name": probe_name, "UMI_count": UMI_count, "read_count": read_count})

        df_count = pd.DataFrame(count_list, columns=[
                                "probe_name", "read_count", "UMI_count"])

        def format_percent(x):
            x = str(round(x*100, 2))+"%"
            return x
        df_count["read_fraction"] = (
            df_count["read_count"]/total_valid_read).apply(format_percent)
        df_count["UMI_fraction"] = (
            df_count["UMI_count"]/total_umi).apply(format_percent)
        df_count.sort_values(by="UMI_count", inplace=True, ascending=False)
        df_count_file = args.outdir + '/' + args.sample + '_probe_count.tsv'
        df_count.to_csv(df_count_file, sep="\t", index=False)

    # stat
    q30 = ord2chr(30)
    BarcodesQ30 = sum(barcode_qual_Counter[k] for k in barcode_qual_Counter
                      if k >= q30) / float(sum(barcode_qual_Counter.values())) * 100
    UMIsQ30 = sum(umi_qual_Counter[k] for k in umi_qual_Counter
                  if k >= q30) / float(sum(umi_qual_Counter.values())) * 100

    global stat_info
    def cal_percent(x):
        return "{:.2%}".format((x + 0.0) / total_num)
    with open(args.outdir + '/stat.txt', 'w') as fh:
        """
        Raw Reads: %s
        Valid Reads: %s(%s)
        Q30 of Barcodes: %.2f%%
        Q30 of UMIs: %.2f%%
        """
        stat_info = stat_info % (format_number(total_num), format_number(clean_num),
                                 cal_percent(clean_num), BarcodesQ30,
                                 UMIsQ30)
        stat_info = re.sub(r'^\s+', r'', stat_info, flags=re.M)
        fh.write(stat_info)

    barcode.logger.info('fastqc ...!')
    cmd = ['fastqc', '-t', str(args.thread), '-o', args.outdir, out_fq2]
    barcode.logger.info('%s' % (' '.join(cmd)))
    subprocess.check_call(cmd)
    barcode.logger.info('fastqc done!')

    t = reporter(name='barcode', assay=args.assay, sample=args.sample,
                 stat_file=args.outdir + '/stat.txt', outdir=args.outdir + '/..')
    t.get_report()
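The mismatch dictionaries built by `generate_seq_dict` drive the barcode and linker correction above. The helper is not shown; a hypothetical sketch of the n=1 case described in the comment (substitute one base at each position with A/T/C/G and map the result back to the original sequence):

def one_mismatch(seq):
    """All sequences exactly one substitution away from seq."""
    for i in range(len(seq)):
        for base in 'ACGT':
            if base != seq[i]:
                yield seq[:i] + base + seq[i + 1:]

def build_seq_dict(seqs):
    """Map each sequence and its 1-mismatch neighbors back to the original."""
    d = {}
    for s in seqs:
        d[s] = s                   # exact match corrects to itself
        for mut in one_mismatch(s):
            d.setdefault(mut, s)   # 1-mismatch corrects to the true sequence
    return d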
Example #22
def summary(fq, alignments, type, outdir, sample, assay, debug):
    chains = CHAINS[type]

    '''
    # out files
    UMI_unfiltered_file = f'{outdir}/{sample}_UMI_unfiltered.tsv'
    UMI_filtered1_file = f'{outdir}/{sample}_UMI_filtered1.tsv'
    UMI_filtered2_file = f'{outdir}/{sample}_UMI_filtered2.tsv'
    '''

    UMI_count_unfiltered_file = f'{outdir}/{sample}_UMI_count_unfiltered.tsv'
    UMI_count_filtered1_file = f'{outdir}/{sample}_UMI_count_filtered1.tsv'

    # read fq
    read2 = gzip.open(fq, "rt")
    index = 0
    read_row_list = []
    while True:
        line1 = read2.readline()
        line2 = read2.readline()
        line3 = read2.readline()
        line4 = read2.readline()
        if not line4:
            break
        attr = str(line1).strip("@").split("_")
        barcode = str(attr[0])
        umi = str(attr[1])
        dic = {"readId": index, "barcode": barcode, "UMI": umi}
        read_row_list.append(dic)
        index += 1
    df_read = pd.DataFrame(read_row_list, columns=["readId", "barcode", "UMI"])
    mapping_vdj.logger.info("fq reads to dataframe done.")
    read2.close()
    total_read = df_read.shape[0]

    # init row list
    mapping_summary_row_list = []

    # mapped
    alignment = pd.read_csv(alignments, sep="\t")
    alignment.readId = alignment.readId.astype(int)
    align_read = alignment.shape[0]
    df_read.readId = df_read.readId.astype(int)
    df_align = pd.merge(df_read, alignment, on="readId", how="right")

    mapping_summary_row_list.append({
        "item": "Reads Mapped to Any VDJ Gene",
        "count": align_read,
        "total_count": total_read,
    })

    # CDR3
    df_CDR3 = df_align[~pd.isnull(df_align["aaSeqCDR3"])]
    align_read_with_CDR3 = df_CDR3.shape[0]
    mapping_summary_row_list.append({
        "item": "Reads with CDR3",
        "count": align_read_with_CDR3,
        "total_count": total_read,
    })

    # correct CDR3
    df_correct_CDR3 = df_CDR3[~(df_CDR3["aaSeqCDR3"].str.contains(r"\*"))]
    align_read_with_correct_CDR3 = df_correct_CDR3.shape[0]
    mapping_summary_row_list.append({
        "item": "Reads with Correct CDR3",
        "count": align_read_with_correct_CDR3,
        "total_count": total_read,
    })

    # VDJ
    df_VJ = df_correct_CDR3[
        (~pd.isnull(df_correct_CDR3['bestVGene'])) &
        (~pd.isnull(df_correct_CDR3['bestJGene']))
    ]
    # keep reads whose V and J genes come from the same chain (first 3 letters)
    df_VJ = df_VJ[df_VJ.bestVGene.str[:3] == df_VJ.bestJGene.str[:3]].copy()
    df_VJ["chain"] = df_VJ.bestVGene.str[:3]
    df_VJ["VJ_pair"] = df_VJ["bestVGene"] + "_" + df_VJ["bestJGene"]
    Reads_Mapped_Confidently_to_VJ_Gene = df_VJ.shape[0]
    mapping_summary_row_list.append({
        "item": "Reads Mapped Confidently to VJ Gene",
        "count": Reads_Mapped_Confidently_to_VJ_Gene,
        "total_count": total_read
    })

    # chain
    for chain in chains:
        df_chain = df_VJ[df_VJ.chain == chain]
        Reads_Mapped_to_chain = df_chain.shape[0]
        mapping_summary_row_list.append({
            "item": f"Reads Mapped to {chain}",
            "count": Reads_Mapped_to_chain,
            "total_count": total_read,
        })

    # unique UMI
    df_UMI = df_VJ.drop_duplicates(subset=["barcode", "UMI"], keep="first")

    # filter1: keep top 1 in each combinations
    groupby_elements = [
        'barcode',
        'chain',
        'bestVGene',
        'bestJGene',
        'aaSeqCDR3',
        'nSeqCDR3',
    ]
    df_UMI_count = df_UMI.groupby(
        groupby_elements, as_index=False).agg({"UMI": "count"})
    df_UMI_count = df_UMI_count.sort_values("UMI", ascending=False)
    # out unfiltered
    df_UMI_count.to_csv(UMI_count_unfiltered_file, sep="\t", index=False)

    df_UMI_count_filter1 = df_UMI_count.groupby(
        ["barcode", "chain"], as_index=False).head(1)
    # out filtered1
    df_UMI_count_filter1.to_csv(
        UMI_count_filtered1_file,
        sep="\t",
        index=False)

    if debug:
        unique_UMI = df_UMI.shape[0]
        mapping_summary_row_list.append({
            "item": "UMI unique count",
            "count": unique_UMI,
            "total_count": align_read_with_correct_CDR3,
        })
        UMI_after_Contamination_Filtering = df_UMI_count_filter1.UMI.sum()
        mapping_summary_row_list.append({
            "item": "UMI after Contamination Filtering",
            "count": UMI_after_Contamination_Filtering,
            "total_count": unique_UMI,
        })

    # stat file
    df = pd.DataFrame(
        mapping_summary_row_list,
        columns=[
            "item",
            "count",
            "total_count"])
    stat_file = f'{outdir}/stat.txt'
    gen_stat(df, stat_file)

    # report
    STEP = 'mapping_vdj'
    name = f'{type}_{STEP}'
    t = reporter(
        name=name,
        sample=sample,
        stat_file=stat_file,
        outdir=outdir + '/..',
        assay=assay,
    )
    t.get_report()
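The header parsing above relies on the read-name convention written by the barcode step (`@{barcode}_{umi}_{probe}_{read_number}`), so barcode and UMI are recovered by position:

# Minimal check of the read-name convention assumed by this step:
header = '@ACGTACGTAC_TTGCA_None_12345'
attr = header.strip('@').split('_')
barcode, umi = attr[0], attr[1]
assert (barcode, umi) == ('ACGTACGTAC', 'TTGCA')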
Example #23
def count_vdj(args):

    sample = args.sample
    match_dir = args.match_dir
    UMI_min = args.UMI_min
    outdir = args.outdir
    UMI_count_filter1_file = args.UMI_count_filter1_file
    type = args.type
    debug = args.debug
    iUMI = int(args.iUMI)
    chains = CHAINS[type]

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    # out file
    cell_confident_file = f"{outdir}/{sample}_cell_confident.tsv"
    cell_confident_count_file = f"{outdir}/{sample}_cell_confident_count.tsv"
    clonetypes_file = f"{outdir}/{sample}_clonetypes.tsv"
    match_clonetypes_file = f"{outdir}/{sample}_match_clonetypes.tsv"
    top10_clonetypes_file = f"{outdir}/{sample}_top10_clonetypes.tsv"
    match_top10_clonetypes_file = f"{outdir}/{sample}_match_top10_clonetypes.tsv"

    # read file
    df_UMI_count_filter1 = pd.read_csv(UMI_count_filter1_file, sep='\t')
    if (not match_dir) or (match_dir == "None"):
        match_bool = False
    else:
        match_bool = True
    if match_bool:
        match_cell_barcodes, match_cell_number = read_barcode_file(match_dir)

    cell_summary_row_list = []

    # cell calling: keep barcodes with UMI >= UMI_min
    df_UMI_sum = df_UMI_count_filter1.groupby(['barcode'], as_index=False).agg(
        {"UMI": "sum"})
    if (UMI_min == "auto"):
        rank = 20
        df_UMI_sum_sorted = df_UMI_sum.sort_values(["UMI"], ascending=False)
        rank_UMI = df_UMI_sum_sorted.iloc[rank, :]["UMI"]
        UMI_min = int(rank_UMI / 10)
    else:
        UMI_min = int(UMI_min)
    df_UMI_cell = df_UMI_sum[df_UMI_sum.UMI >= UMI_min]
    df_UMI_sum["mark"] = df_UMI_sum["UMI"].apply(lambda x: "CB"
                                                 if (x >= UMI_min) else "UB")
    report_prepare(df_UMI_sum, outdir + "/../")

    cell_barcodes = set(df_UMI_cell.barcode)
    cell_number = len(cell_barcodes)
    cell_summary_row_list.append({
        "item": "Estimated Number of Cells",
        "count": cell_number,
        "total_count": cell_number,
    })

    # df_UMI_count_filter1 in cell
    df_cell = df_UMI_count_filter1[df_UMI_count_filter1.barcode.isin(
        cell_barcodes)]
    # filter2: keep cells with UMI >= iUMI for identical receptor type and CDR3
    # combinations
    df_cell_UMI_count_filter2 = df_cell[df_cell.UMI >= iUMI]

    # cell confident
    df_cell_confident = df_cell_UMI_count_filter2[
        df_cell_UMI_count_filter2["chain"].isin(chains)]
    df_cell_confident = df_cell_confident.sort_values("UMI", ascending=False)
    df_cell_confident = df_cell_confident.groupby(["barcode", "chain"],
                                                  as_index=False).head(1)

    # count
    df_cell_confident_count = df_cell_confident.set_index(["barcode", "chain"])
    df_cell_confident_count = df_cell_confident_count.unstack()
    df_cell_confident_count.columns = [
        '_'.join(col) for col in df_cell_confident_count
    ]
    df_cell_confident_count = df_cell_confident_count.reset_index()
    df_cell_confident_count.fillna(inplace=True, value="NA")

    # clonetypes
    seqs = ["aaSeqCDR3", "nSeqCDR3"]
    cols = []
    for chain in chains:
        for seq in seqs:
            cols.append("_".join([seq, chain]))

    for col in cols:
        if not (col in list(df_cell_confident_count.columns)):
            df_cell_confident_count[col] = "NA"

    df_clonetypes = df_cell_confident_count.copy()

    df_clonetypes = df_clonetypes.groupby(cols, as_index=False).agg(
        {"barcode": "count"})
    # put na last
    df_clonetypes.replace('NA', np.nan, inplace=True)
    df_clonetypes.sort_values(["barcode"] + cols,
                              ascending=False,
                              na_position='last',
                              inplace=True)
    df_clonetypes.replace(np.nan, 'NA', inplace=True)

    total_CDR3_barcode_number = sum(df_clonetypes.barcode)
    df_clonetypes["percent"] = df_clonetypes.barcode / \
        total_CDR3_barcode_number * 100
    df_clonetypes["percent"] = df_clonetypes["percent"].apply(
        lambda x: round(x, 2))

    # add clonetype ID
    df_clonetypes = df_clonetypes.reset_index()
    df_clonetypes["clonetype_ID"] = pd.Series(df_clonetypes.index) + 1
    df_clonetypes.drop(columns=["index"], inplace=True)

    # order
    order = ["clonetype_ID"] + cols + ["barcode", "percent"]
    df_clonetypes = df_clonetypes[order]
    df_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True)
    # out clonetypes
    df_clonetypes.to_csv(clonetypes_file, sep="\t", index=False)

    if type == "TCR":

        UMI_col_dic = {"TRA": "UMI_TRA", "TRB": "UMI_TRB"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                df_cell_confident_count[UMI_col_name].replace("NA",
                                                              0,
                                                              inplace=True)
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item":
                "Median {chain} UMIs per Cell".format(chain=chain),
                "count":
                Median_chain_UMIs_per_Cell,
                "total_count":
                np.nan
            })

        df_TRA_TRB = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_TRA != "NA")
            & (df_cell_confident_count.aaSeqCDR3_TRB != "NA")]
        cell_with_confident_TRA_and_TRB = df_TRA_TRB.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with TRA and TRB",
            "count": cell_with_confident_TRA_and_TRB,
            "total_count": cell_number,
        })
        """
        df cell barcode filter
        intersect cell_barcodes from scRNA-Seq with barcode from TCR seq
        """
        if match_bool:
            cell_with_match_barcode = match_cell_barcodes.intersection(
                cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            df_match_TRA_TRB = df_match[(df_match.aaSeqCDR3_TRA != "NA")
                                        & (df_match.aaSeqCDR3_TRB != "NA")]
            match_cell_with_TRA_and_TRB = df_match_TRA_TRB.shape[0]

            cell_summary_row_list.append({
                "item": "Cell with Barcode Match",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number,
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, TRA and TRB",
                "count": match_cell_with_TRA_and_TRB,
                "total_count": cell_number,
            })

    # BCR
    elif type == "BCR":

        UMI_col_dic = {"IGH": "UMI_IGH", "IGL": "UMI_IGL", "IGK": "UMI_IGK"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                df_cell_confident_count[UMI_col_name].replace("NA",
                                                              0,
                                                              inplace=True)
                df_cell_confident_count_over_zero = df_cell_confident_count[
                    df_cell_confident_count[UMI_col_name] > 0]
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count_over_zero[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item":
                "Median {chain} UMIs per Cell".format(chain=chain),
                "count":
                Median_chain_UMIs_per_Cell,
                "total_count":
                np.nan
            })

        df_heavy_and_light = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_IGH != "NA")
            & ((df_cell_confident_count.aaSeqCDR3_IGL != "NA")
               | (df_cell_confident_count.aaSeqCDR3_IGK != "NA"))]
        Cell_with_Heavy_and_Light_Chain = df_heavy_and_light.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with Heavy and Light Chain",
            "count": Cell_with_Heavy_and_Light_Chain,
            "total_count": cell_number
        })
        """
        df cell barcode filter
        intersect cell_barcodes from normal scRNA-Seq with barcode from BCR seq
        """
        if match_bool:
            cell_with_match_barcode = match_cell_barcodes.intersection(
                cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            # median match UMI
            df_match_heavy_light = df_match[
                (df_match.aaSeqCDR3_IGH != "NA")
                & ((df_match.aaSeqCDR3_IGL != "NA")
                   | (df_match.aaSeqCDR3_IGK != "NA"))]
            match_cell_with_heavy_and_light = df_match_heavy_light.shape[0]

            cell_summary_row_list.append({
                "item": "Cell with Barcode Match ",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, Heavy and Light Chain",
                "count": match_cell_with_heavy_and_light,
                "total_count": cell_number
            })

    if match_bool:
        """
        df_match_clonetypes
        """
        df_match_clonetypes = df_match.groupby(cols, as_index=False).agg(
            {"barcode": "count"})
        total_match_CDR3_barcode_number = sum(df_match_clonetypes.barcode)
        df_match_clonetypes["percent"] = df_match_clonetypes.barcode / \
            total_match_CDR3_barcode_number * 100
        df_match_clonetypes["percent"] = df_match_clonetypes["percent"].apply(
            lambda x: round(x, 2))
        df_match_clonetypes.rename(columns={"barcode": "barcode_count"},
                                   inplace=True)
        df_match_clonetypes = df_match_clonetypes.merge(df_clonetypes,
                                                        on=cols,
                                                        how='left',
                                                        suffixes=('', '_y'))
        # order and drop duplicated cols
        order = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        df_match_clonetypes = df_match_clonetypes[order]
        df_match_clonetypes.sort_values(["barcode_count", "clonetype_ID"],
                                        ascending=[False, True],
                                        inplace=True)
        df_match_clonetypes.to_csv(match_clonetypes_file,
                                   sep="\t",
                                   index=False)

    df_mergeID = pd.merge(df_cell_confident_count,
                          df_clonetypes,
                          how="left",
                          on=cols)
    df_mergeID.sort_values(["clonetype_ID", "barcode"], inplace=True)
    # output df_cell_confident_count
    df_mergeID.to_csv(cell_confident_count_file, sep="\t", index=False)
    df_mergeID = df_mergeID[["barcode", "clonetype_ID"]]
    df_cell_confident_with_ID = pd.merge(df_cell_confident,
                                         df_mergeID,
                                         how="left",
                                         on="barcode")
    df_cell_confident_with_ID.sort_values(["clonetype_ID", "barcode", "chain"],
                                          inplace=True)
    # output df_cell_confident
    df_cell_confident_with_ID.to_csv(cell_confident_file,
                                     sep="\t",
                                     index=False)

    # summary file
    cell_summary = pd.DataFrame(cell_summary_row_list,
                                columns=["item", "count", "total_count"])
    cell_summary["count"] = cell_summary["count"].apply(int)
    cell_summary["percent"] = cell_summary["count"] / \
        (cell_summary.total_count.astype("float")) * 100
    cell_summary["percent"] = cell_summary["percent"].apply(
        lambda x: round(x, 2))
    cell_summary["count"] = cell_summary["count"].apply(format_number)

    def percent_str_func(row):
        need_percent = bool(
            re.search("Cell with", row["item"], flags=re.IGNORECASE))
        if need_percent:
            return "(" + str(row["percent"]) + "%)"
        else:
            return ""

    cell_summary["percent_str"] = cell_summary.apply(
        lambda row: percent_str_func(row), axis=1)

    # stat file
    def gen_stat(summary, stat_file):
        stat = summary
        stat["new_count"] = stat["count"].astype(str) + stat["percent_str"]
        stat = stat.loc[:, ["item", "new_count"]]
        stat.to_csv(stat_file, sep=":", header=None, index=False)

    cell_stat_file = "{}/stat.txt".format(outdir)
    gen_stat(cell_summary, cell_stat_file)
    name = type + '_count_vdj'
    t = reporter(
        name=name,
        sample=args.sample,
        stat_file=cell_stat_file,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={"iUMI": iUMI},
    )
    t.get_report()

    # clonetypes table
    def format_table(df_clonetypes, top10_clonetypes_file):
        top10_clonetypes_df = df_clonetypes.head(10)
        top10_clonetypes_df = top10_clonetypes_df.reset_index(drop=True)
        top10_clonetypes_df.index = top10_clonetypes_df.index + 1
        top10_clonetypes_df["percent"] = top10_clonetypes_df["percent"].apply(
            lambda x: str(x) + "%")
        seqs = ["aaSeqCDR3"]
        cols = []
        for chain in chains:
            for seq in seqs:
                cols.append("_".join([seq, chain]))
        top10_cols = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        top10_clonetypes_df = top10_clonetypes_df[top10_cols]
        top10_clonetypes_df.to_csv(top10_clonetypes_file,
                                   sep="\t",
                                   index=False)
        table_header = ["Clonetype_ID"] + cols + ["Frequency", "Percent"]
        return table_header

    table_header = format_table(df_clonetypes, top10_clonetypes_file)
    use_top10_clonetypes_file = top10_clonetypes_file
    section_header = 'Top10 clonetypes'
    if match_bool:
        format_table(df_match_clonetypes, match_top10_clonetypes_file)
        use_top10_clonetypes_file = match_top10_clonetypes_file
        section_header = 'Match Top10 clonetypes'

    t = reporter(
        name="clonetypes",
        sample=args.sample,
        table_file=use_top10_clonetypes_file,
        table_header=table_header,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={'section_header': section_header},
    )
    t.get_report()

    # other_metrics_file
Example #24
def count_smk(args):

    read_file = args.read_file
    match_dir = args.match_dir
    tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    UMI_min = args.UMI_min
    SNR_min = args.SNR_min
    dim = int(args.dim)
    combine_cluster = args.combine_cluster
    outdir = args.outdir
    sample = args.sample
    assay = args.assay

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # stat_row
    stats = pd.Series()

    # process
    match_barcode, cell_total = read_barcode_file(match_dir)

    UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
    tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
    cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
    cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
    if combine_cluster:
        combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
        combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

    df_read_count = pd.read_csv(read_file, sep="\t", index_col=0)
    mapped_read = df_read_count['read_count'].sum()

    # in cell
    df_read_count_in_cell = df_read_count[df_read_count.index.isin(
        match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())
    stats = stats.append(
        pd.Series(format_stat(mapped_read_in_cell, mapped_read),
                  index=['Mapped Reads in Cells']))

    # UMI
    df_UMI_in_cell = df_read_count_in_cell.reset_index().groupby(
        ['barcode', 'SMK_barcode_name']).agg({'UMI': 'count'})
    df_UMI_in_cell = df_UMI_in_cell.reset_index()
    df_UMI_in_cell = df_UMI_in_cell.pivot(index='barcode',
                                          columns='SMK_barcode_name',
                                          values='UMI')
    df_cell = pd.DataFrame(index=match_barcode)
    df_UMI_cell = pd.merge(df_cell,
                           df_UMI_in_cell,
                           how="left",
                           left_index=True,
                           right_index=True)

    # fillna
    df_UMI_cell.fillna(0, inplace=True)
    df_UMI_cell = df_UMI_cell.astype(int)

    # UMI
    UMIs = df_UMI_cell.apply(sum, axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)
    stats = stats.append(pd.Series(str(median), index=['Median UMI per Cell']))

    stats = stats.append(pd.Series(str(mean), index=['Mean UMI per Cell']))

    UMI_min = get_UMI_min(df_UMI_cell, UMI_min)
    count_smk.logger.info(f'UMI_min: {UMI_min}')
    SNR_min = get_SNR_min(df_UMI_cell, dim, SNR_min, UMI_min)
    count_smk.logger.info(f'SNR_min: {SNR_min}')
    df_UMI_cell["tag"] = df_UMI_cell.apply(tag_type,
                                           UMI_min=UMI_min,
                                           SNR_min=SNR_min,
                                           dim=dim,
                                           axis=1)
    df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

    df_tsne = pd.read_csv(tsne_file, sep="\t", index_col=0)
    df_tsne_tag = pd.merge(df_tsne,
                           df_UMI_cell,
                           how="left",
                           left_index=True,
                           right_index=True)

    if combine_cluster:
        df_combine_cluster = pd.read_csv(combine_cluster,
                                         sep="\t",
                                         header=None)
        df_combine_cluster.columns = ["cluster", "combine_cluster"]
        # pd.merge does not accept both on= and left_index=; merge on cluster,
        # then restore the per-cell barcode index
        df_tsne_combine_cluster_tag = pd.merge(df_tsne_tag,
                                               df_combine_cluster,
                                               on=["cluster"],
                                               how="left").set_index(
                                                   df_tsne_tag.index)
        df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
    else:
        df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

    write_and_plot(df=df_tsne_tag,
                   column_name="cluster",
                   count_file=cluster_count_file,
                   plot_file=cluster_plot)

    if combine_cluster:
        write_and_plot(df=df_tsne_combine_cluster_tag,
                       column_name="combine_cluster",
                       count_file=combine_cluster_count_file,
                       plot_file=combine_cluster_plot)

    df_tag_count = df_UMI_cell["tag"].value_counts().reset_index()
    df_tag_count.columns = ["item", "count"]
    for index, row in df_tag_count.iterrows():
        stats = stats.append(
            pd.Series(format_stat(row['count'], cell_total),
                      index=[row['item'] + ' Cells']))
    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='count_smk',
                 assay=assay,
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()
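`tag_type`, `get_UMI_min` and `get_SNR_min` are not shown. Purely as an assumed illustration of how the UMI_min/SNR_min/dim parameters could interact (not the original logic), `row` being a per-cell Series of tag UMI counts as passed by `apply(..., axis=1)` above:

# Assumed per-cell tagging rule, for illustration only: assign the top `dim`
# tags when they clear UMI_min and stand out from the remaining tags by at
# least SNR_min; otherwise call the cell Undetermined or Multiplet.
def tag_type(row, UMI_min, SNR_min, dim):
    counts = row.sort_values(ascending=False)
    signal, noise = counts.iloc[:dim], counts.iloc[dim:]
    if signal.iloc[0] < UMI_min:
        return 'Undetermined'
    noise_max = noise.max() if len(noise) > 0 else 0
    if signal.min() / max(noise_max, 1) < SNR_min:
        return 'Multiplet'
    return '_'.join(signal.index)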