Example #1
import pysam


# Helper functions such as read_bed, sort_regions, merge_regions,
# expand_regions_from_bed, get_chromosome_list_from_bam, count_umis_in_region
# and remove_singleton_regions are defined elsewhere in the surrounding code.
def read_bam_from_bed(infile, bedfile, position_threshold):
    '''Read regions from a BED file, merge nearby regions, and count the UMIs
    observed in each region of the BAM file. Returns a dict of per-region UMI
    information (with singleton regions removed) and a dict of region end positions.'''
    chrregions = {}
    chrends = {}
    # Read, sort and merge the BED regions, then expand them using the BAM file.
    regions = read_bed(bedfile)
    regions = sort_regions(regions)
    regions = merge_regions(regions, position_threshold)
    regions = expand_regions_from_bed(regions, infile)
    # Flatten the per-contig dict into (contig, start, end, name) tuples.
    newregions = []
    for contig in regions:
        for start, end, name in regions[contig]:
            newregions.append((contig, start, end, name))

    with pysam.AlignmentFile(infile, 'rb') as f:
        chrs = get_chromosome_list_from_bam(f)
        for contig, start, end, name in newregions:
            if contig not in chrs:
                continue
            if contig not in chrregions:
                chrregions[contig] = {}
            # Count UMIs in the region, keyed by contig and region start position.
            chrregions[contig][start] = count_umis_in_region(f, contig, start, end)
            if contig not in chrends:
                chrends[contig] = {}
            chrends[contig][start] = end
        # Drop regions supported by fewer than two UMIs.
        regions = remove_singleton_regions(chrregions, 2)
    return regions, chrends
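A minimal usage sketch for the function above, assuming an indexed BAM file and a matching BED file exist; the file paths and the threshold value are placeholders, not values from the source:

regions, ends = read_bam_from_bed('sample.bam', 'targets.bed', position_threshold=20)
# Print each surviving region as contig, start, end.
for contig in regions:
    for start in regions[contig]:
        print(contig, start, ends[contig][start])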
Example #2
import pickle

import pysam


def main(bamfilename, bedfilename):
    '''Generate consensus reads for one hard-coded region (17:7577495-7577800),
    using UMI clusters loaded from a pickle file.'''
    # Load the UMI clusters produced in an earlier step.
    with open('/home/xsteto/umierrorcorrect/umi.pickle', 'rb') as f:
        umis = pickle.load(f)
    fasta = pysam.FastaFile('/medstore/External_References/hg19/Homo_sapiens_sequence_hg19.fasta')
    contig = '17'
    start = 7577495
    end = 7577800
    # Fetch the reference sequence for the region of interest.
    ref_seq = get_reference_sequence(fasta, contig, start, end)
    # Build per-position pileups for clustered and singleton UMIs and call consensus.
    position_matrix, singleton_matrix = get_cons_dict(bamfilename, umis, contig, start, end, True)
    consensus_seq = get_all_consensus(position_matrix, umis, contig)
    cons = get_cons_info(consensus_seq, singleton_matrix)
    # Read, sort and merge the BED regions to annotate the output.
    regions = read_bed(bedfilename)
    regions = sort_regions(regions)
    regions = merge_regions(regions, 0)
    annotation = regions[contig]
    with open('out/cons.out', 'w') as f:
        write_consensus(f, cons, ref_seq, start, contig, annotation, False)
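A minimal invocation sketch, assuming the hard-coded pickle and reference paths above exist; the BAM and BED paths below are placeholders, not files from the source:

if __name__ == '__main__':
    # Placeholder inputs for the driver function defined above.
    main('consensus_input.bam', 'targets.bed')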
Example #3
import logging
import os
from multiprocessing import cpu_count


# The helper functions called below (check_output_directory, cluster_umis_on_position,
# cluster_umis_all_regions, merge_bams, etc.) are defined elsewhere in the surrounding code.
def run_umi_errorcorrect(args):
    '''Run the UMI clustering and consensus read generation (error correction).'''
    logging.info("Starting UMI clustering")
    args.output_path = check_output_directory(args.output_path)
    if args.regions_from_bed:
        group_method = 'fromBed'
    elif args.regions_from_tag:
        group_method = 'fromTag'
    else:
        group_method = 'automatic'

    logging.info('Group by position method: {}'.format(group_method))
    if not args.sample_name:
        args.sample_name = get_sample_name(args.bam_file)
    if group_method == 'fromTag':
        regions, ends, starts = cluster_umis_on_position(args.bam_file, args.position_threshold,
                                                         group_method, args.bed_file)
    else:
        regions, ends = cluster_umis_on_position(args.bam_file, args.position_threshold,
                                                 group_method, args.bed_file)
    
    # Count the total number of regions across all chromosomes.
    nregions = 0
    for chrx in regions:
        nregions += len(regions[chrx])
    logging.info("Number of regions: {}".format(nregions))

    edit_distance_threshold = args.edit_distance_threshold
    if args.num_threads:
        num_cpus = int(args.num_threads)
    else:
        num_cpus = int(cpu_count())
    logging.info("Starting Consensus sequence generation")
    logging.info("Starting {} threads".format(num_cpus))
    fasta = args.reference_file
    if args.bed_file:
        bedregions = read_bed(args.bed_file)
        bedregions = sort_regions(bedregions)
        if group_method=='fromBed':
            bedregions = merge_regions(bedregions, 0)
    else:
        bedregions = []
    if group_method=='fromTag':
        bamfilelist = cluster_umis_all_regions(regions, ends, edit_distance_threshold,
                                           args.sample_name, args.bam_file, args.output_path,
                                           args.include_singletons, fasta, bedregions,
                                           num_cpus, args.indel_frequency_threshold,
                                           args.consensus_frequency_threshold,
                                           args.regions_from_tag, starts)
    else:
        bamfilelist = cluster_umis_all_regions(regions, ends, edit_distance_threshold, 
                                           args.sample_name, args.bam_file, args.output_path, 
                                           args.include_singletons, fasta, bedregions, 
                                           num_cpus, args.indel_frequency_threshold, 
                                           args.consensus_frequency_threshold)
    merge_bams(args.output_path, bamfilelist, args.sample_name)
    index_bam_file(args.output_path + '/' + args.sample_name + '_consensus_reads.bam',
                   num_cpus)
    # str.rstrip removes a set of characters rather than a suffix, so use slicing
    # to drop the trailing '.bam' safely.
    consfilelist = [x[:-len('.bam')] + '.cons' for x in bamfilelist]
    merge_cons(args.output_path, consfilelist, args.sample_name)
    cons_file = args.output_path + '/' + args.sample_name + '_cons.tsv'
    if args.remove_large_files:
        os.remove(args.output_path + '/' + args.bam_file)

    statfilelist = [x[:-len('.bam')] + '.hist' for x in bamfilelist]
    merge_stat(args.output_path, statfilelist, args.sample_name)
    duppos = check_duplicate_positions(cons_file)
    if any(duppos):
        merge_duplicate_positions_all_chromosomes(duppos, cons_file, num_cpus)
    merge_duplicate_stat(args.output_path, args.sample_name)
    logging.info("Consensus generation complete, output written to {}, {}".format(args.output_path + 
                 '/' + args.sample_name + '_consensus_reads.bam',
                 args.output_path + '/' + args.sample_name + '_cons.tsv'))
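A minimal driver sketch for run_umi_errorcorrect, built from the attributes the function reads off args; all paths and parameter values below are placeholders, not values from the source:

from argparse import Namespace

args = Namespace(
    bam_file='sample.bam',                  # input BAM with UMIs (placeholder path)
    bed_file='targets.bed',                 # target regions (placeholder path)
    reference_file='hg19.fasta',            # reference FASTA (placeholder path)
    output_path='out',                      # output directory
    sample_name=None,                       # derived from the BAM file name if empty
    regions_from_bed=True,                  # selects group_method = 'fromBed'
    regions_from_tag=False,
    position_threshold=20,
    edit_distance_threshold=1,
    num_threads=4,
    include_singletons=True,
    indel_frequency_threshold=60.0,
    consensus_frequency_threshold=60.0,
    remove_large_files=False,
)
run_umi_errorcorrect(args)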