Example #1
    def __iter__(self):
        # yield each feature's interval, widened by window_length on both sides
        if self.filetype.upper() == "BED":
            for line in HTSeq.BED_Reader(self.filepath):
                line.iv.start -= self.window_length
                line.iv.end += self.window_length
                yield line.iv
        elif self.filetype.upper() in ("GFF", "GTF"):
            for line in HTSeq.GFF_Reader(self.filepath):
                line.iv.start -= self.window_length
                line.iv.end += self.window_length
                yield line.iv
        elif self.filetype.upper() == "SAM":
            for line in HTSeq.SAM_Reader(self.filepath):
                line.iv.start -= self.window_length
                line.iv.end += self.window_length
                yield line.iv
        elif self.filetype.upper() == "BAM":
            for line in HTSeq.BAM_Reader(self.filepath):
                line.iv.start -= self.window_length
                line.iv.end += self.window_length
                yield line.iv
        elif self.filetype.upper() == "OTHER":
            # self.func: custom reader supplied for filetype "OTHER"
            for line in self.func(self.filepath):
                line.iv.start -= self.window_length
                line.iv.end += self.window_length
                yield line.iv
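For reference, a compact sketch of how such an iterator class might be wired up, using dict dispatch in place of the if/elif chain; the class name and constructor are assumptions, not from the source:

import HTSeq

class WidenedIntervals:
    """Hypothetical wrapper around the __iter__ above: yields each feature's
    interval widened by window_length on both sides."""

    def __init__(self, filepath, filetype, window_length=0, func=None):
        self.filepath = filepath
        self.filetype = filetype
        self.window_length = window_length
        self.func = func  # custom reader, used when filetype is "OTHER"

    def __iter__(self):
        readers = {"BED": HTSeq.BED_Reader, "GFF": HTSeq.GFF_Reader,
                   "GTF": HTSeq.GFF_Reader, "SAM": HTSeq.SAM_Reader,
                   "BAM": HTSeq.BAM_Reader}
        reader = readers.get(self.filetype.upper(), self.func)
        for line in reader(self.filepath):
            line.iv.start -= self.window_length
            line.iv.end += self.window_length
            yield line.iv

# usage sketch, assuming a local "peaks.bed" exists:
for iv in WidenedIntervals("peaks.bed", "BED", window_length=100):
    print(iv.chrom, iv.start, iv.end)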
Example #2
    def annotate(self, circfile, annotation_tree, output):
        # the circRNA file should be in BED format: chr\tstart\tend\t'.'\tjunctiontype\tstrand
        # the annotation tree should be an IntervalTree object

        # check the input
        with open(circfile, 'r') as tmpcirc:
            tmpsplit = tmpcirc.readline().split('\t')
            if len(tmpsplit) != 6:
                warnings.warn(
                    'Input circRNA file is not the desired bed6 format!')
                logging.warning(
                    'Input circRNA file is not the desired bed6 format!')
            ncol = len(tmpsplit)

        # annotate with the interval tree algorithm
        circ_regions = HTSeq.BED_Reader(circfile)
        with open(output, 'w') as out:
            for circ in circ_regions:
                annotation = self.annotate_one_interval(circ.iv,
                                                        annotation_tree,
                                                        what='gene')
                out.write('\t'.join([
                    circ.iv.chrom,
                    str(circ.iv.start),
                    str(circ.iv.end), annotation,
                    str(int(circ.score)), circ.iv.strand
                ]) + '\n')
Example #3
def map_genome_features(files,
                        ref,
                        gtf_file,
                        outpath='',
                        aligner='bowtie',
                        overwrite=True,
                        aligner_params=''):
    """Convenience method that maps multiple files to a genome with features
       and return/process hits. Can be used for miRNA discovery.

       Args:
           ref: genome bowtie index name
           gtf_file: gtf or bed file with features
           outpath: output path
           aligner: short read alligner to use
           aligner_params: aligner parameters
    """

    if aligner_params != '':
        aligners.set_params(aligner, aligner_params)
    if overwrite:
        print('removing old temp files')
        utils.remove_files(outpath, '*_mapped.sam')
        utils.remove_files(outpath, '*_r.fa')

    ext = os.path.splitext(gtf_file)[1]
    if ext in ('.gtf', '.gff', '.gz'):
        features = HTSeq.GFF_Reader(gtf_file)
    elif ext == '.bed':
        features = HTSeq.BED_Reader(gtf_file)
    else:
        raise ValueError('unsupported feature file extension: %s' % ext)

    exons = get_exons(features)

    cfiles = collapse_files(files, outpath)
    print(cfiles)
    result = []
    for cfile in cfiles:
        label = os.path.splitext(os.path.basename(cfile))[0]
        samfile = os.path.join(outpath, '%s_%s.sam' % (label, ref))
        if aligner == 'bowtie':
            aligners.bowtie_align(cfile, ref, outfile=samfile)
        elif aligner == 'subread':
            aligners.subread_align(cfile, ref, samfile)
        #get true read counts for collapsed file
        readcounts = utils.read_collapsed_file(cfile)
        #count features
        counts = count_features(samfile, features=exons, readcounts=readcounts)
        counts['label'] = label
        counts['genome'] = ref
        total = readcounts.reads.sum()
        counts['fraction'] = counts.reads / total
        result.append(counts)
    result = pd.concat(result)
    result = merge_features(result, gtf_file)
    return result
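A hypothetical invocation of map_genome_features; the read files, index name, and annotation path are placeholders, not from the source:

# assumes a bowtie index named "mm9" and a matching annotation file are available
results = map_genome_features(['sample1.fastq', 'sample2.fastq'],
                              ref='mm9',
                              gtf_file='mm9_genes.gtf',
                              outpath='results',
                              aligner='bowtie')
print(results.head())  # the result is a pandas DataFrame of merged feature counts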
Example #4
def get_gene(bgmodel=None, bed=None):
    gs = set()
    for item in HTSeq.BED_Reader(bed):
        iv = item.iv
        # skip chromosomes in the BED file that are absent from the background model
        try:
            for ivb, valueb in bgmodel[iv].steps():
                gs.update(valueb)
        except KeyError:
            continue
    return gs
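A minimal usage sketch for get_gene, assuming the background model is an HTSeq.GenomicArrayOfSets keyed by gene name (the path and gene names are placeholders):

import HTSeq

bgmodel = HTSeq.GenomicArrayOfSets("auto", stranded=False)
bgmodel[HTSeq.GenomicInterval("chr1", 1000, 5000)] += "GeneA"
bgmodel[HTSeq.GenomicInterval("chr1", 4000, 9000)] += "GeneB"

# each interval in peaks.bed is looked up in the background model
genes = get_gene(bgmodel=bgmodel, bed="peaks.bed")
print(genes)  # set of gene names overlapped by the BED intervals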
Example #5
def get_overlap_rep(rep_model, peak_f):
    reps = set()
    c_t = os.path.split(peak_f)[1].split("_")[0]
    for g in HTSeq.BED_Reader(peak_f):
        iv = g.iv
        # skip chromosomes in the peak file that are absent from the repeat model
        try:
            for niv, value in rep_model[iv].steps():
                reps.update(value)
        except KeyError:
            continue
    return c_t, reps
Example #6
    def __init__(self,
                 l_fp):  #, min_peak_score=None, min_core_score=None, iv=None):
        # Read peaks
        l_peak_fp = []
        for (i, fp) in enumerate(l_fp):
            fh = hts.BED_Reader(fp)
            l_peak = [gf for gf in fh]
            print("%d peaks found in %s" % (len(l_peak), fp), file=sys.stderr)
            #for peak in l_peak: peak.name = {"source": i}
            l_peak_fp.append(l_peak)

        self.l_peak_fp = l_peak_fp
Example #7
def makeIslandFilteredGraphFile(chroms, chrom_lengths, window_size, bamfile,
                                islandbedfile, outfile):
    ga = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')

    bam_reader = HTSeq.BAM_Reader(bamfile)
    for alt_first, alt_second in HTSeq.pair_SAM_alignments(bam_reader):
        if alt_first is None or alt_second is None:
            continue
        if alt_first.aligned and alt_first.optional_field(
                "NH"
        ) == 1 and alt_second.aligned and alt_second.optional_field("NH") == 1:
            if alt_first.iv.chrom != alt_second.iv.chrom or alt_first.iv.strand == alt_second.iv.strand or alt_first.iv.chrom not in chroms:
                continue

            alt_first_iv_seq = [
                co.ref_iv for co in alt_first.cigar
                if co.type == "M" and co.size > 0
            ]
            alt_second_iv_seq = [
                reverse_strand(co.ref_iv) for co in alt_second.cigar
                if co.type == "M" and co.size > 0
            ]
            alt_iv_seq = combine_pair_iv_seq(alt_first_iv_seq,
                                             alt_second_iv_seq)

            read_length = get_read_length(alt_iv_seq)
            for alt_iv in alt_iv_seq:
                ga[alt_iv] += 1.0 / read_length

    ga_island = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    bedfile = HTSeq.BED_Reader(islandbedfile)
    for alt in bedfile:
        for iv, value in ga[alt.iv].steps():
            ga_island[iv] += value

    with open(outfile, 'w') as f:
        for chrom in chroms:
            chrom_length = chrom_lengths[chrom]
            num_windows = chrom_length // window_size  # floor division so range() gets an int
            for i in range(num_windows):
                count_in_window = 0
                window_start = i * window_size
                window_end = (i + 1) * window_size
                window_iv = HTSeq.GenomicInterval(chrom, window_start,
                                                  window_end)
                for iv, value in ga_island[window_iv].steps():
                    count_in_window += value * iv.length
                count_in_window = int(count_in_window)
                if count_in_window != 0:
                    outline = chrom + '\t' + str(window_start) + '\t' + str(
                        window_end) + '\t' + str(count_in_window) + '\n'
                    f.write(outline)
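A hypothetical call to makeIslandFilteredGraphFile; the chromosome lengths are the mm9 values for chr1 and chr2, and the file names are placeholders:

chroms = ["chr1", "chr2"]
chrom_lengths = {"chr1": 197195432, "chr2": 181748087}
makeIslandFilteredGraphFile(chroms, chrom_lengths,
                            window_size=200,
                            bamfile="sample.bam",
                            islandbedfile="islands.bed",
                            outfile="island_filtered.graph")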
Example #8
    def __create_genomic_signals(self,
                                 stranded=True,
                                 func=None,
                                 use_wrappers=True):
        """Prepares coverage as a HTSeq.GenomicArray

        :param filepath: path to file
        :param filetype: type of the file (can be bed etc.)
        """
        stderr.write("Creating %s signal. It may take a few minutes...\n" %
                     self.name)
        self.coverage = HTSeq.GenomicArray("auto",
                                           stranded=stranded,
                                           typecode="d")
        self.library_size = 0
        if self.filetype.upper() == "BED":
            if use_wrappers:
                self.coverage = BedWrapper(self.filepath)
            else:
                for line in HTSeq.BED_Reader(self.filepath):
                    self.coverage[line.iv] += 1
                    self.library_size += 1
        elif self.filetype.upper() == "GFF" or self.filetype.upper() == "GTF":
            for line in HTSeq.GFF_Reader(self.filepath):
                self.coverage[line.iv] += 1
                self.library_size += 1
        elif self.filetype.upper() == "SAM":
            for line in HTSeq.SAM_Reader(self.filepath):
                self.coverage[line.iv] += 1
                self.library_size += 1
        elif self.filetype.upper() == "BAM":
            if use_wrappers:
                raise NotImplementedError(
                    "Bam wrapper is not yet implemented!")
            for line in HTSeq.BAM_Reader(self.filepath):
                self.coverage[line.iv] += 1
                self.library_size += 1
        elif (self.filetype.upper() == "BG") or (self.filetype.upper()
                                                 == "BEDGRAPH"):
            raise NotImplementedError("BedGraph is not yet implemented!")
        elif (self.filetype.upper() == "BW") or (self.filetype.upper()
                                                 == "BIGWIG"):
            self.coverage = BigWigWrapper(self.filepath)
        elif self.filetype.upper() == "OTHER":
            for line in func(self.filepath):
                self.coverage[line.iv] += 1
                self.library_size += 1
        else:
            raise ValueError("Unsupported filetype: %s" % self.filetype)
Example #9
def getGenomicarrayOfSetsAndNames(bed_file):
    """ Returns a GenomicArrayOfSets of all regions and a list of region names """

    # build parser for regions
    regionParser = HTSeq.BED_Reader(bed_file)

    # build GenomicArrayOfSets for all regions
    regions = HTSeq.GenomicArrayOfSets("auto", stranded=False)

    # add all regions to the GenomicArrayOfSets
    for feature in regionParser:
        regions[feature.iv] += feature.name

    region_names = [feature.name for feature in regionParser]

    return regions, region_names
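A short usage sketch for the helper above; the BED path and query coordinates are placeholders:

import HTSeq

regions, region_names = getGenomicarrayOfSetsAndNames("regions.bed")

# collect the names of all regions overlapping a query interval
query = HTSeq.GenomicInterval("chr1", 1000000, 1001000)
hits = set()
for iv, names in regions[query].steps():
    hits |= names
print(hits)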
Example #10
def test_output_bed_loss_resolution_equal_stepsize(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    # generate loss with resolution == stepsize
    inputs = Array("x", numpy.random.random((7, 1, 1, 10)))
    outputs = Array('y',
                    numpy.random.random((7, 1, 1, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu_conv(inputs, outputs)
    data_path = pkg_resources.resource_filename('janggu',
                                                'resources/10regions.bed')

    gi = GenomicIndexer.create_from_file(data_path, binsize=200, stepsize=200)

    dummy_eval = Scorer('loss',
                        lambda t, p: [0.1] * len(t),
                        exporter=export_bed)

    bwm.evaluate(inputs,
                 outputs,
                 callbacks=[dummy_eval],
                 exporter_kwargs={
                     'gindexer': gi,
                     'resolution': 200
                 })

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name,
                         'loss.nptest.y.{}.bed')

    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(file_.format(cond))

    bed = iter(HTSeq.BED_Reader(file_.format('c1')))

    nreg = 0
    for reg in bed:
        numpy.testing.assert_equal(reg.score, 0.1)
        nreg += 1


    assert nreg == 7, 'There should be 7 regions in the bed file.'
Example #11
def test_output_bed_predict_resolution_unequal_stepsize(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    # generate predictions with resolution < stepsize
    inputs = Array("x", numpy.random.random((7, 4, 1, 10)))
    outputs = Array('y',
                    numpy.random.random((7, 4, 1, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu(inputs, outputs)
    data_path = pkg_resources.resource_filename('janggu',
                                                'resources/10regions.bed')

    gi = GenomicIndexer.create_from_file(data_path, binsize=200, stepsize=200)

    dummy_eval = Scorer('pred',
                        lambda p: [0.1] * len(p),
                        exporter=ExportBed(gindexer=gi, resolution=50),
                        conditions=['c1', 'c2', 'c3', 'c4'])

    bwm.predict(inputs, callbacks=[dummy_eval])

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name,
                         'pred.nptest.y.{}.bed')

    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(file_.format(cond))

    bed = iter(HTSeq.BED_Reader(file_.format('c1')))

    nreg = 0
    for reg in bed:
        numpy.testing.assert_equal(reg.score, 0.1)
        nreg += 1

    assert nreg == 28, 'There should be 28 regions in the bed file.'
Example #12
def produce_sequences(bed_file,
                      fasta_file,
                      gtf_file,
                      min_length,
                      max_length,
                      width,
                      padding,
                      graphprot_compatible=False):
    print(" Reading primary peaks from BED file")
    bed_file = HTSeq.BED_Reader(bed_file)
    input_peaks = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    total_peaks = 0
    for peak in bed_file:
        total_peaks += 1
        if width > 0:
            mid = int((peak.iv.start + peak.iv.end) / 2)
            peak.iv.start = mid - int(width / 2)
            peak.iv.end = peak.iv.start + width
        if padding > 0:
            peak.iv.start -= padding
            peak.iv.end += padding
        input_peaks[peak.iv] += peak.name

    print(" Reading GTF file from " + str(gtf_file))
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    gene_dict = dict()
    gtf_file = HTSeq.GFF_Reader(gtf_file)
    total_genes = 0
    for feature in gtf_file:
        if feature.type == "gene":
            total_genes += 1
            genes[feature.iv] += feature.name
            gene_dict[feature.name] = feature
    if total_genes == 0:  # this GTF file doesn't have 'gene' features, we need to build the gene intervals from the exon intervals instead
        print(
            " No 'gene' features in GTF, building gene intervals from exons instead."
        )
        for feature in gtf_file:
            if feature.type == "exon":
                gene = gene_dict.get(feature.attr["gene_id"], False)
                if not gene:
                    feature.type = 'gene'
                    gene_dict[feature.attr["gene_id"]] = feature
                    total_genes += 1
                else:
                    if gene.iv.start > feature.iv.start:
                        gene.iv.start = feature.iv.start
                    if gene.iv.end < feature.iv.end:
                        gene.iv.end = feature.iv.end
                    gene_dict[feature.attr["gene_id"]] = gene
        for gene in gene_dict.values():
            genes[gene.iv] += gene.attr["gene_id"]
    print(" Loaded {} total genes.".format(total_genes))

    print(" Reading genome from file " + str(fasta_file) + " ...", )
    sys.stdout.flush()
    genome = read_genome(fasta_file)
    print("done")

    print(" Filtering and constructing background...")
    pos_peaks = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    neg_peaks = HTSeq.GenomicArrayOfSets("auto", stranded=True)

    pos_seqs = []
    neg_seqs = []
    seq_ids = []

    not_in_gene = 0
    multiple_genes = 0
    redundant = 0
    invalid = 0
    for peak in bed_file:
        valid = True
        iset = None
        if peak.iv.length < min_length or peak.iv.length > max_length:
            valid = False
            invalid += 1
        if valid:
            if width > 0:
                mid = int((peak.iv.start + peak.iv.end) / 2)
                peak.iv.start = mid - int(width / 2)
                peak.iv.end = peak.iv.start + width
            if padding > 0:
                peak.iv.start -= padding
                peak.iv.end += padding
            for iv2, step_set in input_peaks[peak.iv].steps():
                if iset is None:
                    iset = step_set.copy()
                else:
                    iset.intersection_update(step_set)
        try:
            overlaps = len(iset)
        except TypeError:
            overlaps = 0
        if overlaps == 1 and valid:
            # this peak does not overlap other peaks after padding, so we can assume it's reasonably unique
            pos_peaks[peak.iv] += peak.name

            # now find the gene that it overlaps
            gset = None
            #print " Looking for overlapping gene in list of {} total genes on chromosome {}.".format(len(genes[peak.iv]), peak.iv)
            for iv2, step_set in genes[peak.iv].steps():
                if gset is None:
                    gset = step_set.copy()
                else:
                    gset.intersection_update(step_set)

            if len(gset) == 1:
                # this peak overlaps exactly one gene so we know where to randomly choose a background sequence
                gene = gene_dict[list(gset)[0]]
                overlap = True
                overlap_counter = 0
                while overlap:
                    overlap_counter += 1
                    start = random.randint(gene.iv.start,
                                           gene.iv.end - peak.iv.length)
                    end = start + peak.iv.length
                    neg_peak = HTSeq.GenomicInterval(gene.iv.chrom, start, end,
                                                     gene.iv.strand)
                    overlap_peak = None
                    overlap_neg_peak = None
                    for iv2, step_set in pos_peaks[neg_peak].steps():
                        if overlap_peak is None:
                            overlap_peak = step_set.copy()
                        else:
                            overlap_peak.intersection_update(step_set)
                    for iv2, step_set in neg_peaks[neg_peak].steps():
                        if overlap_neg_peak is None:
                            overlap_neg_peak = step_set.copy()
                        else:
                            overlap_neg_peak.intersection_update(step_set)
                    if not overlap_peak and not overlap_neg_peak:  # yes! found a non-overlapping region suitable as background sequence
                        overlap = False
                    if overlap_counter > 1000:  # accept that a non-overlap can't be found but don't use this peak
                        print(
                            "Warning: failed to find non-overlapping background for "
                            + str(peak.name))
                        valid = False
                        overlap = False
                        invalid += 1
                if 'n' in str(genome[neg_peak.chrom]
                              [neg_peak.start:neg_peak.end].seq).lower():
                    print("Warning: 'n' in background sequence for " +
                          str(peak.name))
                    valid = False
                    invalid += 1
                if valid:
                    neg_peaks[neg_peak] += 1
                    pos_seq = Seq(
                        str(genome[peak.iv.chrom]
                            [peak.iv.start:peak.iv.end].seq), generic_dna)
                    if peak.iv.strand == "-":
                        pos_seq = pos_seq.reverse_complement()
                    neg_seq = Seq(
                        str(genome[neg_peak.chrom]
                            [neg_peak.start:neg_peak.end].seq), generic_dna)
                    if neg_peak.strand == "-":
                        neg_seq = neg_seq.reverse_complement()
                    pos_seq = str(pos_seq)
                    neg_seq = str(neg_seq)

                    if graphprot_compatible:
                        pos_seq = pos_seq[:padding].lower(
                        ) + pos_seq[padding:-padding].upper(
                        ) + pos_seq[-padding:].lower()
                        neg_seq = neg_seq[:padding].lower(
                        ) + neg_seq[padding:-padding].upper(
                        ) + neg_seq[-padding:].lower()
                    pos_seqs.append(pos_seq)
                    neg_seqs.append(neg_seq)
                    seq_ids.append(peak.name)
            elif len(gset) == 0:
                not_in_gene += 1
            elif len(gset) > 1:
                multiple_genes += 1
        elif overlaps > 1 and valid:
            redundant += 1

    print(" Found {} invalid peaks (too short or too long).".format(invalid))
    print(" Found {} valid but redundant peaks.".format(redundant))
    print(
        " Found {} non-redundant peaks that did not overlap any genes, and {} that overlapped multiple genes."
        .format(not_in_gene, multiple_genes))
    print(" Found {} valid non-redundant peaks overlapping genes.".format(
        len(pos_seqs)))

    return pos_seqs, neg_seqs, seq_ids
Example #13
def main(argv):
    parser = OptionParser()
    parser.add_option("-b",
                      "--file",
                      action="store",
                      type="string",
                      dest="file_name",
                      metavar="<file>",
                      help="name of bed file (not including .bed extension)")
    parser.add_option("-g",
                      "--genome",
                      action="store",
                      type="string",
                      dest="genome_data",
                      metavar="<file>",
                      help="name of reference genome (mm9 for mouse)")
    parser.add_option("-r",
                      "--redundancy",
                      action="store",
                      type="int",
                      dest="redundancy",
                      metavar="<file>",
                      help="redundancy threshold")
    parser.add_option(
        "-w",
        "--window_size",
        action="store",
        type="int",
        dest="window_size",
        metavar="<int>",
        help=
        "size of windows used to partition genome (200 for histones, 50 for TFs"
    )
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        metavar="<int>",
        help=
        "fragment size determines the shift (half of fragment_size of ChIP-seq read position, in bps)"
    )
    parser.add_option("-p",
                      "--genome_fraction",
                      action="store",
                      type="float",
                      dest="genome_fraction",
                      metavar="<int>",
                      help="effective genome fraction: 0.8 in most cases")
    parser.add_option(
        "-s",
        "--gap_size",
        action="store",
        type="int",
        dest="gap_size",
        metavar="<int>",
        help=
        "maximum number of base pairs between windows in the same island (usually same as window size)"
    )
    parser.add_option("-e",
                      "--e-value",
                      action="store",
                      type="string",
                      dest="e_value",
                      metavar="<string>",
                      help="e-value used to determine significance")
    parser.add_option("-i",
                      "--input_dir",
                      action="store",
                      type="string",
                      dest="input_dir",
                      metavar="<string>",
                      help="path to input directory")
    parser.add_option("-o",
                      "--output_dir",
                      action="store",
                      type="string",
                      dest="output_dir",
                      metavar="<string>",
                      help="path to output directory")
    parser.add_option("-a",
                      "--SICER_dir",
                      action="store",
                      type="string",
                      dest="sicer_dir",
                      metavar="<string>",
                      help="path to directory containing SICER files")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    #create string names for files
    #remove .bed extension
    file_name = opt.file_name[:-4]
    bed_file_name = opt.input_dir + "/" + opt.file_name
    sorted_bed_file_name = opt.output_dir + "/" + file_name + "_sorted_temp.bed"
    # This file stores the preprocessed raw bed file.
    red_rem_bed_file_name = opt.output_dir + "/" + file_name + "-" + str(
        opt.redundancy) + "-removed.bed"
    # This file stores the candidate islands.
    score_island_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + ".scoreisland"
    # This file stores the summary graph.
    graph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + ".graph"
    # This file stores the island-filtered non-redundant raw reads
    island_filtered_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-E" + str(
            opt.e_value) + "-islandfiltered.bed"
    # This file stores the sample summary graph in bedgraph format
    normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-normalized.bedgraph"
    # This file stores normalized summary graph made by the island-filtered non-redundant raw reads in bedgraph format
    islandfiltered_normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-E" + str(
            opt.e_value) + "-islandfiltered-normalized.bedgraph"
    genome_file = opt.sicer_dir + "/genomes/" + opt.genome_data

    # read genome data from file containing genome data
    # store genome data in the dictionary genome
    genome = SICER_MS.get_genome_data(genome_file)

    # convert E_value to float
    e_value = float(opt.e_value)

    # sort bed file by chromosome, then by coordinate, then by strand
    print "\nSorting BED file..."
    SICER_MS.sort_bed_file(bed_file_name, sorted_bed_file_name)

    # remove redundant reads in bed file and count number of total reads and number of retained reads
    print "\nPreprocess the sorted BED file to remove redundancy with threshold " + str(
        opt.redundancy) + "..."
    total, retained = SICER_MS.remove_redundant_reads_bed(
        sorted_bed_file_name, red_rem_bed_file_name, opt.redundancy, genome)
    print "Total reads: " + str(total) + "\nTotal retained reads: " + str(
        retained) + "\n\n"

    # remove sorted bed file
    os.system('rm %s' % (sorted_bed_file_name))

    # create HTSeq bed reader that can iterate through all of the reads
    bed_iterator = HTSeq.BED_Reader(red_rem_bed_file_name)

    print "Partition the genome in windows... \n"
    # make dictionary of reads and windows and count total reads
    # read_dict: keys are chromosomes and values are a list of read positions
    # window_dict: keys are chromosomes and values are a list of window start coordinates for windows containing reads
    read_dict, window_dict, total_reads = SICER_MS.make_dict_of_reads_and_windows(
        bed_iterator, genome, opt.fragment_size, opt.window_size)

    print "Count reads in windows... \n"
    # calculate the number of island reads in all the windows comprising the islands
    # calculate normalized read count for each window
    # add the window's normalized read count to a genomic array (island_normalized_window_array)
    # the island_normalized_window_array will be used to write a bedgraph file
    window_counts_dict, normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        window_dict, read_dict, genome, 1000000, total_reads, opt.window_size)
    # write bedgraph file of normalized islands
    normalized_window_array.write_bedgraph_file(normalized_bedgraph_file_name)

    print "Find candidate islands exhibiting clustering... \n"
    # finds all islands using the dictionary of window counts and generates .scoreisland file
    # returns a genomic array island_array of all island tag counts and a list of islands (in dictionary format)
    # the dictionary keys of each island are 'island', 'score', and 'chip' (the read count)
    # also writes graph file
    island_array, islands_list = SICER_MS.find_islands(
        window_counts_dict, total_reads, opt.gap_size, opt.window_size, genome,
        opt.genome_fraction, e_value, score_island_file_name, graph_file_name,
        2)

    print "\nFilter reads with identified significant islands...\n"
    # given HTSeq bed_iterator and HTSeq Genomic Array that has chip read count assigned to all islands
    # finds all reads in the bed_iterator that are located in islands
    # if a read is located in an island, it is written to a bed file
    # creates a genomic array of all windows that have reads located in islands
    # returns a dictionary containing all reads located in islands and a dictionary containing all windows in islands
    # dictionary format: keys are chromosomes, values are sorted lists of all read/window positions
    islandfiltered_reads_dict, islandfiltered_windows_dict, total_reads_in_islands = SICER_MS.filter_raw_tags_by_islands(
        bed_iterator, island_array, island_filtered_file_name,
        opt.fragment_size, opt.window_size, genome)

    # calculate the number of island filtered reads in all the windows comprising the islands
    # calculate normalized read count for each window
    # add the window's normalized read count to a genomic array (islandfilt_normalized_window_array)
    # the islandfilt_normalized_window_array will be used to write a bedgraph file
    islandfiltered_window_counts_dict, islandfiltered_normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        islandfiltered_windows_dict, islandfiltered_reads_dict, genome,
        1000000, total_reads_in_islands, opt.window_size)
    # write bedgraph file of normalized filtered islands
    islandfiltered_normalized_window_array.write_bedgraph_file(
        islandfiltered_normalized_bedgraph_file_name)
Example #14
def main(argv):
    parser = OptionParser()
    parser.add_option("-b",
                      "--file",
                      action="store",
                      type="string",
                      dest="file_name",
                      metavar="<file>",
                      help="name of bed file")
    parser.add_option("-c",
                      "--control",
                      action="store",
                      type="string",
                      dest="control_file_name",
                      metavar="<file>",
                      help="name of control bed file")
    parser.add_option("-g",
                      "--genome",
                      action="store",
                      type="string",
                      dest="genome_data",
                      metavar="<file>",
                      help="name of reference genome (mm9 for mouse)")
    parser.add_option("-r",
                      "--redundancy",
                      action="store",
                      type="int",
                      dest="redundancy",
                      metavar="<int>",
                      help="redundancy threshold")
    parser.add_option(
        "-w",
        "--window_size",
        action="store",
        type="int",
        dest="window_size",
        metavar="<int>",
        help=
        "size of windows used to partition genome (200 for histones, 50 for TFs"
    )
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        metavar="<int>",
        help=
        "fragment size determines the shift (half of fragment_size of ChIP-seq read position, in bps)"
    )
    parser.add_option("-p",
                      "--genome_fraction",
                      action="store",
                      type="float",
                      dest="genome_fraction",
                      metavar="<int>",
                      help="effective genome fraction: 0.8 in most cases")
    parser.add_option(
        "-s",
        "--gap_size",
        action="store",
        type="int",
        dest="gap_size",
        metavar="<int>",
        help=
        "maximum number of base pairs between windows in the same island (usually same as window size)"
    )
    parser.add_option("-d",
                      "--FDR",
                      action="store",
                      type="string",
                      dest="FDR",
                      metavar="<string>",
                      help="false discovery rate controlling significance")
    parser.add_option("-i",
                      "--input_dir",
                      action="store",
                      type="string",
                      dest="input_dir",
                      metavar="<string>",
                      help="path to input directory")
    parser.add_option("-o",
                      "--output_dir",
                      action="store",
                      type="string",
                      dest="output_dir",
                      metavar="<string>",
                      help="path to output directory")
    parser.add_option("-a",
                      "--SICER_dir",
                      action="store",
                      type="string",
                      dest="sicer_dir",
                      metavar="<string>",
                      help="path to directory containing SICER files")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    file_name = opt.file_name[:-4]
    control_file_name = opt.control_file_name[:-4]

    # create string names for files
    bed_file_name = opt.input_dir + "/" + opt.file_name
    control_bed_file_name = opt.input_dir + "/" + opt.control_file_name
    sorted_bed_file_name = opt.output_dir + "/" + file_name + "_sorted_temp.bed"
    # This file stores the preprocessed raw bed file.
    red_rem_bed_file_name = opt.output_dir + "/" + file_name + "-" + str(
        opt.redundancy) + "-removed.bed"
    island_bed_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-FDR" + str(
            opt.FDR) + "-island.bed"
    sorted_control_file_name = opt.output_dir + "/" + control_file_name + "_sorted_temp.bed"
    # This file stores the preprocessed raw bed control file.
    red_rem_control_file_name = opt.output_dir + "/" + control_file_name + "-" + str(
        opt.redundancy) + "-removed.bed"
    # This file stores the sample summary graph in bedgraph format
    normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-normalized.bedgraph"
    # This file stores the control summary graph in bedgraph format
    control_normalized_bedgraph_file_name = opt.output_dir + "/" + control_file_name + "-W" + str(
        opt.window_size) + "-normalized.bedgraph"
    # This file stores the candidate islands.
    score_island_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + ".scoreisland"
    # These files store the summary graphs.
    graph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + ".graph"
    control_graph_file_name = opt.output_dir + "/" + control_file_name + "-W" + str(
        opt.window_size) + ".graph"
    # This file stores the island-filtered non-redundant raw reads
    island_filtered_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-FDR" + str(
            opt.FDR) + "-islandfiltered.bed"
    # This file stores normalized summary graph made by the island-filtered non-redundant raw reads in bedgraph format
    islandfiltered_normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-FDR" + str(
            opt.FDR) + "-islandfiltered-normalized.bedgraph"
    # This file stores the summary of candidate islands, including chrom start end read-count_sample read-count-control pvalue, fold change and qvalue
    islandsummary_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-islands-summary"
    # This file stores the summary of significant islands identified with FDR criterion.
    filtered_island_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(
            opt.gap_size) + "-islands-summary-FDR" + str(opt.FDR)

    # convert FDR to float
    FDR = float(opt.FDR)

    genome_file = opt.sicer_dir + "/genomes/" + opt.genome_data

    # read genome data from file containing genome data
    # store genome data in the dictionary genome
    genome = SICER_MS.get_genome_data(genome_file)

    # number of islands expected in random background.
    # the E value is used for identification of candidate islands that exhibit clustering.
    e_value = 1000

    # sort bed file by chromosome, then by coordinate, then by strand
    print "\nSorting BED file..."
    SICER_MS.sort_bed_file(bed_file_name, sorted_bed_file_name)
    # sort control file by chromosome, then by coordinate, then by strand
    print "Sorting control BED file..."
    SICER_MS.sort_bed_file(control_bed_file_name, sorted_control_file_name)

    # remove redundant reads in bed file and count number of total reads and number of retained reads
    print "\nPreprocess the sorted BED file to remove redundancy with threshold " + str(
        opt.redundancy) + "..."
    total, retained = SICER_MS.remove_redundant_reads_bed(
        sorted_bed_file_name, red_rem_bed_file_name, opt.redundancy, genome)
    print "Total reads: " + str(total) + "\nTotal retained reads: " + str(
        retained)
    # remove redundant reads in control file and count number of total reads and number of retained reads
    print "\nPreprocess the sorted control file to remove redundancy with threshold " + str(
        opt.redundancy) + "..."
    control_total, control_retained = SICER_MS.remove_redundant_reads_bed(
        sorted_control_file_name, red_rem_control_file_name, opt.redundancy,
        genome)
    print "Control file total reads: " + str(
        control_total) + "\nControl file total retained reads: " + str(
            control_retained) + "\n \n"

    os.system('rm %s %s' % (sorted_bed_file_name, sorted_control_file_name))

    # create HTSeq bed readers that can iterate through all of the reads
    bed_iterator = HTSeq.BED_Reader(red_rem_bed_file_name)
    control_bed_iterator = HTSeq.BED_Reader(red_rem_control_file_name)

    print "Partition the genome in windows... \n"

    # make dictionary of reads and windows and count total reads
    # read_dict: keys are chromosomes and values are a list of read positions
    # window_dict: keys are chromosomes and values are a list of window start coordinates for windows containing reads
    read_dict, window_dict, total_reads = SICER_MS.make_dict_of_reads_and_windows(
        bed_iterator, genome, opt.fragment_size, opt.window_size)

    # make dictionary of reads and windows and count total reads for control file
    control_read_dict, control_window_dict, control_total_reads = SICER_MS.make_dict_of_reads_and_windows(
        control_bed_iterator, genome, opt.fragment_size, opt.window_size)
    print "Count reads in windows... \n"
    # get the read count and normalized read count of all windows in the bed file
    # create window counts dictionary window_counts_dict
    # add the window's score to the genomic array normalized_window_array
    # window_counts_dict: keys are chromosomes and values are a list of smaller
    # lists of the format [window_start, read_count, score] (the score will be calculated later)
    window_counts_dict, normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        window_dict, read_dict, genome, 1000000, total_reads, opt.window_size)

    # get the read count and score of all windows in the control file file
    control_window_counts_dict, control_normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        control_window_dict, control_read_dict, genome, 1000000,
        control_total_reads, opt.window_size)

    # write bedgraph file of normalized windows
    normalized_window_array.write_bedgraph_file(normalized_bedgraph_file_name)

    # write bedgraph file of normalized windows for control
    control_normalized_window_array.write_bedgraph_file(
        control_normalized_bedgraph_file_name)

    # write graph file for control reads
    SICER_MS.write_graph_file(control_window_counts_dict, opt.window_size,
                              control_graph_file_name, genome)

    print "Find candidate islands exhibiting clustering... \n"

    # finds all islands using the dictionary of window counts and generates .scoreisland file
    # returns a genomic array island_array of all island tag counts and a list of islands (in dictionary format)
    # the dictionary keys of each island are 'island', 'score', and 'chip' (the read count)
    # also writes graph file
    island_array, islands_list = SICER_MS.find_islands(
        window_counts_dict, total_reads, opt.gap_size, opt.window_size, genome,
        opt.genome_fraction, e_value, score_island_file_name, graph_file_name,
        2)

    # count the number of reads in the islands for both chip and control
    # returns updated list of islands including chip and control read counts and the total reads located in islands for
    # both island dictionaries
    islands_list, total_chip_reads_in_islands, total_control_reads_in_islands = SICER_MS.count_reads_in_islands(
        islands_list, read_dict, control_read_dict)

    print "Total chip reads in islands: " + str(total_chip_reads_in_islands)
    print "Total control reads in islands: " + str(
        total_control_reads_in_islands)

    # calculate the p-value and fold change (number of chip reads versus number of expected chip reads) for all islands
    # calculate alpha value for all islands
    # write island summary file
    # return list of islands islands_list; each island is a dictionary with keys 'island' (HTSeq genomic interval),
    # 'chip' (number of chip reads), 'control' (number of control reads), 'pvalue', 'fc' (fold change), and 'alpha'
    # also return HTSeq Genomic Array of all islands with their chip read count
    islands_list, island_array = SICER_MS.get_pvalue_fc_write_islandsummary(
        islands_list, total_reads, control_total_reads, opt.genome_fraction,
        genome, islandsummary_file_name)

    print "\nIdentify significant islands using FDR criterion..."
    # given list of islands as dictionaries, filter all islands with alpha values meeting the significance threshold to write two files
    # write filtered island file (format: chr  start   end    chip_reads   control_reads   pvalue  fc  alpha)
    # write island bed file (format: chr   start   end     chip_reads)
    filtered_islands_list, filtered_island_array = SICER_MS.filter_islands_by_significance(
        islands_list, filtered_island_file_name, island_bed_file_name, FDR,
        genome)

    print "\nFilter reads with identified significant islands...\n"
    # given HTSeq bed_iterator and HTSeq Genomic Array that has chip read count assigned to all islands
    # finds all reads in the bed_iterator that are located in islands
    # if a read is located in an island, it is written to a bed file
    # creates a genomic array of all windows that have reads located in islands
    # returns a dictionary containing all reads located in islands and a dictionary containing all windows in islands
    # dictionary format: keys are chromosomes, values are sorted lists of all read/window positions
    islandfiltered_reads_dict, islandfiltered_windows_dict, total_chip_reads_in_islands = SICER_MS.filter_raw_tags_by_islands(
        bed_iterator, filtered_island_array, island_filtered_file_name,
        opt.fragment_size, opt.window_size, genome)

    # calculate the number of island filtered reads in all the windows comprising the islands
    # calculate normalized read count for each window
    # add the window's normalized read count to a genomic array (islandfilt_normalized_window_array)
    # the islandfilt_normalized_window_array will be used to write a bedgraph file
    islandfilt_window_counts_dict, islandfilt_normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        islandfiltered_windows_dict, islandfiltered_reads_dict, genome,
        1000000, total_chip_reads_in_islands, opt.window_size)

    # write bedgraph file of normalized filtered islands
    islandfilt_normalized_window_array.write_bedgraph_file(
        islandfiltered_normalized_bedgraph_file_name)
Example #15
def read_bed(ant_file, stranded=True):
    bed = HTSeq.BED_Reader(ant_file)
    ant = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
    for feature in bed:
        ant[feature.iv] += feature.name
    return ant
Example #16
def main(argv):
    #establish options for running program
    parser = OptionParser()
    parser.add_option("-a",
                      "--islandfile1",
                      action="store",
                      type="string",
                      dest="file1",
                      metavar="<file>",
                      help="name of first islands file")
    parser.add_option("-b",
                      "--islandfile2",
                      action="store",
                      type="string",
                      dest="file2",
                      metavar="<file>",
                      help="name of second islands file")
    parser.add_option("-o",
                      "--outputfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name")
    (opt, args) = parser.parse_args(argv)

    print "########## Island Union Program ##########"

    #add file1 islands to tempfile
    os.system('cat %s > %s' % (opt.file1, "tempfile.bed"))

    #add file2 islands to tempfile
    os.system('cat %s >> %s' % (opt.file2, "tempfile.bed"))

    #sort tempfile and store in sortedfile
    os.system('sort -k1,1 -k2,3n %s > %s' % ("tempfile.bed", "sortedfile.bed"))

    #instantiate HTSeq bediterator
    bed_iterator = HTSeq.BED_Reader("sortedfile.bed")

    outfile = open(opt.outfile, 'w')

    total_islands = 0
    tempGI = None
    currentGI = None
    #iterate through GenomicInterval objects
    for read in bed_iterator:
        if tempGI is None:
            currentGI = read.iv
            tempGI = read.iv
        else:
            tempGI = currentGI
            currentGI = read.iv

            #use genomicInterval overlaps method
            if tempGI.overlaps(currentGI):
                currentGI.extend_to_include(tempGI)

            else:
                newLine = str(tempGI.chrom) + "\t" + str(
                    tempGI.start) + "\t" + str(tempGI.end) + "\n"
                outfile.write(newLine)
                total_islands += 1

    #add last entry to union file
    newLine = str(currentGI.chrom) + "\t" + str(currentGI.start) + "\t" + str(
        currentGI.end) + "\n"
    outfile.write(newLine)
    total_islands += 1

    outfile.close()

    #remove tempfile.bed and sortedfile.bed
    os.system('rm %s %s' % ("tempfile.bed", "sortedfile.bed"))

    print "Total number of islands in islands_union_file: " + str(
        total_islands)
Example #17
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-b",
        "--bam_file",
        action="store",
        type="string",
        dest="bamfile",
        help="paired-end bam file to be condensed into a single-ended bam file"
    )
    (opt, args) = parser.parse_args(argv)

    #Sorting the input paired-end BAM file by name, so paired reads are adjacent
    os.system('samtools sort -O BAM -n %s > %s' %
              (opt.bamfile, opt.bamfile[:-4] + "_sorted.bam"))

    #BAM file is converted to BED format
    os.system(
        'bamToBed -i %s > %s' %
        (opt.bamfile[:-4] + "_sorted.bam", opt.bamfile[:-4] + "_sorted.bed"))

    #BED iterator created to traverse BED file
    bed_iterator = HTSeq.BED_Reader(opt.bamfile[:-4] + "_sorted.bed")

    outfile = open(opt.bamfile[:-4] + "_sorted_condensed.bed", 'w')

    #algorithmic logic -> combine paired reads into a single read
    pair1 = None
    pair2 = None
    oddRead = None
    new_start = 0
    new_end = 0
    pos = 0
    new_strand = ''
    singleRead_iv = None
    line_count = 0
    error_count = 0
    finalcount = 0

    for read in bed_iterator:
        line_count += 1
        if line_count % 2 != 0:
            oddRead = read.iv

        elif line_count % 2 == 0:
            pair1 = oddRead
            pair2 = read.iv
            print "Read 1 chr: " + str(
                pair1.chrom) + " and Read 2 chr: " + str(pair2.chrom)

            if str(pair1.chrom) == str(pair2.chrom):
                #determines start and end of single read
                new_start = min([pair1.start, pair2.start])
                new_end = max([pair1.end, pair2.end])

                #position calculation: integer midpoint of the combined fragment
                pos = (new_start + new_end) // 2

                #decides proper strand for new single read (strand is that of leftmost read in pair)
                if pair1.start < pair2.start:
                    new_strand = pair1.strand
                else:
                    new_strand = pair2.strand

                #creates new single read with length 1 bp
                singleRead_iv = HTSeq.GenomicInterval(pair1.chrom, pos,
                                                      pos + 1, new_strand)

                #writes read to output BED file
                write_to_outfile(outfile, singleRead_iv)
                finalcount = finalcount + 1
            else:
                print "Error: paired reads not on same chromosome."
                error_count = error_count + 1

    print "pairedRead to singleRead conversion is done running.\nThere were " + str(
        error_count) + " pairs on different chromosomes"
    print "Reads written to outfile: " + str(finalcount)
def main(argv):
    parser = OptionParser()
    parser.add_option("-g", "--genome", action="store", type="string", dest="genome_data", help="species, mm9, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a", "--rawreadfileA", action="store", type="string", dest="readfileA", metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b", "--rawreadfileB", action="store", type="string", dest="readfileB", metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>",
                      help="island read count summary file")


    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    # create HTSeq BED_Readers for BED files
    file_A_iterator = HTSeq.BED_Reader(opt.readfileA)
    file_B_iterator = HTSeq.BED_Reader(opt.readfileB)
    island_file_iterator = HTSeq.BED_Reader(opt.islandfile)

    genome_file = opt.sicer_dir + "/genomes/" + opt.genome_data
    genome = get_genome_data(genome_file)

    read_dict_A, A_library_size = make_dict_of_reads(file_A_iterator, genome, opt.fragment_size)

    read_dict_B, B_library_size = make_dict_of_reads(file_B_iterator, genome, opt.fragment_size)

    print "Library size of " + opt.readfileA + ":  " + str(A_library_size)
    print "Library size of " + opt.readfileB + ":  " + str(B_library_size)

    A_reads_in_islands = 0
    B_reads_in_islands = 0

    islands_list = []

    island_A_readcount_list = []
    island_B_readcount_list = []

    # Find read counts on the islands
    for region in island_file_iterator:

        read_count_A = get_read_count_in_region(region.iv, read_dict_A)
        A_reads_in_islands += read_count_A
        island_A_readcount_list.append(read_count_A)
        read_count_B = get_read_count_in_region(region.iv, read_dict_B)
        B_reads_in_islands += read_count_B
        island_B_readcount_list.append(read_count_B)

        island = {'region': region.iv, 'A_count': read_count_A, 'B_count': read_count_B}
        islands_list.append(island)


    print "Total number of A reads on islands is: " + str(A_reads_in_islands)
    print "Total number of B reads on islands is: " + str(B_reads_in_islands)

    library_scaling_factor = A_library_size * 1.0 / B_library_size
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []

    # Calculate the p value.
    for island in islands_list:
        A_count = island['A_count']
        B_count = island['B_count']
        pvalue_A_vs_B = pvalue(A_count, B_count, library_scaling_factor, pseudo_count)
        pvalue_A_vs_B_list.append(pvalue_A_vs_B)
        pvalue_B_vs_A = pvalue(B_count, A_count, 1 / library_scaling_factor, pseudo_count)
        pvalue_B_vs_A_list.append(pvalue_B_vs_A)

    # Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    # Output the islands read counts, normalized read counts, fc, pvalue both ways
    scaling_factor = 1000000
    outfile = open(opt.out_file, 'w')
    outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" \
              + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + \
              "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A" + "\n"
    outfile.write(outline)
    ii = 0
    for island in islands_list:
        A_count = island['A_count']
        B_count = island['B_count']
        normalized_A = A_count / float(A_library_size) * scaling_factor
        normalized_B = B_count / float(B_library_size) * scaling_factor
        fc_A_vs_B = ((A_count + pseudo_count) * 1.0 / (B_count + pseudo_count)) / library_scaling_factor
        fc_B_vs_A = ((B_count + pseudo_count) * 1.0 / (A_count + pseudo_count)) * library_scaling_factor
        outline = island['region'].chrom + "\t" + str(island['region'].start) + "\t" + str(island['region'].end) + "\t" + str(
                        A_count) + "\t" + str(normalized_A) + "\t" + str(B_count) + "\t" + str(normalized_B) + "\t" + str(
                        fc_A_vs_B) + "\t" + str(pvalue_A_vs_B_list[ii]) + "\t" + str(fdr_A_vs_B_list[ii]) + "\t" + str(
                        fc_B_vs_A) + "\t" + str(pvalue_B_vs_A_list[ii]) + "\t" + str(fdr_B_vs_A_list[ii]) + "\n"
        outfile.write(outline)
        ii += 1

    # Calculate the correlations using normalized read counts
    A_array = scipy.array(island_A_readcount_list)
    B_array = scipy.array(island_B_readcount_list)

    # Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print "Pearson's correlation is: " + str(pearson[0]) + " with p-value " + str(pearson[1])
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print "Spearman's correlation is: " + str(spearman[0]) + " with p-value " + str(spearman[1])
# get genomic arrays
peakregions = dict()
peakcutoffs = dict()
samplenames = dict()
maxpeaklength = dict()
for samp in bedfiles:
	# get sample name (str.strip removes characters, not a suffix, so use replace)
	samplenames[samp] = samp.replace('noM_peaks.bed', '')
	output.write('\t' + samplenames[samp])
	# get peak locations		
	peakregions[samp] = HTSeq.GenomicArray("auto",stranded=False,typecode='d')
	maxpeaklength[samp] = 0
	scoreslist = list()
	peakfile = HTSeq.BED_Reader(beddirectory + samp)
	totbases = 0
	for peak in peakfile:
		peakregions[samp][peak.iv] = peak.score
		scoreslist.append(peak.score)
		peaklength = peak.iv.end - peak.iv.start  # HTSeq intervals are half-open, so no +1
		totbases += peaklength
		maxpeaklength[samp] = max(maxpeaklength[samp],peaklength)
	# find score cutoff for this particular library
	sortedscores = sorted(scoreslist)
	scorecutind = len(sortedscores) - maxpeaks - 1  # -1 because Python indexing starts at 0
	peakcutoffs[samp] = sortedscores[scorecutind]
	covoutput.write(samplenames[samp] + '\t' + str(totbases) + '\n')
covoutput.close()	
output.write('\n')
Example #20
def load_bed(bed_path):
    check_file_exist(bed_path)
    # BED_Reader is a lazy iterator over the records of the BED file
    bed_reader = HTSeq.BED_Reader(bed_path)

    return bed_reader