Example #1
def conservation_serial(bed_file, window_sizes, chrom_bigwig_dict):
    # output header fields
    fields = ['name', 'position', 'transcript_length', 'mean']
    fields.extend(map(str, window_sizes))
    print '\t'.join(fields)
    # process bed file
    for f in BEDFeature.parse(open(bed_file)):
        # retrieve conservation data
        bigwig_file = chrom_bigwig_dict[f.chrom]
        arr = extract_bigwig_data(f, bigwig_file)
        # measure conservation at various sliding windows
        window_scores = []
        for window_size in window_sizes:
            window_scores.append(best_sliding_window(arr, window_size,
                                                     np.mean))
        # calc mean conservation
        finitearr = arr[np.isfinite(arr)]
        if len(finitearr) == 0:
            mean_cons = np.nan
        else:
            mean_cons = np.mean(finitearr)
        fields = [
            f.name,
            '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand),
            str(len(arr)),
            str(mean_cons)
        ]
        fields.extend(map(str, window_scores))
        print '\t'.join(fields)
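
Example #1 relies on a best_sliding_window helper that is not shown in this listing. Below is a minimal sketch of what it could look like, assuming it scores every window of the given size with the supplied function (skipping windows with no finite values) and returns the best score; the real helper's NaN handling and tie-breaking may differ.

import numpy as np

def best_sliding_window(arr, window_size, score_func):
    # hypothetical sketch: score each window of length window_size
    # over its finite values and keep the best (largest) score
    window_size = min(window_size, len(arr))
    best = np.nan
    for start in xrange(len(arr) - window_size + 1):
        window = arr[start:start + window_size]
        finite = window[np.isfinite(window)]
        if len(finite) == 0:
            continue
        score = score_func(finite)
        if np.isnan(best) or score > best:
            best = score
    return best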
Example #3
 def _consumer(input_queue, output_queue):
     while True:
         line = input_queue.get()
         if line is None:
             break
         f = BEDFeature.from_string(line)
         # retrieve conservation data
         bigwig_file = chrom_bigwig_dict[f.chrom]
         arr = extract_bigwig_data(f, bigwig_file)
         # measure conservation at various sliding windows
         window_scores = []
         for window_size in window_sizes:
             window_scores.append(
                 best_sliding_window(arr, window_size, np.mean))
         # measure average conservation
         finitearr = arr[np.isfinite(arr)]
         if len(finitearr) == 0:
             mean_cons = np.nan
         else:
             mean_cons = np.mean(finitearr)
         fields = [
             f.name,
             '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand),
             str(len(arr)),
             str(mean_cons)
         ]
         fields.extend(map(str, window_scores))
         result = '\t'.join(fields)
         output_queue.put(result)
     output_queue.put(None)
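
The _consumer above is a nested function: chrom_bigwig_dict and window_sizes come from the enclosing scope. A minimal sketch of how such a worker is typically driven, assuming a fork-based platform (nested functions cannot be pickled on Windows) and a parent function that owns the queues; the stand-in worker body below just echoes its input and should be replaced by the body from Example #3.

import multiprocessing

def conservation_parallel(bed_file, window_sizes, chrom_bigwig_dict,
                          num_processes):
    # hypothetical driver: _consumer is defined in this scope so the
    # closure picks up window_sizes and chrom_bigwig_dict
    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()

    def _consumer(input_queue, output_queue):
        # stand-in body; substitute the full body from Example #3
        while True:
            line = input_queue.get()
            if line is None:
                break
            output_queue.put(line.rstrip())
        output_queue.put(None)

    workers = [multiprocessing.Process(target=_consumer,
                                       args=(input_queue, output_queue))
               for i in xrange(num_processes)]
    for w in workers:
        w.start()
    # one BED line per task, then one sentinel per worker
    for line in open(bed_file):
        input_queue.put(line)
    for i in xrange(num_processes):
        input_queue.put(None)
    # drain results before joining so workers never block on a full
    # output queue; stop once every worker has sent its None sentinel
    finished = 0
    while finished < num_processes:
        result = output_queue.get()
        if result is None:
            finished += 1
        else:
            print result
    for w in workers:
        w.join()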
Example #4
def conservation_serial(bed_file, chrom_bigwig_dict, sig_threshold,
                        window_sizes, num_processes):
    # process bed file
    for f in BEDFeature.parse(open(bed_file)):
        fields = feature_conservation(f, chrom_bigwig_dict, sig_threshold,
                                      window_sizes)
        print '\t'.join(fields)
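
Examples #4 and #6 factor the per-feature work into a feature_conservation helper that is not shown. A hypothetical reconstruction from Example #1 follows; how sig_threshold is used (here, the fraction of scored positions at or above the threshold) is purely an assumption.

import numpy as np

def feature_conservation(f, chrom_bigwig_dict, sig_threshold, window_sizes):
    # hypothetical reconstruction; extract_bigwig_data and
    # best_sliding_window are the helpers used in Example #1
    arr = extract_bigwig_data(f, chrom_bigwig_dict[f.chrom])
    finitearr = arr[np.isfinite(arr)]
    if len(finitearr) == 0:
        mean_cons = np.nan
        sig_frac = np.nan
    else:
        mean_cons = np.mean(finitearr)
        # assumed meaning of sig_threshold: fraction of scored
        # positions at or above the significance threshold
        sig_frac = np.mean(finitearr >= sig_threshold)
    fields = [f.name,
              '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand),
              str(len(arr)),
              str(mean_cons),
              str(sig_frac)]
    for window_size in window_sizes:
        fields.append(str(best_sliding_window(arr, window_size, np.mean)))
    return fields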
Example #6
 def _consumer(input_queue, output_queue):
     while True:
         line = input_queue.get()
         if line is None:
             break
         f = BEDFeature.from_string(line)
         fields = feature_conservation(f, chrom_bigwig_dict, sig_threshold,
                                       window_sizes)
         output_queue.put('\t'.join(fields))
     output_queue.put(None)
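
All of these examples assume a BEDFeature class with parse and from_string constructors and chrom, tx_start, tx_end, name, strand and exons attributes. A hypothetical minimal stand-in, assuming BED12 input; the project's real class may carry more fields.

class BEDFeature(object):
    # hypothetical minimal stand-in for the BEDFeature class the
    # examples import; attribute names match the usage above
    def __init__(self, chrom, tx_start, tx_end, name, strand, exons):
        self.chrom = chrom
        self.tx_start = tx_start
        self.tx_end = tx_end
        self.name = name
        self.strand = strand
        self.exons = exons

    @staticmethod
    def from_string(line):
        fields = line.strip().split('\t')
        chrom = fields[0]
        tx_start = int(fields[1])
        tx_end = int(fields[2])
        name = fields[3] if len(fields) > 3 else '.'
        strand = fields[5] if len(fields) > 5 else '.'
        if len(fields) >= 12:
            # BED12: blockStarts are relative to chromStart
            sizes = [int(x) for x in fields[10].rstrip(',').split(',')]
            starts = [int(x) for x in fields[11].rstrip(',').split(',')]
            exons = [(tx_start + s, tx_start + s + sz)
                     for s, sz in zip(starts, sizes)]
        else:
            exons = [(tx_start, tx_end)]
        return BEDFeature(chrom, tx_start, tx_end, name, strand, exons)

    @staticmethod
    def parse(fileh):
        for line in fileh:
            if not line.strip():
                continue
            if line.startswith(('#', 'track', 'browser')):
                continue
            yield BEDFeature.from_string(line)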
Example #8
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('bed_file')
    args = parser.parse_args()

    trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for f in BEDFeature.parse(open(args.bed_file)):
        tree = trees[f.chrom]
        for start, end in f.exons:
            tree.insert(start, end, 1)
    footprint = 0
    for chrom in sorted(trees):
        chromprint = 0
        tree = trees[chrom]
        for start, end, indexes in tree.getregions():
            chromprint += (end - start)
        #print chrom, chromprint
        footprint += chromprint
    print 'total', footprint
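
ClusterTree here presumably comes from bx-python (bx.intervals.cluster) and is used to merge overlapping or adjacent exons before summing the covered bases. The same footprint can be computed without that dependency using a sort-and-sweep merge; a minimal sketch, reusing the BEDFeature stand-in above:

import collections

def bed_footprint(bed_file):
    # collect exon intervals per chromosome
    exons = collections.defaultdict(list)
    for f in BEDFeature.parse(open(bed_file)):
        for start, end in f.exons:
            exons[f.chrom].append((start, end))
    footprint = 0
    for chrom in sorted(exons):
        cur_start = None
        cur_end = None
        # sweep sorted intervals, merging any that overlap or touch
        for start, end in sorted(exons[chrom]):
            if cur_start is None:
                cur_start, cur_end = start, end
            elif start <= cur_end:
                cur_end = max(cur_end, end)
            else:
                footprint += cur_end - cur_start
                cur_start, cur_end = start, end
        if cur_start is not None:
            footprint += cur_end - cur_start
    return footprint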
Example #11
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', type=int, dest='num_samples', default=100000)
    parser.add_argument('genome_bed_file')
    parser.add_argument('region_lengths_file')
    # parse arguments
    args = parser.parse_args()
    genome_bed_file = args.genome_bed_file
    region_lengths_file = args.region_lengths_file
    num_samples = max(1, args.num_samples)
    # check arguments
    if not os.path.exists(genome_bed_file):
        parser.error('genome bed file %s not found' % (genome_bed_file))
    if not os.path.exists(region_lengths_file):
        parser.error('region lengths file %s not found' %
                     (region_lengths_file))
    # get transcript lengths from transcript bed file
    logging.debug('Reading region lengths')
    region_lengths = []
    with open(region_lengths_file) as f:
        region_lengths.extend(int(x.strip()) for x in f)
    # first pass to index windows
    logging.debug('Reading genome bed file')
    features = []
    genome_cumsums = []
    genome_size = 0
    max_feature_size = 0
    for f in BEDFeature.parse(open(genome_bed_file)):
        length = sum((e[1] - e[0]) for e in f.exons)
        if length == 0:
            continue
        features.append(f)
        genome_cumsums.append(genome_size)
        genome_size += length
        max_feature_size = max(max_feature_size, length)
    logging.debug('Genome bed size %d' % (genome_size))
    # get windows
    trials = 0
    windows = []
    for i in xrange(num_samples):
        # choose random region length
        length = region_lengths[random.randrange(len(region_lengths))]
        while True:
            trials += 1
            # choose random window start
            genome_start = random.randrange(genome_size - length)
            feature_index = bisect.bisect_right(genome_cumsums,
                                                genome_start) - 1
            f = features[feature_index]
            feature_length = (f.tx_end - f.tx_start)
            # can use available length of this feature
            genome_feature_start = genome_cumsums[feature_index]
            feature_offset = genome_start - genome_feature_start
            avail_length = feature_length - feature_offset
            if avail_length < length:
                continue
            windows.append((f.chrom, f.tx_start + feature_offset,
                            f.tx_start + feature_offset + length))
            break
        if (i > 0) and (i % 1000) == 0:
            logging.debug('Finished %d' % (i))
    logging.debug('Sampled %d windows (%d trials)' % (len(windows), trials))
    for window in sorted(windows):
        print '\t'.join(map(str, window))
    return 0
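
The sampling step above lays all features end to end on a virtual concatenated genome: a uniform draw in [0, genome_size) is mapped back to a feature with bisect_right over the cumulative lengths, so each feature is chosen with probability proportional to its length. A small self-contained illustration of that mapping, using made-up lengths:

import bisect
import random

# hypothetical toy data: three features of lengths 100, 300 and 600
lengths = [100, 300, 600]
cumsums = []
total = 0
for length in lengths:
    cumsums.append(total)
    total += length
# uniform position on the concatenated genome, mapped back to a
# feature index and an offset within that feature
pos = random.randrange(total)
index = bisect.bisect_right(cumsums, pos) - 1
offset = pos - cumsums[index]
# feature index is hit with probability lengths[index] / float(total)
print index, offset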