Code Example #1
File: segment.py  Project: orenlivne/ober
def form_intersections(segments, full_output=False, samples=None, snp_range=None):
    '''Return a list of bins whose i-th element is the collection of samples that are
    identical over the sub-segment [endpoints[i], endpoints[i+1]]. The union of all
    segment endpoints is taken and sorted in ascending order; consecutive endpoints form
    the set of intersections of all pairwise_ibd segments. Each ibd_pairs entry is then
    scanned and accumulated into the bins of the sub-segments it intersects.'''
    non_unique_endpoints = [s[0] for s in segments]
    # print 'snp_range', snp_range
    if snp_range is not None:
        non_unique_endpoints += [(snp_range[START], snp_range[STOP])]
    endpoints = sorted(reduce(set.union, non_unique_endpoints, set([])))
    
    # This index makes it easier to random-access bins
    endpoint_index = dict((x, index) for (index, x) in enumerate(endpoints))
    sub_segments = [(endpoints[i], endpoints[i + 1]) for i in xrange(len(endpoints) - 1)]
    # print 'endpoints', endpoints
    # print 'sub_segments', sub_segments
    
    n = len(endpoints)
    intersections = [list([]) for _ in xrange(n - 1)]
    if full_output:
        value_segment_tuples = [list([]) for _ in xrange(n - 1)]
        segment_local_value_tuples = []

    # Sweep segment intersections (bins) in order, accumulate segment info into each one
    for k, segment in enumerate(segments):
        snp = segment[0]
        for i in xrange(endpoint_index[snp[0]], endpoint_index[snp[1]]):
            intersections[i].append(segment[1])
            if full_output:
                value_segment_tuples[i] += [(x, k) for x in segment[1]]
                segment_local_value_tuples += [(k, (i, x)) for x in segment[1]]

    if full_output:
        # print 'segment_local_value_tuples', segment_local_value_tuples
        segment_to_local_values = util.to_set_dict(segment_local_value_tuples)
        # print 'segment_to_local_values', segment_to_local_values
        # print [x for s in segments for x in s[1]]
        all_samples = samples if samples is not None else util.union_all(x for s in segments for x in s[1])
        # print 'all_samples', all_samples
        all_local_values = set(itertools.product(xrange(n - 1), all_samples))
        dangling_local_values = all_local_values - util.union_all(*segment_to_local_values.values())
        # print 'dangling_local_values', dangling_local_values
        value_to_segments = [util.to_set_dict(d) for d in value_segment_tuples]
        # print 'value_to_segments', value_to_segments 
        # NOTE: the result of the following call is never used
        util.to_set_dict((x, k) for k, s in enumerate(segments) for x in s[1])
        return (sub_segments, intersections,
                value_to_segments, segment_to_local_values, dangling_local_values)
    else:
        return (sub_segments, intersections)
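
On toy data the endpoint sweep above is easier to follow. The snippet below is a minimal, self-contained sketch of the same idea (illustration only: it omits the full_output bookkeeping, does not use the project's util helpers, and assumes each segment is laid out as ((start, stop), samples), as in the code above).

def sketch_form_intersections(segments):  # hypothetical helper, not part of the project
    # Union of all segment endpoints, sorted ascending; consecutive endpoints define the bins
    endpoints = sorted(set(x for (snp, _) in segments for x in snp))
    endpoint_index = dict((x, i) for i, x in enumerate(endpoints))
    sub_segments = [(endpoints[i], endpoints[i + 1]) for i in range(len(endpoints) - 1)]
    # Accumulate each segment's samples into every bin (sub-segment) it covers
    intersections = [[] for _ in range(len(endpoints) - 1)]
    for (start, stop), samples in segments:
        for i in range(endpoint_index[start], endpoint_index[stop]):
            intersections[i].append(samples)
    return sub_segments, intersections

# Example: two overlapping segments yield three sub-segments.
# sketch_form_intersections([((0, 10), {1, 2}), ((5, 15), {2, 3})])
# -> ([(0, 5), (5, 10), (10, 15)], [[{1, 2}], [{1, 2}, {2, 3}], [{2, 3}]])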
Code Example #2
File: ld_graph.py  Project: orenlivne/ober
def _compute_block_frames(g, snp_dao):
    '''Yield frame numbers (colors) within each SNP LD block using greedy coloring.'''
    # This is how to turn on database logging to debug query slowness:
    #    logging.basicConfig()
    #    logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
    #    logging.getLogger('sqlalchemy.pool.QueuePool').setLevel(logging.DEBUG)
    for block in nx.connected_components(g):
        snps = list(snp_dao.get_snps(block))
        position = dict((snp.name, snp.bp) for snp in snps)
        #        print '#' * 50
        #        print 'block', block
        #        print '#' * 50
        #        print 'snps', snps
        #        print 'position', position
        h = g.subgraph(block)
        c = greedy_coloring(h, position)
        frames = util.to_set_dict((v, k) for (k, v) in c.iteritems())
        yield frames
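
The last two lines invert the node-to-color map c into frames, a color-to-nodes map. A plain-Python stand-in for that inversion is sketched below, assuming util.to_set_dict groups (key, value) pairs into a dict of sets; invert_coloring is a hypothetical name used only for illustration.

def invert_coloring(coloring):  # hypothetical stand-in for util.to_set_dict((v, k) ...)
    # Group nodes by their assigned color: {color: set of nodes}
    frames = {}
    for node, color in coloring.items():
        frames.setdefault(color, set()).add(node)
    return frames

# Example: three SNPs colored with two frames.
# invert_coloring({'rs1': 0, 'rs2': 1, 'rs3': 0}) -> {0: {'rs1', 'rs3'}, 1: {'rs2'}}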
Code Example #3
File: TestLdGraph.py  Project: orenlivne/ober
def group_by_value(d):
    '''Invert a dictionary: return a dictionary whose keys are the values of d and whose
    value for each is the set of keys in d that map to it.'''
    return util.to_set_dict((v, k) for k, v in d.iteritems())
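
A brief usage sketch, assuming util.to_set_dict groups the generated (value, key) pairs into a dict of sets:

# d = {'a': 1, 'b': 1, 'c': 2}
# group_by_value(d) -> {1: {'a', 'b'}, 2: {'c'}}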
Code Example #4
File: extract_genotypes.py  Project: orenlivne/ober
def extract_genotypes(input_file,
                      var_file_prefix=os.environ['OBER_DATA'] +
                      '/cgi/all.2012-09-20.testvar.chr',
                      index_file=os.environ['OBER_DATA'] +
                      '/cgi/README.assembly_sample_subject.csv',
                      allele_swap=False,
                      debug=False,
                      custom_id=False):
    '''Extract SNP data from tabixed CGI var files and convert to a Genotype object.
    Base-pair ranges are read from input_file in the format: chr start_bp end_bp
    * Multiple ranges can be specified (one per line).
    * If input_file is '-', input is read from standard input.
    * Use 23 for chrX and 24 for chrY. (Note: chromosomes X, Y and M are not yet supported. TBA.)
    * If allele_swap is True, allele labels are swapped so that 1 is always the major allele and 2 is always the minor allele.'''

    # Read CGI-ID-to-our-sample-ID (FINDIV) dictionary
    cgi_id_to_sample_id = db_gene.cgi.ids.cgi_id_to_sample_id(index_file)

    # Read SNP ranges from input_file; chromosome numbers are assumed to be numeric (typically only autosomes are used)
    input_snps = [(int(line[0]), (int(line[1]), int(line[2]))) for line in
                  csv.reader(input_file, skipinitialspace=True, delimiter=' ')]
    num_snps = len(input_snps)

    # Holds the sorted list of sample IDs; assumed to be the same for all chromosomes;
    # if not, raise an exception in the chromosome loop below
    sample_id = None

    # Holds SNP metadata
    snp = np.zeros(
        (num_snps, ),
        dtype=[
            ('chrom', np.uint8),  # Chromosome # containing the SNP
            ('name', np.chararray),  # SNP name (e.g., 'rs...')
            ('dist_cm', np.float),  # Genetic distance from beginning of chromosome
            ('base_pair', np.uint)  # Base pair position on chromosome
        ])
    # Genetic map (allele letters of each SNP)
    genetic_map = np.zeros((num_snps, ), dtype=np.object)

    # Sample genotypes at all SNPs
    data = None

    # Bulk-extract data using tabix for each chromosome
    row = 0
    for chrom, bps in util.to_set_dict(input_snps).iteritems():
        chrom_letter = _to_letter[chrom]
        var_file = '%s%s.tsv.gz' % (var_file_prefix, chrom_letter)

        # Read header line from the compressed var file of this chromosome to get CGI IDs
        for line in csv.reader(gzip.GzipFile(var_file),
                               skipinitialspace=True,
                               delimiter='\t'):
            raw_id = [cgi_id_to_sample_id[cgi_id] for cgi_id in line[8:]]
            chr_sample_id, index = util.sort_with_index(np.array(raw_id))
            break
        if sample_id is None:
            # Save global IDs
            sample_id = chr_sample_id
            num_samples = len(sample_id)
        elif not np.array_equal(chr_sample_id, sample_id):
            # Make sure the IDs are the same across all chromosomes
            raise ValueError(
                'Sample IDs are not the same across all chromosomes:\n' +
                sample_id + '\n' + chr_sample_id)

        # Read SNP data from the compressed chromosome archive using tabix; append to genotype array
        if data is None:
            data = np.zeros((num_snps, num_samples, 2), dtype=np.byte)
        _, output, _ = util.run_command(
            _TABIX_CMD + ' ' + var_file + ' ' +
            ' '.join('chr%s:%d-%d' % (chrom_letter, x[0], x[1]) for x in bps),
            verbose=debug)
        # Apparently there's a tabix bug: some records might not be contained in the requested region.
        # Only process lines that are within the requested regions
        relevant_lines = [
            a for a in csv.reader(StringIO.StringIO(output),
                                  skipinitialspace=True,
                                  delimiter='\t')
            if np.array([(x[0] <= int(a[2]))
                         & (int(a[3]) <= x[1]) and a[4] == 'snp'
                         for x in bps]).any()
        ]
        for line in relevant_lines:
            if len(snp) <= row:
                # Double the capacity of the metadata/genotype arrays; compute the new
                # size once so that snp, genetic_map and data stay the same length
                new_size = 2 * len(snp)
                snp.resize(new_size)
                genetic_map.resize(new_size)
                data.resize((new_size, num_samples, 2), refcheck=False)
            # If the SNP name is missing (or custom_id is set), create a unique identifier by concatenating the chromosome, start position and minor allele
            snp[row] = (chrom, line[7] if line[7] and not custom_id else
                        'chr%s_%d_%s' % (chrom_letter, int(line[2]), line[6]),
                        0, int(line[3]))
            genetic_map[row] = line[5] + line[6]
            # print line[8:]
            if chrom <= 22:
                # Autosomes - genotype data (two letters)
                data[row, :] = np.array(
                    sum(([
                        CGI_LETTER_TO_ALLELE[item[0]],
                        CGI_LETTER_TO_ALLELE[item[1]]
                    ] for item in line[8:]), [])).reshape(num_samples,
                                                          2)[index, :]
            else:
                # X/Y chromosomes: if one letter is available => duplicate it to make the individual homozygous
                # (or 00 for a single missing value)
                data[row, :] = np.array(
                    sum(([
                        CGI_LETTER_TO_ALLELE[item[0]],
                        CGI_LETTER_TO_ALLELE[item[1 if len(item) == 2 else 0]]
                    ] for item in line[8:]), [])).reshape(num_samples,
                                                          2)[index, :]
            d = data[row, :]
            _, f2 = im.gt.allele_frequencies(d)
            if allele_swap and f2 > 0.7:  # Minor allele has a much larger frequency than the major allele, swap them
                print 'chr%d:%d-%d (%s): Major-minor allele swap detected, fixing' % \
                (chrom, int(line[2]), int(line[3]), line[7])
                im.gt.swap_alleles(d)
            row += 1

    # Not all requested SNPs may be found; restrict the data arrays to the rows actually filled
    snp = snp[0:row]
    genetic_map = genetic_map[0:row].tolist()
    data = data[0:row]
    print 'Found %d SNP(s) in %d requested base-pair range(s)' % (row,
                                                                  num_snps)

    # Construct Genotype object; write in desired output format
    g = im.factory.GenotypeFactory.new_instance('genotype',
                                                data,
                                                snp,
                                                sample_id=sample_id)
    g.map = genetic_map
    return g
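
The util.to_set_dict(input_snps) call at the top of the chromosome loop groups the requested base-pair ranges by chromosome. A plain-Python stand-in for that grouping is sketched below; group_ranges_by_chromosome is a hypothetical name used only for illustration.

def group_ranges_by_chromosome(input_snps):  # hypothetical stand-in for util.to_set_dict(input_snps)
    # Group (chrom, (start_bp, end_bp)) tuples into {chrom: set of (start_bp, end_bp) ranges}
    by_chrom = {}
    for chrom, bp_range in input_snps:
        by_chrom.setdefault(chrom, set()).add(bp_range)
    return by_chrom

# Example: three requested ranges on two chromosomes.
# group_ranges_by_chromosome([(1, (100, 200)), (1, (500, 600)), (2, (50, 80))])
# -> {1: {(100, 200), (500, 600)}, 2: {(50, 80)}}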