Beispiel #1
0
def read_beagle_markerfile(filename, label=None):
    """
    Reads marker locations from a BEAGLE formatted file

    Every line of the file is parsed as a BeagleMarkerRecord and appended,
    in file order, to one new ChromosomeTemplate.

    :param filename: The file to be read
    :param label: An optional label to give the chromosome, since the BEAGLE
        format does not require it

    :type filename: string

    :returns: template holding one marker per line of the file
    :rtype: ChromosomeTemplate

    :raises FileFormatError: if a position is negative, or if positions are
        not strictly increasing through the file
    """
    with smartopen(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # -1 is below every accepted position, so the first marker always
        # passes the ordering check
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)
            pos = rec.pos

            if pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(pos))
            elif pos <= last_pos:
                # Equal positions are rejected too: order must be strict
                raise FileFormatError('Markers in file out of order')

            chrom.add_genotype(None, map_position=None, label=rec.label,
                               bp=pos, reference=rec.reference,
                               alternates=rec.alternates)
            last_pos = pos

    return chrom
Beispiel #2
0
def test_labelledallele_delabeler():
    """
    Delabeling a chromatid stitched together from two individuals must give
    the allele values stored on the chromatids the spans point at.
    """
    ngenos = 10  # Number of genotypes per chromosome
    if ngenos % 2 == 1:
        raise ValueError('Even number of genotypes needed')
    midpoint = ngenos // 2

    pop = Population()
    template = ChromosomeTemplate()
    for _ in range(ngenos):
        template.add_genotype()
    pop.add_chromosome(template)

    # Every chromatid carries a single repeated allele value so the source
    # of each position in the result is unambiguous
    individuals = []
    for ind_id, allele_values in ((1, (1, 2)), (2, (3, 4))):
        ind = Individual(pop, ind_id)
        ind._init_genotypes(blankchroms=False)
        for strand, value in enumerate(allele_values):
            ind.genotypes[0][strand] = Alleles([value] * ngenos)
        individuals.append(ind)
    first, second = individuals

    # First half refers to the chromatid holding 1s, second half to the
    # chromatid holding 4s
    spans = [
        InheritanceSpan(first, 0, 0, 0, midpoint),
        InheritanceSpan(second, 0, 1, midpoint, ngenos),
    ]
    chromatid = LabelledAlleles(spans=spans, chromobj=template)

    expected_value = Alleles([1] * midpoint + [4] * midpoint)

    actual_value = chromatid.delabel()
    assert all(actual_value == expected_value)
Beispiel #3
0
def read_map(filename, only=None):
    """
    Reads a whitespace-delimited genetic map into ChromosomeTemplate objects

    The first line is treated as a header and never as data. A five-column
    header marks the file as sex-specific, in which case every row must have
    five fields (chromosome, marker, position, male position, female
    position); otherwise every row must have exactly three (chromosome,
    marker, position). Only the first three fields are used. A new
    ChromosomeTemplate is started each time the chromosome field changes.

    :param filename: path of the map file to read
    :param only: optional container of marker labels; when non-empty, rows
        whose marker is not in it are skipped

    :returns: finalized ChromosomeTemplate objects in file order
    :rtype: list

    :raises ValueError: if a row has the wrong number of fields or its
        position is not a valid float
    """
    with open(filename) as f:
        header = f.readline().split()
        sex_specific_map = len(header) == 5

        chromosomes = []
        current = None
        lastchrom = None
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                continue

            if sex_specific_map:
                chrom, marker, pos, _male_pos, _female_pos = fields
            else:
                chrom, marker, pos = fields

            if only and marker not in only:
                continue

            pos = float(pos)

            if chrom != lastchrom:
                current = ChromosomeTemplate(label=chrom)
                chromosomes.append(current)
                lastchrom = chrom

            current.add_genotype(map_position=pos, label=marker)

        # Finalize each template exactly once, after all rows are read
        for template in chromosomes:
            template.finalize()
        return chromosomes
Beispiel #4
0
def read_map(mapfile):
    """
    Reads a PLINK map file into a list of ChromosomeTemplate objects

    Each non-blank line must hold four whitespace-separated fields:
    chromosome, marker label, genetic position and base-pair position.
    A new ChromosomeTemplate is started whenever the chromosome field
    differs from the previous line's.

    Arguments:
    mapfile: The file to be read

    Returns: a list of finalized ChromosomeTemplate objects, in file order
        (an empty list when the file holds no markers)

    Raises:
    FileFormatError: if a base-pair position is negative, or decreases
        within a chromosome
    ValueError: if a line does not have four fields or its positions
        cannot be converted to numbers
    """
    chroms = []
    chromosome = None
    last_chrom, last_pos = None, 0
    with open(mapfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            chrom, label, cm, pos = fields
            cm, pos = float(cm), int(pos)
            if pos < 0:
                raise FileFormatError("Invalid position: {}".format(pos))
            if chrom != last_chrom:
                # We've moved on to a new chromosome, or we've just
                # started. If there is a chromosome in progress, close it
                if chromosome is not None:
                    chromosome.finalize()
                    chroms.append(chromosome)
                # Make the next chromosome
                chromosome = ChromosomeTemplate(label=chrom)
            elif pos < last_pos:
                raise FileFormatError("Map file not sorted")
            chromosome.add_genotype(None, cm, label=label, bp=pos)
            last_chrom, last_pos = chrom, pos
    # An empty file never creates a chromosome, so there is nothing to close
    if chromosome is not None:
        chromosome.finalize()
        chroms.append(chromosome)
    return chroms
Beispiel #5
0
def read_beagle_markerfile(filename, label=None):
    """
    Reads marker locations from a BEAGLE formatted file

    One BeagleMarkerRecord is built per line, and its label, position,
    reference and alternates are added to a single ChromosomeTemplate.

    :param filename: The file to be read
    :param label: An optional label to give the chromosome, since the BEAGLE
        format does not require it

    :type filename: string

    :returns: template with the markers in file order
    :rtype: ChromosomeTemplate

    :raises FileFormatError: when a marker position is negative, or is not
        greater than the position on the preceding line
    """
    with smartopen(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # Starts below any accepted position so the first line always passes
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)
            pos = rec.pos

            if pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(pos))
            if pos <= last_pos:
                # Duplicated positions count as out of order as well
                raise FileFormatError('Markers in file out of order')

            chrom.add_genotype(None,
                               map_position=None,
                               label=rec.label,
                               bp=pos,
                               reference=rec.reference,
                               alternates=rec.alternates)
            last_pos = pos

    return chrom
Beispiel #6
0
def read_beagle_markerfile(filename, label=None):
    '''
    Reads marker locations from a beagle format file

    Arguments
    -----
    filename: The file to be read
    label: An optional label to give the chromosome, since the BEAGLE
        format does not require it

    Returns: a ChromosomeTemplate object with one marker per line of the file

    Raises: FileFormatError if a position is negative, or if positions are
        not strictly increasing through the file
    '''
    with open(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # -1 is below every accepted position, so the first marker always
        # passes the ordering check
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)
            pos = rec.pos

            if pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(pos))
            elif pos <= last_pos:
                # Equal positions are rejected too: order must be strict
                raise FileFormatError('Markers in file out of order')

            chrom.add_genotype(None, cm=None, label=rec.label, bp=pos,
                               reference=rec.reference,
                               alternates=rec.alternates)
            last_pos = pos

    return chrom
Beispiel #7
0
def test_labelledallele_delabeler():
    """
    Builds a chromatid from two inheritance spans and checks that
    delabel() returns the allele values held by the referenced chromatids.
    """
    ngenos = 10  # Number of genotypes per chromosome
    if ngenos % 2 == 1:
        raise ValueError('Even number of genotypes needed')
    half = ngenos // 2

    population = Population()
    chromosome = ChromosomeTemplate()
    for _ in range(ngenos):
        chromosome.add_genotype()
    population.add_chromosome(chromosome)

    def constant_individual(ind_id, value_a, value_b):
        # Individual whose two chromatids each repeat one allele value
        ind = Individual(population, ind_id)
        ind._init_genotypes(blankchroms=False)
        ind.genotypes[0][0] = Alleles([value_a] * ngenos)
        ind.genotypes[0][1] = Alleles([value_b] * ngenos)
        return ind

    left_source = constant_individual(1, 1, 2)
    right_source = constant_individual(2, 3, 4)

    left_span = InheritanceSpan(left_source, 0, 0, 0, half)
    right_span = InheritanceSpan(right_source, 0, 1, half, ngenos)
    chromatid = LabelledAlleles(spans=[left_span, right_span],
                                chromobj=chromosome)

    # 1s come from the first individual, 4s from the second
    expected_value = Alleles([1] * half + [4] * half)

    actual_value = chromatid.delabel()
    assert all(actual_value == expected_value)
Beispiel #8
0
def test_labelledalleles():
    """
    LabelledAlleles.founder_chromosome must compare equal to a
    LabelledAlleles built from one span running from 0 to the marker count.
    """
    ngenos = 50

    pop = Population()
    template = ChromosomeTemplate()
    for _ in range(ngenos):
        template.add_genotype()
    pop.add_chromosome(template)

    founder = Individual(pop, 1)

    actual = LabelledAlleles.founder_chromosome(founder, 0, 0,
                                                chromobj=template)
    whole_span = InheritanceSpan(founder, 0, 0, 0, ngenos)
    expected = LabelledAlleles(spans=[whole_span], chromobj=template)

    assert actual == expected
Beispiel #9
0
def test_labelledalleles():
    """
    Checks that a founder chromosome equals a LabelledAlleles holding a
    single InheritanceSpan from 0 to the number of markers.
    """
    marker_count = 50

    population = Population()
    chromosome = ChromosomeTemplate()
    added = 0
    while added < marker_count:
        chromosome.add_genotype()
        added += 1
    population.add_chromosome(chromosome)

    individual = Individual(population, 1)

    actual = LabelledAlleles.founder_chromosome(
        individual, 0, 0, chromobj=chromosome)
    expected = LabelledAlleles(
        spans=[InheritanceSpan(individual, 0, 0, 0, marker_count)],
        chromobj=chromosome)

    assert actual == expected
Beispiel #10
0
def test_chromosometemplate():
    """
    Exercises ChromosomeTemplate.closest_marker on 99 markers placed at
    bp = 1e6, 2e6, ..., 99e6.
    """
    # Test the marker finder
    template = ChromosomeTemplate()
    for position in range(1, 100):
        template.add_genotype(map_position=position, bp=(position * 1e6))

    # (queried position, expected marker index)
    cases = (
        (0, 0),
        (5000001, 4),
        (5999999, 5),
    )
    for query, expected_index in cases:
        assert template.closest_marker(query) == expected_index

    # A query far past the last marker maps to the final index
    assert template.closest_marker(1e10) == template.nmark() - 1
Beispiel #11
0
    def add_dummy_polygene_chromosomes(self,
                                       population,
                                       nloc,
                                       mean=0,
                                       sd=1,
                                       freqs=None,
                                       polylabel='Polygene'):
        """
        Creates many independently segregating chromosomes that 
        additively influence the trait.

        One single-marker ChromosomeTemplate is created per locus, added to
        the population, and registered on this object through add_effect.

        :param population: The population to add the chromosomes to
        :param nloc:       The number of dummy chromosomes to create
        :param mean:       Mean locus additive effect
        :param sd:         Standard deviation of locus additive effect.
            When 0, every locus gets exactly ``mean``; otherwise effects
            are drawn with np.random.normal
        :param freqs:      Frequencies of each polygene, indexed by locus.
            Defaults to 0.5 for every locus
        :param polylabel:  Prefix for each chromosome label; the locus
            index is appended to it

        :type nloc: integer
        :type mean: float
        :type sd: float
        :type freqs: sequence of floats
        :type polylabel: string

        :rtype: void
        """
        if freqs is None:
            # The np.float alias no longer exists in current NumPy;
            # np.full yields the same float64 array of 0.5s
            freqs = np.full(nloc, 0.5)

        if sd == 0:
            effects = [mean] * nloc
        else:
            effects = np.random.normal(mean, sd, nloc)

        for i, effect in enumerate(effects):
            # Create the chromosome
            lab = '{}{}'.format(polylabel, i)
            c = ChromosomeTemplate(label=lab)
            c.add_genotype(freqs[i], 0)
            population.add_chromosome(c)

            # Add the effect
            # NOTE(review): using i as the chromosome index presumably
            # assumes the population had no chromosomes beforehand -- confirm
            locus = i, 0
            self.add_effect(locus, a=effect, k=0)
Beispiel #12
0
    def add_dummy_polygene_chromosomes(self, population, nloc,
                                       mean=0,
                                       sd=1,
                                       freqs=None,
                                       polylabel='Polygene'):
        """
        Creates many independently segregating chromosomes that 
        additively influence the trait.

        For each locus a ChromosomeTemplate with a single marker is built,
        added to the population, and given an effect via add_effect.

        :param population: The population to add the chromosomes to
        :param nloc:       The number of dummy chromosomes to create
        :param mean:       Mean locus additive effect
        :param sd:         Standard deviation of locus additive effect.
            A value of 0 assigns ``mean`` to every locus without sampling;
            any other value samples with np.random.normal
        :param freqs:      Frequencies of each polygene, indexed by locus.
            When omitted, every locus uses 0.5
        :param polylabel:  Label prefix for the chromosomes; each label is
            the prefix followed by the locus index

        :type nloc: integer
        :type mean: float
        :type sd: float
        :type freqs: sequence of floats
        :type polylabel: string

        :rtype: void
        """
        if freqs is None:
            # np.float has been removed from NumPy; np.full builds the
            # same float64 array of 0.5s without it
            freqs = np.full(nloc, 0.5)

        if sd == 0:
            effects = [mean] * nloc
        else:
            effects = np.random.normal(mean, sd, nloc)

        for i, effect in enumerate(effects):
            # Create the chromosome
            lab = '{}{}'.format(polylabel, i)
            c = ChromosomeTemplate(label=lab)
            c.add_genotype(freqs[i], 0)
            population.add_chromosome(c)

            # Add the effect
            # NOTE(review): the locus uses i as chromosome index, which
            # presumably requires an initially empty population -- confirm
            locus = i, 0
            self.add_effect(locus, a=effect, k=0)
def test_chromosometemplate():
    """
    Checks closest_marker against a template whose 99 markers sit at
    bp = 1e6, 2e6, ..., 99e6.
    """
    # Test the marker finder
    chromosome = ChromosomeTemplate()
    for step in range(1, 100):
        chromosome.add_genotype(map_position=step, bp=(step * 1e6))

    # Queried position -> marker index the lookup must return
    expected_by_query = [(0, 0), (5000001, 4), (5999999, 5)]
    for query, expected in expected_by_query:
        found = chromosome.closest_marker(query)
        assert found == expected

    # Positions beyond the last marker resolve to the last index
    found_last = chromosome.closest_marker(1e10)
    assert found_last == chromosome.nmark() - 1
Beispiel #14
0
def read_map(mapfile):
    """
    Reads a PLINK map file into a ChromosomeSet of ChromosomeTemplate objects

    Each non-blank line must hold four whitespace-separated fields:
    chromosome, marker label, genetic position and base-pair position.
    A new ChromosomeTemplate is started whenever the chromosome field
    differs from the previous line's.

    :param mapfile: Path of the file to be read
    :type mapfile: string

    :returns: the finalized chromosomes in file order (an empty set when
        the file holds no markers)
    :rtype: ChromosomeSet

    :raises FileFormatError: if a base-pair position is negative, or
        decreases within a chromosome
    :raises ValueError: if a line does not have four fields or its
        positions cannot be converted to numbers
    """
    chroms = ChromosomeSet()
    chromosome = None
    last_chr, last_pos = None, 0
    with smartopen(mapfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            chrom, label, cm, pos = fields
            cm, pos = float(cm), int(pos)
            if pos < 0:
                raise FileFormatError('Invalid position: {}'.format(pos))
            if chrom != last_chr:
                # We've moved on to a new chromosome, or we've just
                # started. If there is a chromosome in progress, close it
                if chromosome is not None:
                    chromosome.finalize()
                    chroms.add_chromosome(chromosome)
                # Make the next chromosome
                chromosome = ChromosomeTemplate(label=chrom)
            elif pos < last_pos:
                raise FileFormatError('Map file not sorted')
            chromosome.add_genotype(None, cm, label=label, bp=pos)
            last_chr, last_pos = chrom, pos
    # An empty file never creates a chromosome, so there is nothing to close
    if chromosome is not None:
        chromosome.finalize()
        chroms.add_chromosome(chromosome)
    return chroms
Beispiel #15
0
def blank_chromosome(size=2):
    """
    Builds a ChromosomeTemplate and calls add_genotype() on it ``size``
    times with no arguments.

    :param size: number of markers to add
    :returns: the populated template; finalize() is not called here
    """
    template = ChromosomeTemplate()
    for _ in range(size):
        template.add_genotype()
    return template
Beispiel #16
0
def blank_chromosome(size=2):
    """
    Returns a new ChromosomeTemplate holding ``size`` markers, each added
    through a bare add_genotype() call.

    :param size: how many markers the template should receive
    :returns: the template, left un-finalized
    """
    chromosome = ChromosomeTemplate()
    for _marker in range(size):
        chromosome.add_genotype()
    return chromosome
Beispiel #17
0
def read_vcf(filename, require_pass=False, freq_info=None):
    """
    Reads a VCF file and returns a Population object with the
    individuals represented in the file
    
    Genotypes generated by this function will be sparse

    The header is consumed by _vcf_parseheader, which supplies the
    population and its individuals. A new ChromosomeTemplate is started
    each time the chromosome of a record differs from the previous
    accepted record's.

    :param filename: the VCF file to read (opened with smartopen)
    :param require_pass: only allow variants with PASS under FILTER
    :type require_pass: bool
    :param freq_info: INFO field to get allele frequency from; when None
        every marker gets frequency 0
    :type freq_info: string

    :returns: Individuals in the VCF
    :rtype: Population
    """
    with smartopen(filename) as f:

        genotypes = []

        pop, inds = _vcf_parseheader(f)

        last_chrom = None
        chromobj = None

        for line in f:
            record = VCFRecord(line)

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != last_chrom:
                # Hand the chromosome in progress (if any) to the population
                if chromobj is not None:
                    pop.add_chromosome(chromobj)
                chromobj = ChromosomeTemplate(label=record.chrom)

            if freq_info is not None:
                freq = _vcf_get_infofreq(record.info, freq_info)
            else:
                freq = 0

            genotypes.append(record.genotypes())

            chromobj.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)

            last_chrom = record.chrom

        # With no accepted records there is no chromosome to add
        if chromobj is not None:
            pop.add_chromosome(chromobj)
        pop.chromosomes.finalize()

    for ind in inds:
        # Initialize new genotypes
        ind._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals.
    # Rows were collected in file order, which is the order the
    # (chromosome index, marker index) pairs are enumerated in here
    final_indices = [(chromidx, markidx)
                     for chromidx, chromobj in enumerate(pop.chromosomes)
                     for markidx in range(chromobj.nmark())]

    for raw, (chromidx, markidx) in zip(range(len(genotypes)),
                                        final_indices):
        assign_genorow(genotypes[raw], inds, chromidx, markidx)

        # Kill the row so we don't end up with the whole dataset in memory twice
        genotypes[raw] = None

    return pop
Beispiel #18
0
def read_vcf(filename, require_pass=False, freq_info=None, info_filters=None):
    '''
    Reads a VCF file and returns a Population object with the
    individuals represented in the file

    Lines are skipped until the first line starting with a single '#';
    the fields after the ninth column of that line become the IDs of the
    individuals. Every following line is parsed as a VCFRecord.

    Arguments
    -----
    filename: The file to be read
    require_pass: If true, skip records whose filter_passed is false
    freq_info: Key in the record's INFO to take the allele frequency from.
        Only the first comma-separated value is used; markers get
        frequency 0 when the key is absent or freq_info is None
    info_filters: Optional sequence of callables, each called with a
        record; a record is kept only if all of them return a true value

    Returns: a Population holding the individuals and chromosomes read

    Raises: ValueError if any entry of info_filters is not callable
    '''
    if not info_filters:
        info_filters = []

    for info_filter in info_filters:
        if not callable(info_filter):
            raise ValueError('Filter not callable')

    with open(filename) as f:
        pop = Population()

        # Defined up front so a file without a header line or without
        # accepted records does not leave these names unbound
        inds = []
        chromobj = None
        last_chrom = None
        genotypes = []

        for line in f:

            if line.startswith('##'):
                continue

            elif line.startswith('#'):
                ind_ids = line.strip().split()[9:]
                inds = [Individual(pop, ind_id) for ind_id in ind_ids]
                for ind in inds:
                    pop.register_individual(ind)

                break

        # Continues from the line after the header
        for line in f:
            record = VCFRecord(line)

            if not all(info_filter(record) for info_filter in info_filters):
                continue

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != last_chrom:
                # Close the chromosome in progress (if any)
                if chromobj is not None:
                    chromobj.finalize()
                    pop.add_chromosome(chromobj)
                chromobj = ChromosomeTemplate(label=record.chrom)

            if freq_info is not None and freq_info in record.info:
                # Only the first of several comma-separated values is used
                freq = float(record.info[freq_info].split(',')[0])
            else:
                freq = 0

            genotypes.append(record.genotypes())

            chromobj.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)

            last_chrom = record.chrom

        # With no accepted records there is no chromosome to close
        if chromobj is not None:
            chromobj.finalize()
            pop.add_chromosome(chromobj)

    for ind in inds:
        # Initialize new genotypes
        ind._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals.
    # Rows were collected in file order, which is the order the
    # (chromosome index, marker index) pairs are enumerated in here
    final_indices = [(chromidx, markidx)
                     for chromidx, chromobj in enumerate(pop.chromosomes)
                     for markidx in range(chromobj.nmark())]

    for raw, (chromidx, markidx) in zip(range(len(genotypes)),
                                        final_indices):
        assign_genorow(genotypes[raw], inds, chromidx, markidx)

        # Kill the row so we don't end up with the whole dataset in memory twice
        genotypes[raw] = None

    return pop