コード例 #1
0
ファイル: beagle.py プロジェクト: jameshicks/pydigree
def read_beagle_markerfile(filename, label=None):
    """
    Reads marker locations from a BEAGLE formatted file

    Every line of the file is parsed as a BeagleMarkerRecord and added, in
    file order, to a single ChromosomeTemplate. Marker positions must be
    non-negative and strictly increasing from one line to the next.

    :param filename: The file to be read
    :param label: An optional label to give the chromosome, since the BEAGLE
        format does not require it

    :type filename: string

    :raises FileFormatError: if a marker has a negative position, or if a
        marker's position is not greater than the previous marker's

    :rtype: ChromosomeTemplate
    """
    with smartopen(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # Start below any acceptable position so the first marker always
        # passes the ordering check.
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)

            if rec.pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(rec.pos))
            if rec.pos <= last_pos:
                # Duplicate positions are rejected as well as decreasing ones
                raise FileFormatError('Markers in file out of order')

            chrom.add_genotype(None, map_position=None, label=rec.label,
                               bp=rec.pos, reference=rec.reference,
                               alternates=rec.alternates)
            last_pos = rec.pos

    return chrom
コード例 #2
0
def read_beagle_markerfile(filename, label=None):
    """
    Reads marker locations from a BEAGLE formatted file

    Every line of the file is parsed as a BeagleMarkerRecord and added, in
    file order, to a single ChromosomeTemplate. Marker positions must be
    non-negative and strictly increasing from one line to the next.

    :param filename: The file to be read
    :param label: An optional label to give the chromosome, since the BEAGLE
        format does not require it

    :type filename: string

    :raises FileFormatError: if a marker has a negative position, or if a
        marker's position is not greater than the previous marker's

    :rtype: ChromosomeTemplate
    """
    with smartopen(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # Start below any acceptable position so the first marker always
        # passes the ordering check.
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)

            if rec.pos < 0:
                raise FileFormatError('Bad position for genotype: {}'.format(
                    rec.pos))
            if rec.pos <= last_pos:
                # Duplicate positions are rejected as well as decreasing ones
                raise FileFormatError('Markers in file out of order')

            chrom.add_genotype(None,
                               map_position=None,
                               label=rec.label,
                               bp=rec.pos,
                               reference=rec.reference,
                               alternates=rec.alternates)
            last_pos = rec.pos

    return chrom
コード例 #3
0
ファイル: sgs.py プロジェクト: y-chai/pydigree
def write_sgs(data, filename):
    """
    Writes every segment in ``data.segments`` as one tab-delimited line,
    following the GERMLINE column layout:

        0) Family ID 1
        1) Individual ID 1
        2) Family ID 2
        3) Individual ID 2
        4) Chromosome
        5) Segment start (bp/cM)
        6) Segment end (bp/cM)
        7) Segment start (SNP)
        8) Segment end (SNP)
        9) Total SNPs in segment
        10) Genetic length of segment
        11) Units for genetic length (cM or MB)
        12) Mismatching SNPs in segment
        13) 1 if Individual 1 is homozygous in match; 0 otherwise
        14) 1 if Individual 2 is homozygous in match; 0 otherwise

    The length is always written in megabases with the unit 'MB', and the
    last three columns are written as the placeholder 'X'.

    :param data: object whose ``segments`` attribute is iterated
    :param filename: path of the file to write
    """
    with smartopen(filename, 'w') as out:
        for seg in data.segments:
            # Read every attribute up front, in a fixed order, before
            # assembling the output row.
            first_label = seg.ind1.full_label
            second_label = seg.ind2.full_label
            chrom_label = seg.chromosome.label
            location = seg.physical_location
            marker_labels = seg.marker_labels
            marker_count = seg.nmark
            megabases = seg.physical_size / 1e6  # basepairs -> megabases

            fields = []
            fields.extend(first_label)
            fields.extend(second_label)
            fields.append(chrom_label)
            fields.extend(location)
            fields.extend(marker_labels)
            fields.append(marker_count)
            fields.append(megabases)
            fields.append('MB')
            # GERMLINE's extra columns (e.g. mismatch rate) are not computed
            fields.extend(('X', 'X', 'X'))

            out.write('\t'.join(map(str, fields)))
            out.write('\n')
コード例 #4
0
ファイル: kinship.py プロジェクト: y-chai/pydigree
def read_kinship(filename):
    '''
    Loads kinship and inbreeding coefficients from a KinInbCoef formatted file

    Each line is split on whitespace into exactly four fields: family ID,
    first individual ID, second individual ID and the coefficient.

    :param filename: the filename to be read
    :type filename: string

    :returns: a dictionary mapping
        frozenset({(fam, ind_a), (fam, ind_b)}) to the coefficient as a float
    :rtype: dict
    '''
    coefficients = {}
    with smartopen(filename) as handle:
        for record in handle:
            family, first, second, value = record.strip().split()
            coefficient = float(value)
            # A frozenset key makes the lookup independent of pair order;
            # when both IDs are equal the key collapses to a single member.
            pair = frozenset([(family, first), (family, second)])
            coefficients[pair] = coefficient
    return coefficients
コード例 #5
0
ファイル: kinship.py プロジェクト: jameshicks/pydigree
def read_kinship(filename):
    '''
    Loads kinship and inbreeding coefficients from a KinInbCoef formatted file

    Each line is split on whitespace into exactly four fields: family ID,
    first individual ID, second individual ID and the coefficient.

    :param filename: the filename to be read
    :type filename: string

    :returns: a dictionary mapping
        frozenset({(fam, ind_a), (fam, ind_b)}) to the coefficient as a float
    :rtype: dict
    '''
    coefficients = {}
    with smartopen(filename) as handle:
        for record in handle:
            family, first, second, value = record.strip().split()
            coefficient = float(value)
            # A frozenset key makes the lookup independent of pair order;
            # when both IDs are equal the key collapses to a single member.
            pair = frozenset([(family, first), (family, second)])
            coefficients[pair] = coefficient
    return coefficients
コード例 #6
0
def read_beagle_genotypefile(filename, pop, missingcode='0'):
    '''
    Reads a BEAGLE formatted genotype file

    The file is read in two passes. The first pass consumes the leading
    records: an 'I' record creates the individuals, and phenotype records
    fill in ``ind.phenotypes``. The second pass rereads the file from the
    start and collects the genotypes from every line beginning with 'M'.

    :param filename: Filename of BEAGLE genotype file
    :param pop: the population to add these individuals to
    :param missingcode: The value that indicates a missing genotype

    :type missingcode: string
    :rtype: None
    '''
    with smartopen(filename) as handle:
        # Pass 1: individuals and phenotypes
        for raw in handle:
            record = BeagleGenotypeRecord(raw)

            if record.identifier == 'I':
                # Only every other entry of the data is used as a label
                inds = [Individual(pop, lab) for lab in record.data[::2]]
                continue

            if not record.is_phenotype_record:
                # First genotype record reached: header section is over
                break

            for ind, value in zip(inds, record.data[::2]):
                if record.identifier == 'A':
                    # 'A' records become booleans: True only for '2'
                    value = (value == '2')
                else:
                    # Other phenotypes are numeric when they parse as a
                    # float, and are kept as the raw string otherwise
                    try:
                        value = float(value)
                    except ValueError:
                        pass
                ind.phenotypes[record.label] = value

        # Pass 2: marker rows, each grouped into per-individual allele pairs
        handle.seek(0)
        marker_rows = []
        for raw in handle:
            if raw.startswith('M'):
                pairs = grouper(BeagleGenotypeRecord(raw).data, 2)
                marker_rows.append(list(pairs))

        # Transpose from one row per marker to one row per individual
        per_individual = zip(*marker_rows)
        for ind, alleles in zip(inds, per_individual):
            ind.genotypes = gt_from_seq(ind.chromosomes,
                                        alleles,
                                        missing_code=missingcode)
コード例 #7
0
ファイル: beagle.py プロジェクト: jameshicks/pydigree
def read_beagle_genotypefile(filename, pop, missingcode='0'):
    '''
    Reads a BEAGLE formatted genotype file

    The file is read in two passes. The first pass consumes the leading
    records: an 'I' record creates the individuals, and phenotype records
    fill in ``ind.phenotypes``. The second pass rereads the file from the
    start and collects the genotypes from every line beginning with 'M'.

    :param filename: Filename of BEAGLE genotype file
    :param pop: the population to add these individuals to
    :param missingcode: The value that indicates a missing genotype

    :type missingcode: string
    :rtype: None
    '''
    with smartopen(filename) as handle:
        # Pass 1: individuals and phenotypes
        for raw in handle:
            record = BeagleGenotypeRecord(raw)

            if record.identifier == 'I':
                # Only every other entry of the data is used as a label
                inds = [Individual(pop, lab) for lab in record.data[::2]]
                continue

            if not record.is_phenotype_record:
                # First genotype record reached: header section is over
                break

            for ind, value in zip(inds, record.data[::2]):
                if record.identifier == 'A':
                    # 'A' records become booleans: True only for '2'
                    value = (value == '2')
                else:
                    # Other phenotypes are numeric when they parse as a
                    # float, and are kept as the raw string otherwise
                    try:
                        value = float(value)
                    except ValueError:
                        pass
                ind.phenotypes[record.label] = value

        # Pass 2: marker rows, each grouped into per-individual allele pairs
        handle.seek(0)
        marker_rows = []
        for raw in handle:
            if raw.startswith('M'):
                pairs = grouper(BeagleGenotypeRecord(raw).data, 2)
                marker_rows.append(list(pairs))

        # Transpose from one row per marker to one row per individual
        per_individual = zip(*marker_rows)
        for ind, alleles in zip(inds, per_individual):
            ind.genotypes = gt_from_seq(ind.chromosomes,
                                        alleles,
                                        missing_code=missingcode)
コード例 #8
0
ファイル: sgs.py プロジェクト: y-chai/pydigree
def read_germline(filename):
    '''
    Builds an SGSAnalysis object from a GERMLINE formatted SGS file

    GERMLINE files are text files with the format:

        0) Family ID 1
        1) Individual ID 1
        2) Family ID 2
        3) Individual ID 2
        4) Chromosome
        5) Segment start (bp/cM)
        6) Segment end (bp/cM)
        7) Segment start (SNP)
        8) Segment end (SNP)
        9) Total SNPs in segment
        10) Length of segment
        11) Units for genetic length (cM or MB)
        12) Mismatching SNPs in segment
        13) 1 if Individual 1 is homozygous in match; 0 otherwise
        14) 1 if Individual 2 is homozygous in match; 0 otherwise

    This function only uses 0-6.

    :param filename: path of the GERMLINE file to read
    :returns: the analysis, with one SGS entry per pair of individuals
    :rtype: SGSAnalysis
    '''
    analysis = SGSAnalysis()
    with smartopen(filename) as handle:
        for raw in handle:
            record = GermlineRecord(raw)
            pair = record.pair

            # Create the container for this pair the first time it is seen
            if pair not in analysis:
                analysis[pair] = SGS(record.ind1, record.ind2)

            # The location is only kept as a physical location when the
            # record reports base-pair coordinates
            if record.bp_locations:
                location = record.location
            else:
                location = None

            segment = Segment(record.ind1, record.ind2, record.chromosome,
                              None, None, physical_location=location)
            analysis[pair].append(segment)
    return analysis
コード例 #9
0
    def from_file(filename):
        """
        Reads a trait from a file

        The first line is split on whitespace into the trait type and the
        trait name. Every following line describes one locus effect and is
        split on whitespace into six fields: chromosome, location, two
        fields that are ignored, then the ``a`` and ``k`` values handed to
        ``add_effect``.

        :param filename: path to file
        :type filename: string

        :raises NotImplementedError: if an effect line does not have
            exactly six fields

        :rtype: QuantitativeTrait
        """
        with smartopen(filename) as f:
            trait_type, name = f.readline().strip().split()
            trait = QuantitativeTrait(trait_type, name)
            for line in f:
                fields = line.strip().split()

                # NOTE(review): the guard used to require 5 fields while the
                # unpacking below needs 6, so no effect line could ever be
                # read. The guard now matches the unpacking; confirm the
                # column count against the trait file format.
                if len(fields) != 6:
                    # TODO: implement epistatic effects in file
                    raise NotImplementedError(
                        'Epistatic effects not yet implemented')
                chrom, loc, _, _, a, k = fields
                locus = chrom, loc
                trait.add_effect(locus, a, k)

        return trait
コード例 #10
0
ファイル: trait.py プロジェクト: jameshicks/pydigree
    def from_file(filename):
        """
        Reads a trait from a file

        The first line is split on whitespace into the trait type and the
        trait name. Every following line describes one locus effect and is
        split on whitespace into six fields: chromosome, location, two
        fields that are ignored, then the ``a`` and ``k`` values handed to
        ``add_effect``.

        :param filename: path to file
        :type filename: string

        :raises NotImplementedError: if an effect line does not have
            exactly six fields

        :rtype: QuantitativeTrait
        """
        with smartopen(filename) as f:
            trait_type, name = f.readline().strip().split()
            trait = QuantitativeTrait(trait_type, name)
            for line in f:
                fields = line.strip().split()

                # NOTE(review): the guard used to require 5 fields while the
                # unpacking below needs 6, so no effect line could ever be
                # read. The guard now matches the unpacking; confirm the
                # column count against the trait file format.
                if len(fields) != 6:
                    # TODO: implement epistatic effects in file
                    raise NotImplementedError(
                        'Epistatic effects not yet implemented')
                chrom, loc, _, _, a, k = fields
                locus = chrom, loc
                trait.add_effect(locus, a, k)

        return trait
コード例 #11
0
ファイル: vcf.py プロジェクト: y-chai/pydigree
def read_vcf(filename, require_pass=False, freq_info=None):
    """
    Builds a Population from the individuals and variants in a VCF file

    Genotypes generated by this function will be sparse

    A new ChromosomeTemplate is started whenever the chromosome of a record
    differs from that of the previous kept record. Marker frequencies are
    taken from the INFO field named by ``freq_info``, or set to 0 when no
    field is given.

    :param filename: path of the VCF file to read
    :param require_pass: only allow variants with PASS under FILTER
    :type require_pass: bool
    :param freq_info: INFO field to get allele frequency from
    :type freq_info: string

    :returns: Individuals in the VCF
    :rtype: Population
    """
    genotypes = []

    with smartopen(filename) as handle:
        pop, inds = _vcf_parseheader(handle)

        template = None
        previous_chrom = None

        for raw in handle:
            record = VCFRecord(raw)

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != previous_chrom:
                # Chromosome changed: store the finished template (if any)
                # and start a fresh one
                if previous_chrom is not None:
                    pop.add_chromosome(template)
                template = ChromosomeTemplate(label=record.chrom)

            if freq_info is None:
                freq = 0
            else:
                freq = _vcf_get_infofreq(record.info, freq_info)

            genotypes.append(record.genotypes())
            template.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)

            previous_chrom = record.chrom

        # The last template is still open when the file ends
        pop.add_chromosome(template)
        pop.chromosomes.finalize()

    # Initialize new genotypes
    for ind in inds:
        ind._init_genotypes(sparse=True)

    # Map each stored row, in file order, to its (chromosome, marker) index
    final_indices = []
    for chromidx, chromobj in enumerate(pop.chromosomes):
        marker_count = chromobj.nmark()
        final_indices.extend(
            (chromidx, markidx) for markidx in range(marker_count))

    # Now actually sift through markers and assign them to individuals
    row_numbers = range(len(genotypes))
    for raw_idx, (chromidx, markidx) in zip(row_numbers, final_indices):
        assign_genorow(genotypes[raw_idx], inds, chromidx, markidx)

        # Drop the row so the whole dataset is not held in memory twice
        genotypes[raw_idx] = None

    return pop
コード例 #12
0
ファイル: test_segments.py プロジェクト: y-chai/pydigree
import pydigree as pyd
from pydigree.io import smartopen
from pydigree.sgs.sgs import intervals_to_array
from pydigree.ibs import ibs

# NOTE(review): `sys` and `np` are used below but are not imported in the
# visible part of this script; importing them here is harmless if they are
# already imported elsewhere in the file.
import sys

import numpy as np

# Command-line arguments: replicate identifier and the seed size for SGS
replicate = sys.argv[1]
ms = int(sys.argv[2])
prefix='null'

# Load the pedigrees for this replicate and run SGS on pedigree '1'
peds = pyd.io.plink.read_plink('{}-{}.ped'.format(prefix, replicate), '{}.map'.format(prefix))
ped = peds['1']
s = pyd.sgs.sgs_population(ped, seed_size=ms)


# Read the IBD states for every pair: the first three whitespace-separated
# fields are family and the two IDs, the remainder is a run of integers
with smartopen('{}-{}.ibd.gz'.format(prefix, replicate)) as f:
    trueibd = {}
    for line in f:
        fam, id1, id2, ibd_states = line.strip().split(None, 3)
        trueibd[frozenset({id1,id2})] = np.array([int(x) for x in ibd_states.split()])

# Segments found for individuals '7' and '8' on the first chromosome,
# alongside the states read from the file for the same pair
a = intervals_to_array(s[frozenset({ped['7'],ped['8']})][0], ped.chromosomes[0].nmark())
b = trueibd[frozenset({'7','8'})]


# Per-marker ibs() result between the two individuals on the first chromosome
genos1 = zip(*ped['7'].genotypes[0])
genos2 = zip(*ped['8'].genotypes[0])
identical = [ibs(x,y) for x,y in zip(genos1, genos2)]

from pydigree.common import table, runs