Esempio n. 1
0
def read_map(mapfile):
    """
    Reads a PLINK map file into a list of ChromosomeTemplate objects

    Every non-blank line must hold exactly four whitespace-separated fields:
    chromosome, marker label, genetic position and physical position. A new
    ChromosomeTemplate is started whenever the chromosome field differs from
    the one on the previous line.

    Arguments:
    mapfile: The file to be read

    Returns: a list of ChromosomeTemplate objects (an empty list if the file
        contains no markers)

    Raises: FileFormatError if a physical position is negative, or if the
        positions within a chromosome are not in ascending order
    """
    last_chrom, last_pos = None, 0
    chroms = []
    chromosome = None
    with open(mapfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. trailing newlines) carry no marker
                continue
            chrom, label, cm, pos = fields
            cm, pos = float(cm), int(pos)
            if pos < 0:
                raise FileFormatError("Invalid position: {}".format(pos))
            if chromosome is None or chrom != last_chrom:
                # We've either just started or moved on to a new chromosome.
                # If there is a chromosome in progress, close it up first.
                if chromosome is not None:
                    chromosome.finalize()
                    chroms.append(chromosome)
                # Make the next chromosome
                chromosome = ChromosomeTemplate(label=chrom)
            elif pos < last_pos:
                raise FileFormatError("Map file not sorted")
            chromosome.add_genotype(None, cm, label=label, bp=pos)
            last_chrom, last_pos = chrom, pos
    # Close the chromosome still in progress; there is none for an empty file
    if chromosome is not None:
        chromosome.finalize()
        chroms.append(chromosome)
    return chroms
Esempio n. 2
0
def read_gs_chromosome_template(templatef):
    """
    Reads a genomeSIMLA format chromosome template file

    The first line of the file is used as the chromosome label and the
    second line is skipped. Every following non-blank line must hold five
    whitespace-separated fields, read as: marker label, major allele
    frequency, minor allele frequency, distance from the previous marker
    and physical position.

    Arguments
    ------
    templatef: The filename of the template file

    Returns: A ChromosomeTemplate object corresponding to the file
    """
    with open(templatef) as f:
        chrom_label = f.readline().strip()  # The label and
        f.readline()  # the number of markers, which we don't need.
        template = pydigree.ChromosomeTemplate(label=chrom_label)
        # genomeSIMLA chromosome files have marginal recombination probs
        # instead of map positions. We'll have to keep track of what the
        # last position was and add to it to get it into the shape we want
        # it to be in.
        map_position = 0
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank and whitespace-only lines (including '\r\n')
                continue
            # The major allele frequency is read but not used
            marker_label, _majf, minf, cm, bp = fields
            bp = int(bp)
            map_position += float(cm)
            template.add_genotype(float(minf), map_position,
                                  label=marker_label, bp=bp)
    return template
Esempio n. 3
0
def write_map(pedigrees, mapfile, output_chromosomes=None):
    """
    Writes the genotype location data to a PLINK MAP file

    One tab-separated line is written per marker: chromosome output label,
    marker label, genetic position and physical position.

    Arguments
    ------
    pedigrees: the population containing the data to be written
    mapfile: the name of the file to be output to
    output_chromosomes: if not None, only chromosomes whose outputlabel is
        contained in it are written

    Returns: Nothing
    """
    # A value of None means "write every chromosome"
    restrict = output_chromosomes is not None

    with open(mapfile, "w") as out:
        for chromosome in pedigrees.chromosomes:
            if restrict and chromosome.outputlabel not in output_chromosomes:
                continue
            markers = enumerate(chromosome._iinfo())
            for index, (label, cm, mb, _frequency) in markers:
                if not mb:
                    # No physical position: derive one from the genetic
                    # position.
                    # NOTE(review): 10e6 is 1e7, not 1e6 -- confirm that this
                    # is the intended scaling factor.
                    mb = int(cm * 10e6)
                if not label:
                    # Unlabelled markers get a generated name
                    label = "SNP%s-%s" % (chromosome.outputlabel, index)
                fields = (chromosome.outputlabel, label, cm, mb)
                out.write("\t".join(map(str, fields)) + "\n")
Esempio n. 4
0
def read_phenotypes(pedigrees, csvfile, delimiter=',', missingcode='X'):
    """
    Reads a csv with header
    famid,id,phen,phen,phen,phen etc etc

    The 'famid' and 'id' columns identify the individual as
    pedigrees[famid][id]; every other column is stored in that individual's
    ``phenotypes`` mapping under the column name. Values that parse as
    numbers are stored as floats, empty fields and fields equal to
    ``missingcode`` are stored as None, and anything else is kept as the
    original string. Blank lines are skipped.

    Arguments
    ------
    pedigrees:   An object of class PedigreeCollection
    csvfile:     the filename of the file containing phenotypes.
    delimiter:   the field delimiter for the file
    missingcode: the code for missing values

    Returns: Nothing
    """
    id_columns = ('famid', 'id')
    with open(csvfile) as f:
        header = f.readline().strip().split(delimiter)
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Match columns to their column name
            record = dict(zip(header, line.split(delimiter)))
            fam, ind = record['famid'], record['id']
            for name, raw in record.items():
                if name in id_columns:
                    continue
                # The missing code is checked on the raw text, before any
                # numeric conversion, so numeric codes such as '-9' work too
                if not raw or raw == missingcode:
                    value = None
                else:
                    try:
                        value = float(raw)
                    except ValueError:
                        # Non-numeric phenotypes are kept as strings
                        value = raw
                pedigrees[fam][ind].phenotypes[name] = value
Esempio n. 5
0
def write_pedigree(pedigrees, filename, missingcode='X', delim=' '):
    """
    Writes pedigree to a LINKAGE formatted pedigree file

    One line is written per individual, ordered by population label, depth
    and individual label. The columns are: population label, individual
    label, father label, mother label, sex and phenotype. Founders get '0'
    for both parents and the phenotype column is always '-9'.

    Sex is written with the same coding the PED reader/writer for this
    format use: 0 (male) -> '1', 1 (female) -> '2', anything else -> '0'.

    Arguments
    ------
    pedigrees:   the collection whose ``individuals`` are written
    filename:    the name of the file to be output to
    missingcode: currently unused
    delim:       the field separator

    Returns: Nothing
    """
    sex_codes = {0: '1', 1: '2'}

    def sorting_key(ind):
        return (ind.population.label, ind.depth, ind.label)

    with open(filename, 'w') as f:
        for ind in sorted(pedigrees.individuals, key=sorting_key):
            founder = ind.is_founder()
            fields = [ind.population.label,
                      ind.label,
                      '0' if founder else ind.father.label,
                      '0' if founder else ind.mother.label,
                      sex_codes.get(ind.sex, '0'),
                      '-9']
            # Labels are not guaranteed to be strings, so convert explicitly
            f.write(delim.join(str(field) for field in fields) + '\n')
Esempio n. 6
0
def write_phenotypes(pedigrees, filename, predicate=None,
                     missingcode='X', delim=','):
    """
    Writes phenotypes to a CSV (or other delimited) file

    The header row is 'famid', 'id' followed by the sorted names of every
    phenotype present on at least one selected individual. Each following
    row holds one individual; phenotypes that the individual lacks, or that
    are stored as None, are written as ``missingcode``.

    Arguments
    ------
    pedigrees:   the collection whose ``individuals`` are written
    filename:    the name of the file to be output to
    predicate:   if callable, only individuals for which it returns a true
        value are written; any other value selects everyone
    missingcode: the code written for missing values
    delim:       the field separator

    Returns: Nothing
    """
    # Materialise the selection: it is traversed twice below
    if callable(predicate):
        inds = [x for x in pedigrees.individuals if predicate(x)]
    else:
        inds = list(pedigrees.individuals)

    # Accumulating into a set copes with an empty selection, which a bare
    # reduce() over the per-individual sets would not.
    phenotype_names = set()
    for ind in inds:
        phenotype_names.update(ind.phenotypes.keys())
    available_phenotypes = sorted(phenotype_names)
    header = ['famid', 'id'] + available_phenotypes

    with open(filename, 'w') as ofile:
        ofile.write(delim.join([str(x) for x in header]) + '\n')
        for ind in inds:
            row = [ind.population.label, ind.label]
            for phenotype in available_phenotypes:
                value = ind.phenotypes.get(phenotype)
                row.append(missingcode if value is None else value)
            ofile.write(delim.join([str(x) for x in row]) + '\n')
Esempio n. 7
0
def write_ped(pedigrees, pedfile, delim=" ", predicate=None, output_chromosomes=None):
    """
    write_ped writes data in a plink-format PED file.

    Each line holds the six identifier columns (pedigree label, individual
    label, father label, mother label, sex, affection status) followed by
    the individual's genotypes, with the alleles of the two chromatids
    interleaved.

    Sex is written as 1 for 0 (male), 2 for 1 (female) and 0 for anything
    else. Affection status is written as 2 for 1 (affected), 1 for 0
    (unaffected) and -9 when it is None, missing or any other value.

    Arguments
    ------

    pedigrees: An object of class PedigreeCollection containing what you
        want to output
    pedfile: a string giving the name of the file to output to.
    delim: Field separator
    predicate: Which inputs to include in the output file. If not specified
        all are output. If the string is 'affected', only affected
        individuals are output. If the string is 'phenotyped', all individuals
        with phenotype information are output. Any other value of predicate
        must be a function to perform on the individual that evaluates to
        True/False for whether the individual should be output.
    output_chromosomes: if not None, only genotypes on chromosomes whose
        outputlabel is contained in it are written

    Returns: Nothing

    Raises: ValueError if predicate is neither a recognised string nor
        callable
    """

    # Check if we're only supposed to be outputting certain chromosomes
    checkchroms = output_chromosomes is not None

    if not predicate:
        def keep(ind):
            return True
    elif predicate == "affected":
        def keep(ind):
            return ind.phenotypes.get("affected") == 1
    elif predicate == "phenotyped":
        def keep(ind):
            return ind.phenotypes.get("affected") in (0, 1)
    elif callable(predicate):
        # The builtin is used because collections.Callable is not available
        # on current Python versions
        keep = predicate
    else:
        raise ValueError("Not a valid predicate!")

    afflab = {1: "2", 0: "1", None: "-9"}
    sexlab = {0: "1", 1: "2"}

    with open(pedfile, "w") as f:
        for pedigree in pedigrees.pedigrees:
            for ind in pedigree.individuals:
                if not keep(ind):
                    continue
                # Get the phenotype code; unknown statuses count as missing
                aff = afflab.get(ind.phenotypes.get("affected"), "-9")
                # Prepare the 6-column identifier
                outline = [
                    pedigree.label,
                    ind.label,
                    ind.father.label if ind.father is not None else "0",
                    ind.mother.label if ind.mother is not None else "0",
                    sexlab.get(ind.sex, "0"),
                    aff,
                ]
                # Make strings
                outline = [str(field) for field in outline]

                # Get the genotypes in the format we need them
                for template, chromatids in zip(ind.chromosomes, ind.genotypes):
                    if checkchroms and template.outputlabel not in output_chromosomes:
                        continue
                    chroma, chromb = chromatids
                    ga = chroma.astype(str).tolist()
                    gb = chromb.astype(str).tolist()
                    outline.extend(interleave(ga, gb))

                # Write it out
                f.write(delim.join(outline))
                f.write("\n")
Esempio n. 8
0
def read_ped(filename, population=None, delimiter=None, affected_labels=None,
             population_handler=None, data_handler=None, connect_inds=True,
             onlyinds=None):
    """
    Reads a plink format pedigree file, ie:
        familyid indid father mother sex whatever whatever whatever
    into a pydigree pedigree object.

    Every non-blank line needs at least the five columns family id,
    individual id, father, mother and sex. A sixth column, if present, is
    the affection status, and any further columns are handed to
    data_handler. Blank lines are skipped.

    Arguments
    -----
    filename: The file to be read
    population: Accepted for backward compatibility. A new Population is
        currently always created for the individuals while reading, and
        each individual's population is finally set to its pedigree.
    delimiter: a string defining the field separator, default: any whitespace
    affected_labels: The labels that determine affection status. Labels not
        found in it are read as an unknown status (None).
    population_handler: a function to set up the population. It is called
        with each Pedigree object created.
    data_handler: a function to turn the data into useful individual
        information. It is called with the individual and the list of
        fields after the sixth column.
    connect_inds: build references between individuals. Requires all
        individuals be present in the file
    onlyinds: a collection of (family id, individual id) pairs to be
        processed, allows skipping parts of a file

    Returns: An object of class PedigreeCollection

    Raises: ValueError if a non-blank line has fewer than five fields, and
        KeyError if a sex code is not recognised
    """
    sex_codes = {'1': 0, '2': 1, 'M': 0, 'F': 1, '0': None, '-9': None}
    if not affected_labels:
        affected_labels = {'1': 0, '2': 1,
                           'A': 1, 'U': 0,
                           'X': None,
                           '-9': None}

    # Tries to get a phenotype and returns unknown on failure
    def getph(ph):
        try:
            return affected_labels[ph]
        except KeyError:
            return None

    # NOTE(review): this replaces whatever was passed in as ``population``.
    # Kept as is to preserve existing behavior -- confirm it is intentional.
    population = Population()

    # Temporary pedigree holding every individual in the file
    p = Pedigree()
    if callable(population_handler):
        population_handler(p)

    pc = PedigreeCollection()

    with open(filename) as f:
        # Parse the lines in the file
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            split = line.split(delimiter)
            if len(split) < 5:
                # Without this check the fields of the previous line would
                # be silently reused for this one
                raise ValueError(
                    "Line {} of {} has {} fields, at least 5 required".format(
                        lineno, filename, len(split)))
            fam, ind_id, fa, mo, sex = split[0:5]
            aff = split[5] if len(split) > 5 else None
            # Give a special id for now, to prevent overwriting duplicated
            # ids between families
            key = (fam, ind_id)

            if onlyinds and (key not in onlyinds):
                continue

            p[key] = Individual(population, key, fa, mo, sex)
            ind = p[key]
            ind.phenotypes['affected'] = getph(aff)
            ind.pedigree = p
            ind.sex = sex_codes[ind.sex]

            if callable(data_handler) and len(split) > 6:
                data_handler(ind, split[6:])

    # Fix the individual-level data
    if connect_inds:
        for ind in p.individuals:
            fam = ind.label[0]
            # Actually make the references instead of just pointing at strings
            ind.father = p[(fam, ind.father)] if ind.father != '0' else None
            ind.mother = p[(fam, ind.mother)] if ind.mother != '0' else None

            ind.register_with_parents()

    # Place individuals into pedigrees, grouped by family id
    pedigrees = {}
    for ind in p.individuals:
        pedigrees.setdefault(ind.label[0], []).append(ind)

    for pedigree_label, ped_inds in list(pedigrees.items()):
        ped = Pedigree(label=pedigree_label)

        if callable(population_handler):
            population_handler(ped)

        for ind in ped_inds:
            # Drop the family part of the temporary (family, id) label
            ind.label = ind.label[1]
            ped[ind.label] = ind
            ind.population = ped
            ind.pedigree = ped
        pc[pedigree_label] = ped

    return pc