def test_smartopen():
    """
    Checks that smartopen reads plain and compressed fixtures as text

    Each fixture in ``test_data/compression`` is expected to contain the
    same three lines. Whatever the compression, the lines must come back
    as ``str`` objects with identical content.
    """
    from pydigree.io.smartopen import smartopen

    datadir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'test_data',
                           'compression')
    expected = ['genetics', 'pydigree', 'dna']

    # Plain text, gzip, bz2, xz and lzma, in that order. The bz2 case has to
    # open the .bz2 fixture: opening 'test.gz' again would leave the bz2
    # code path untested.
    # NOTE(review): assumes 'test.bz2' sits next to the other fixtures.
    fixtures = ['test', 'test.gz', 'test.bz2', 'test.xz', 'test.lzma']

    for fixture in fixtures:
        with smartopen(os.path.join(datadir, fixture)) as f:
            d = f.readlines()

        # The fixture name is the assertion message so a failure says which
        # format broke.
        assert all(isinstance(x, str) for x in d), fixture
        assert [x.strip() for x in d] == expected, fixture
def _writeibd(self, replicatenumber):
    # Warning: Don't call this function! If the individuals in the pedigree
    # dont have LABEL genotypes, you're just going to get IBS configurations
    # at each locus, not actual IBD calculations.
    #
    # If you have data you want to identify IBD segments in, check
    # pydigree.sgs
    #
    # Output: one line per pair of individuals (self-pairs included) within
    # each pedigree: pedigree label, both individual labels, then one sharing
    # value per marker, concatenated across chromosomes.
    outname = '{0}-{1}.ibd.gz'.format(self.label, replicatenumber + 1)
    with smartopen(outname, 'w') as of:
        for ped in self.template.pedigrees:
            for ind1, ind2 in combinations_with_replacement(ped.individuals, 2):
                states = []
                for chrom_idx in range(ind1.chromosomes.nchrom()):
                    if ind1 == ind2:
                        # Self comparison: 2 where the two alleles at a
                        # marker are equal, 0 otherwise
                        allele_pairs = zip(*ind1.genotypes[chrom_idx])
                        states += [2 * (a == b) for a, b in allele_pairs]
                    else:
                        left = zip(*ind1.genotypes[chrom_idx])
                        right = zip(*ind2.genotypes[chrom_idx])
                        states += [ibs(g1, g2) for g1, g2 in zip(left, right)]

                fields = [ped.label, ind1.label, ind2.label] + states
                of.write('{}\n'.format(' '.join(str(x) for x in fields)))
def write_phenotypes(pedigrees, filename, predicate=None,
                     missingcode='X', delim=','):
    """
    Writes phenotypes to a CSV (or other field delimited) file

    The header is ``famid``, ``id`` and then every phenotype name found on
    the selected individuals, in sorted order. ``famid`` is the label of the
    individual's population and ``id`` is the individual's label. A
    phenotype an individual does not have, or has stored as None, is written
    as ``missingcode``.

    If no individuals are selected, only the header (``famid`` and ``id``)
    is written.

    :param pedigrees: Data to write
    :param filename: filename to write to
    :param predicate: optional callable taking an individual; only
        individuals for which it returns a true value are written.
        Anything that is not callable is ignored.
    :param missingcode: code to use for missing values
    :param delim: output field separator

    :type missingcode: string
    :type delim: string
    """
    # Materialize the selection: it is iterated twice (once to collect the
    # phenotype names, once to write the rows).
    if callable(predicate):
        inds = [x for x in pedigrees.individuals if predicate(x)]
    else:
        inds = list(pedigrees.individuals)

    # set().union(*...) copes with an empty selection, unlike reduce() with
    # no initial value, which raises TypeError on an empty sequence.
    available_phenotypes = sorted(
        set().union(*(x.phenotypes.keys() for x in inds)))

    def field(ind, phenotype):
        # A phenotype stored as None is missing data, so it gets the
        # missing code rather than the literal text 'None'.
        value = ind.phenotypes.get(phenotype)
        return missingcode if value is None else value

    header = ['famid', 'id'] + available_phenotypes

    with smartopen(filename, 'w') as ofile:
        ofile.write(delim.join(str(x) for x in header) + '\n')
        for ind in inds:
            row = [ind.population.label, ind.label]
            row += [field(ind, phenotype)
                    for phenotype in available_phenotypes]
            ofile.write(delim.join(str(x) for x in row) + '\n')
def read_phenotypes(pedigrees, csvfile, delimiter=',', missingcode='X'):
    """
    Reads phenotypes from a delimited file onto existing individuals

    The first line of the file is a header naming each column. The columns
    named ``famid`` and ``id`` identify the individual, which is looked up
    as ``pedigrees[famid][id]``. Every other column is stored in that
    individual's ``phenotypes`` under the column's header name.

    Values that parse as floats are stored as floats. Empty values and
    values equal to ``missingcode`` are stored as None. Any other value is
    stored as the string read from the file.

    :param pedigrees: data to update
    :param csvfile: the filename of the file containing phenotypes.
    :param delimiter: the field delimiter for the file
    :param missingcode: the code for missing values

    :type pedigrees: PedigreeCollection
    :type csvfile: string
    :type missingcode: string

    :rtype: void
    """
    id_columns = ('famid', 'id')

    with smartopen(csvfile) as f:
        header = f.readline().strip().split(delimiter)

        for line in f:
            # Pair each value with the name of the column it sits in
            record = dict(zip(header, line.strip().split(delimiter)))

            for column, raw in record.items():
                if column in id_columns:
                    continue

                # Phenotypes are numeric wherever the text allows it
                try:
                    value = float(raw)
                except ValueError:
                    if not raw or raw == missingcode:
                        value = None
                    else:
                        value = raw

                fam, ind = record['famid'], record['id']
                pedigrees[fam][ind].phenotypes[column] = value
def write_map(pedigrees, mapfile, output_chromosomes=None):
    '''
    Writes the genotype location data to a PLINK MAP file

    One tab-separated line is written per marker: chromosome output label,
    marker label, centimorgan position, and base-pair position. A marker
    with no label is named ``SNP<chromosome>-<index>``. A marker with no
    base-pair position gets one derived from its centimorgan position.

    :param pedigrees: the population containing the data to be written
    :param mapfile: the name of the file to be output to
    :param output_chromosomes: which chromosomes to write (matched against
        each chromosome's output label); None writes them all

    Returns: Nothing
    '''
    # Only filter chromosomes when the caller actually asked for a subset
    restrict = output_chromosomes is not None

    with smartopen(mapfile, 'w') as f:
        for chrom in pedigrees.chromosomes:
            chromlab = chrom.outputlabel
            if restrict and chromlab not in output_chromosomes:
                continue

            for mi, (label, cm, mb, _) in enumerate(chrom.iterinfo()):
                if not mb:
                    # NOTE(review): 10e6 is 1e7, i.e. 10 Mb per cM. The usual
                    # rule of thumb is about 1 Mb per cM (1e6) -- confirm
                    # which was intended.
                    mb = int(cm * 10e6)
                if not label:
                    label = 'SNP%s-%s' % (chromlab, mi)

                fields = (chromlab, label, cm, mb)
                f.write('\t'.join(map(str, fields)) + '\n')
def read_gs_chromosome_template(templatef):
    """
    Reads a genomeSIMLA format chromosome template file

    The first line of the file is used as the chromosome label and the
    second line is skipped. Every following non-blank line must have five
    whitespace-separated fields. The first is the marker label, the second
    is ignored, the third is a float passed as the first argument to
    ``add_genotype``, the fourth is the distance from the previous marker,
    and the fifth is the integer base-pair position.

    :param templatef: The filename of the template file
    :type templatef: string

    :rtype: A ChromosomeTemplate object corresponding to the file
    """
    with smartopen(templatef) as f:
        label = f.readline().strip()  # The label and
        f.readline()  # the number of markers, both of which we dont need.
        c = pydigree.ChromosomeTemplate(label=label)

        # genomeSIMLA chromosome files have marginal recombination probs
        # instead of map positions. We'll have to keep track of what the
        # last position was and add to it to get it into the shape we want
        # it to be in.
        last_cm = 0
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines, including ones holding only whitespace,
                # which would otherwise fail to unpack below.
                continue

            marker_label, _, minf, cm, bp = fields
            bp = int(bp)
            last_cm += float(cm)
            c.add_genotype(float(minf), last_cm, label=marker_label, bp=bp)
    return c
def write_pedigree(pedigrees, filename, delim=' '):
    '''
    Writes pedigree to a LINKAGE formatted pedigree file

    One line is written per individual, ordered by population label, depth
    and individual label. The columns are: population label, individual
    label, father label, mother label (both '0' for founders), sex, and a
    constant '-9' in the last column.

    Sex is written in the LINKAGE convention: '1' for male, '2' for female
    and '0' for anything else.
    NOTE(review): this assumes individuals store sex as 0 = male and
    1 = female, which is the coding write_ped relies on -- confirm.

    :param pedigrees: Data to write
    :param filename: filename to write to
    :param delim: output field separator

    :rtype: void
    '''
    # Internal sex coding -> LINKAGE sex column
    sex_codes = {0: '1', 1: '2'}

    def sorting_key(ind):
        return (ind.population.label, ind.depth, ind.label)

    with smartopen(filename, 'w') as f:
        for ind in sorted(pedigrees.individuals, key=sorting_key):
            founder = ind.is_founder()
            oline = [
                ind.population.label,
                ind.label,
                '0' if founder else ind.father.label,
                '0' if founder else ind.mother.label,
                sex_codes.get(ind.sex, '0'),
                '-9',
            ]
            # Labels are not guaranteed to be strings, so convert before
            # joining (str.join raises TypeError on non-strings).
            f.write(delim.join(str(x) for x in oline) + '\n')
def write_pedigree(pedigrees, filename, delim=' '):
    '''
    Writes pedigree to a LINKAGE formatted pedigree file

    One line is written per individual, ordered by population label, depth
    and individual label. The columns are: population label, individual
    label, father label, mother label (both '0' for founders), sex, and a
    constant '-9' in the last column.

    Sex is written in the LINKAGE convention: '1' for male, '2' for female
    and '0' for anything else.
    NOTE(review): this assumes individuals store sex as 0 = male and
    1 = female, which is the coding write_ped relies on -- confirm.

    :param pedigrees: Data to write
    :param filename: filename to write to
    :param delim: output field separator

    :rtype: void
    '''
    # Internal sex coding -> LINKAGE sex column
    sex_codes = {0: '1', 1: '2'}

    def sorting_key(ind):
        return (ind.population.label, ind.depth, ind.label)

    with smartopen(filename, 'w') as f:
        for ind in sorted(pedigrees.individuals, key=sorting_key):
            founder = ind.is_founder()
            oline = [
                ind.population.label,
                ind.label,
                '0' if founder else ind.father.label,
                '0' if founder else ind.mother.label,
                sex_codes.get(ind.sex, '0'),
                '-9',
            ]
            # Labels are not guaranteed to be strings, so convert before
            # joining (str.join raises TypeError on non-strings).
            f.write(delim.join(str(x) for x in oline) + '\n')
def _writeibd(self, replicatenumber):
    # Warning: Don't call this function! If the individuals in the pedigree
    # dont have LABEL genotypes, you're just going to get IBS configurations
    # at each locus, not actual IBD calculations.
    #
    # If you have data you want to identify IBD segments in, check
    # pydigree.sgs
    #
    # Writes one space-separated line for every pair of individuals in a
    # pedigree (each individual is also paired with itself): the pedigree
    # label, the two individual labels, and a sharing value for each marker
    # on each chromosome.
    target = '{0}-{1}.ibd.gz'.format(self.label, replicatenumber + 1)
    with smartopen(target, 'w') as of:
        for ped in self.template.pedigrees:
            pairs = combinations_with_replacement(ped.individuals, 2)
            for ind1, ind2 in pairs:
                sharing = []
                for chrom_idx in range(ind1.chromosomes.nchrom()):
                    if ind1 == ind2:
                        # Compare the individual's own two chromatids
                        own = zip(*ind1.genotypes[chrom_idx])
                        per_marker = [2 * (x == y) for x, y in own]
                    else:
                        first = zip(*ind1.genotypes[chrom_idx])
                        second = zip(*ind2.genotypes[chrom_idx])
                        per_marker = [ibs(g1, g2)
                                      for g1, g2 in zip(first, second)]
                    sharing.extend(per_marker)

                record = [ped.label, ind1.label, ind2.label] + sharing
                text = ' '.join(map(str, record))
                of.write('{}\n'.format(text))
def write_phenotypes(pedigrees, filename, predicate=None,
                     missingcode='X', delim=','):
    """
    Writes phenotypes to a CSV (or other field delimited) file

    The header is ``famid``, ``id`` and then every phenotype name found on
    the selected individuals, in sorted order. ``famid`` is the label of the
    individual's population and ``id`` is the individual's label. A
    phenotype an individual does not have, or has stored as None, is written
    as ``missingcode``.

    If no individuals are selected, only the header (``famid`` and ``id``)
    is written.

    :param pedigrees: Data to write
    :param filename: filename to write to
    :param predicate: optional callable taking an individual; only
        individuals for which it returns a true value are written.
        Anything that is not callable is ignored.
    :param missingcode: code to use for missing values
    :param delim: output field separator

    :type missingcode: string
    :type delim: string
    """
    # Materialize the selection: it is iterated twice (once to collect the
    # phenotype names, once to write the rows).
    if callable(predicate):
        inds = [x for x in pedigrees.individuals if predicate(x)]
    else:
        inds = list(pedigrees.individuals)

    # set().union(*...) copes with an empty selection, unlike reduce() with
    # no initial value, which raises TypeError on an empty sequence.
    available_phenotypes = sorted(
        set().union(*(x.phenotypes.keys() for x in inds)))

    def field(ind, phenotype):
        # A phenotype stored as None is missing data, so it gets the
        # missing code rather than the literal text 'None'.
        value = ind.phenotypes.get(phenotype)
        return missingcode if value is None else value

    header = ['famid', 'id'] + available_phenotypes

    with smartopen(filename, 'w') as ofile:
        ofile.write(delim.join(str(x) for x in header) + '\n')
        for ind in inds:
            row = [ind.population.label, ind.label]
            row += [field(ind, phenotype)
                    for phenotype in available_phenotypes]
            ofile.write(delim.join(str(x) for x in row) + '\n')
def read_map(mapfile):
    """
    Reads a PLINK map file into a set of ChromosomeTemplate objects

    Each non-blank line must have four whitespace-separated fields:
    chromosome, marker label, centimorgan position and base-pair position.
    A new ChromosomeTemplate is started whenever the chromosome field
    differs from the previous line's. Each template is finalized before it
    is added to the result. An empty file gives an empty ChromosomeSet.

    :param mapfile: Path of the file to be read
    :type mapfile: string

    :raises FileFormatError: if a position is negative, or if positions
        within a chromosome are not in non-decreasing order

    :rtype: a ChromosomeSet holding one ChromosomeTemplate per chromosome
    """
    last_chr, last_pos = None, 0
    chroms = ChromosomeSet()
    chromosome = None

    with smartopen(mapfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                continue

            chrom, label, cm, pos = fields
            cm, pos = float(cm), int(pos)

            if pos < 0:
                raise FileFormatError('Invalid position: {}'.format(pos))

            if chromosome is None or chrom != last_chr:
                # We've moved on to a new chromosome, or we've just
                # started. If we haven't just started, close up the old
                # one first.
                if chromosome is not None:
                    chromosome.finalize()
                    chroms.add_chromosome(chromosome)
                # Make the next chromosome
                chromosome = ChromosomeTemplate(label=chrom)
            elif pos < last_pos:
                raise FileFormatError('Map file not sorted')

            chromosome.add_genotype(None, cm, label=label, bp=pos)
            last_chr, last_pos = chrom, pos

    # Close up the final chromosome. There is none if the file had no
    # marker lines, in which case the empty set is returned.
    if chromosome is not None:
        chromosome.finalize()
        chroms.add_chromosome(chromosome)
    return chroms
def read_ped(filename, population=None, delimiter=None, affected_labels=None,
             population_handler=None, data_handler=None, connect_inds=True,
             onlyinds=None):
    """
    Reads a plink format pedigree file, ie:

    ::
        familyid indid father mother sex whatever whatever whatever

    into a pydigree pedigree object, with optional population to assign to
    pedigree members. If you don't provide a population you can't simulate
    genotypes!

    :param filename: The file to be read
    :param population: The population to assign individuals to. A new
        Population is created when None.
    :param delimiter: a string defining the field separator,
        default: any whitespace
    :param affected_labels: The labels that determine affection status.
        Labels not found in the mapping give an affection status of None.
    :param population_handler: a function to set up the population
    :param data_handler: a function to turn the data into useful individual
        information. Called as ``data_handler(individual, data)`` for every
        record that has data.
    :param connect_inds: build references between individuals. Requires all
        individuals be present in the file
    :param onlyinds: only include data for specified individuals. None or an
        empty iterable includes everyone.

    :type filename: string
    :type population: Population
    :type delimiter: string
    :type affected_labels: dict (str -> value)
    :type population_handler: callable
    :type data_handler: callable
    :type connect_inds: bool
    :type onlyinds: iterable

    :returns: individuals contained in the pedigree file
    :rtype: PedigreeCollection
    """
    if not affected_labels:
        affected_labels = {'1': 0, '2': 1,
                           'A': 1, 'U': 0,
                           'X': None,
                           '-9': None}

    # Anything that isn't callable is replaced with a do-nothing handler
    if not callable(data_handler):
        data_handler = lambda *x: None

    if not callable(population_handler):
        population_handler = lambda *x: None

    # Materialize the id filter once. Membership is tested for every line of
    # the file, which would exhaust a one-shot iterable (e.g. a generator)
    # after the first few tests and is a linear scan for a list.
    if onlyinds is not None:
        onlyinds = frozenset(onlyinds)

    population = Population() if population is None else population
    p = Pedigree()
    population_handler(p)

    # Step 1: Read the data and create the individuals
    with smartopen(filename) as f:
        # Parse the lines in the file
        for line in f:
            rec = PEDRecord(line, delimiter)

            if onlyinds and (rec.ind_id not in onlyinds):
                continue

            ind = rec.create_individual(population)
            ind.pedigree = p
            ind.phenotypes['affected'] = affected_labels.get(rec.aff, None)
            p[ind.label] = ind

            if rec.data:
                data_handler(p[ind.label], rec.data)

    # Step 2: Create the between-individual relationships

    # Fix the individual-level data: individuals currently only have
    # parent-ids in their parent fields and not references to actual
    # individuals
    if connect_inds:
        connect_individuals(p)

    # Step 3: Separate the individuals into pedigrees
    pc = sort_pedigrees(p.individuals, population_handler)

    return pc
def write_ped(pedigrees, pedfile, delim=' ', predicate=None,
              output_chromosomes=None):
    """
    Writes data to a plink-format PED file

    Each line holds the six identifier columns (pedigree label, individual
    label, father label, mother label, sex, affection status) followed by
    the individual's genotypes, with the two alleles of every marker
    interleaved. Missing parents are written as '0'. Sex is written as 1 for
    sex == 0, 2 for sex == 1, and 0 for any other value. Affection status
    is written as '2' (affected == 1), '1' (affected == 0) or '-9'
    (affected is None).

    :param pedigrees: An object of class PedigreeCollection containing what
        you want to output
    :param pedfile: a string giving the name of the file to output to.
    :param delim: Field separator
    :param predicate: Which inputs to include in the output file. If not
        specified all are output. If the string is 'affected', only
        affected individuals are output. If the string is 'phenotyped', all
        individuals with phenotype information are output. Any other value
        of predicate must be a function to perform on the individual that
        evaluates to True/False for whether the individual should be output.
    :param output_chromosomes: container of chromosome output labels. If
        given, only genotypes on those chromosomes are written. None writes
        every chromosome.

    :raises ValueError: if predicate is not one of the accepted values, or
        if an individual's genotypes are stored as SparseAlleles

    Returns: Nothing
    """
    # Check if we're only supposed to be outputting certain chromosomes
    checkchroms = output_chromosomes is not None

    if not predicate:
        predicate = lambda x: True
    elif predicate == 'affected':
        predicate = lambda x: x.phenotypes['affected'] == 1
    elif predicate == 'phenotyped':
        predicate = lambda x: x.phenotypes['affected'] in {0, 1}
    elif not callable(predicate):
        # The builtin callable() is used because the collections.Callable
        # alias no longer exists on Python 3.10+.
        raise ValueError('Not a valid predicate!')

    pheno_label = {1: '2', 0: '1', None: '-9'}

    # Internal sex coding -> plink sex column; anything else is unknown (0)
    sex_label = {0: 1, 1: 2}

    def getlab(ind, default):
        """
        Gets the label of an individual, or returns the default value if
        ind is None
        """
        return ind.label if ind is not None else default

    with smartopen(pedfile, 'w') as f:
        for pedigree in pedigrees.pedigrees:
            for ind in pedigree.individuals:
                if not predicate(ind):
                    continue

                # Prepare the 6-column identifier
                outline = [pedigree.label,
                           ind.label,
                           getlab(ind.father, '0'),
                           getlab(ind.mother, '0'),
                           sex_label.get(ind.sex, 0),
                           pheno_label[ind.phenotypes['affected']]]

                # Make strings
                outline = [str(x) for x in outline]

                # Get the genotypes in the format we need them
                for template, chromatids in zip(ind.chromosomes,
                                                ind.genotypes):
                    if (checkchroms and
                            template.outputlabel not in output_chromosomes):
                        continue

                    chroma, chromb = chromatids
                    if isinstance(chroma, SparseAlleles):
                        raise ValueError("Plink output not for Sparse Data")

                    ga = chroma.astype(str).tolist()
                    gb = chromb.astype(str).tolist()
                    outline.extend(interleave(ga, gb))

                # Write it out
                f.write(delim.join(outline))
                f.write('\n')
def read_ped(filename, population=None, delimiter=None, affected_labels=None,
             population_handler=None, data_handler=None, connect_inds=True,
             onlyinds=None):
    """
    Reads a plink format pedigree file, ie:

    ::
        familyid indid father mother sex whatever whatever whatever

    into a pydigree pedigree object, with optional population to assign to
    pedigree members. If you don't provide a population you can't simulate
    genotypes!

    :param filename: The file to be read
    :param population: The population to assign individuals to. A new
        Population is created when None.
    :param delimiter: a string defining the field separator,
        default: any whitespace
    :param affected_labels: The labels that determine affection status.
        Labels not found in the mapping give an affection status of None.
    :param population_handler: a function to set up the population
    :param data_handler: a function to turn the data into useful individual
        information. Called as ``data_handler(individual, data)`` for every
        record that has data.
    :param connect_inds: build references between individuals. Requires all
        individuals be present in the file
    :param onlyinds: only include data for specified individuals. None or an
        empty iterable includes everyone.

    :type filename: string
    :type population: Population
    :type delimiter: string
    :type affected_labels: dict (str -> value)
    :type population_handler: callable
    :type data_handler: callable
    :type connect_inds: bool
    :type onlyinds: iterable

    :returns: individuals contained in the pedigree file
    :rtype: PedigreeCollection
    """
    if not affected_labels:
        affected_labels = {'1': 0, '2': 1,
                           'A': 1, 'U': 0,
                           'X': None,
                           '-9': None}

    # Anything that isn't callable is replaced with a do-nothing handler
    if not callable(data_handler):
        data_handler = lambda *x: None

    if not callable(population_handler):
        population_handler = lambda *x: None

    # Materialize the id filter once. Membership is tested for every line of
    # the file, which would exhaust a one-shot iterable (e.g. a generator)
    # after the first few tests and is a linear scan for a list.
    if onlyinds is not None:
        onlyinds = frozenset(onlyinds)

    population = Population() if population is None else population
    p = Pedigree()
    population_handler(p)

    # Step 1: Read the data and create the individuals
    with smartopen(filename) as f:
        # Parse the lines in the file
        for line in f:
            rec = PEDRecord(line, delimiter)

            if onlyinds and (rec.ind_id not in onlyinds):
                continue

            ind = rec.create_individual(population)
            ind.pedigree = p
            ind.phenotypes['affected'] = affected_labels.get(rec.aff, None)
            p[ind.label] = ind

            if rec.data:
                data_handler(p[ind.label], rec.data)

    # Step 2: Create the between-individual relationships

    # Fix the individual-level data: individuals currently only have
    # parent-ids in their parent fields and not references to actual
    # individuals
    if connect_inds:
        connect_individuals(p)

    # Step 3: Separate the individuals into pedigrees
    pc = sort_pedigrees(p.individuals, population_handler)

    return pc
def write_ped(pedigrees, pedfile, delim=' ', predicate=None,
              output_chromosomes=None):
    """
    Writes data to a plink-format PED file

    Each line holds the six identifier columns (pedigree label, individual
    label, father label, mother label, sex, affection status) followed by
    the individual's genotypes, with the two alleles of every marker
    interleaved. Missing parents are written as '0'. Sex is written as 1 for
    sex == 0, 2 for sex == 1, and 0 for any other value. Affection status
    is written as '2' (affected == 1), '1' (affected == 0) or '-9'
    (affected is None).

    :param pedigrees: An object of class PedigreeCollection containing what
        you want to output
    :param pedfile: a string giving the name of the file to output to.
    :param delim: Field separator
    :param predicate: Which inputs to include in the output file. If not
        specified all are output. If the string is 'affected', only
        affected individuals are output. If the string is 'phenotyped', all
        individuals with phenotype information are output. Any other value
        of predicate must be a function to perform on the individual that
        evaluates to True/False for whether the individual should be output.
    :param output_chromosomes: container of chromosome output labels. If
        given, only genotypes on those chromosomes are written. None writes
        every chromosome.

    :raises ValueError: if predicate is not one of the accepted values, or
        if an individual's genotypes are stored as SparseAlleles

    Returns: Nothing
    """
    # Check if we're only supposed to be outputting certain chromosomes
    checkchroms = output_chromosomes is not None

    if not predicate:
        predicate = lambda x: True
    elif predicate == 'affected':
        predicate = lambda x: x.phenotypes['affected'] == 1
    elif predicate == 'phenotyped':
        predicate = lambda x: x.phenotypes['affected'] in {0, 1}
    elif not callable(predicate):
        # The builtin callable() is used because the collections.Callable
        # alias no longer exists on Python 3.10+.
        raise ValueError('Not a valid predicate!')

    pheno_label = {1: '2', 0: '1', None: '-9'}

    # Internal sex coding -> plink sex column; anything else is unknown (0)
    sex_label = {0: 1, 1: 2}

    def getlab(ind, default):
        """
        Gets the label of an individual, or returns the default value if
        ind is None
        """
        return ind.label if ind is not None else default

    with smartopen(pedfile, 'w') as f:
        for pedigree in pedigrees.pedigrees:
            for ind in pedigree.individuals:
                if not predicate(ind):
                    continue

                # Prepare the 6-column identifier
                outline = [pedigree.label,
                           ind.label,
                           getlab(ind.father, '0'),
                           getlab(ind.mother, '0'),
                           sex_label.get(ind.sex, 0),
                           pheno_label[ind.phenotypes['affected']]]

                # Make strings
                outline = [str(x) for x in outline]

                # Get the genotypes in the format we need them
                for template, chromatids in zip(ind.chromosomes,
                                                ind.genotypes):
                    if (checkchroms and
                            template.outputlabel not in output_chromosomes):
                        continue

                    chroma, chromb = chromatids
                    if isinstance(chroma, SparseAlleles):
                        raise ValueError("Plink output not for Sparse Data")

                    ga = chroma.astype(str).tolist()
                    gb = chromb.astype(str).tolist()
                    outline.extend(interleave(ga, gb))

                # Write it out
                f.write(delim.join(outline))
                f.write('\n')