def read_npz(in_file):
    '''Read problem from NPZ file. in_file may be a file name or an open file descriptor.

    The archive must contain the pedigree_*, genotype_*, haplotype_*, error, info, frames
    and lam entries read below. genotype_map, haplotype_poo_phase and haplotype_hap_type are
    optional and only applied when present.

    Returns a Problem built from the loaded pedigree, genotype and haplotype objects.
    '''
    # NOTE(review): 'pedigree_graph', 'info' and 'frames' are indexed with [0] and used as
    # Python objects, so they look like pickled object arrays; recent NumPy versions refuse
    # to load those unless allow_pickle=True is passed to np.load -- confirm before upgrading.
    files = np.load(in_file)

    # Pedigree: rebuild the directed graph, then attach the per-node metadata arrays
    graph = nx.DiGraph()
    graph.add_nodes_from(files['pedigree_nodes'])
    graph.add_edges_from(files['pedigree_graph'][0])
    p = Pedigree(graph,
                 sample_id=files['pedigree_sample_id'],
                 sex=files['pedigree_sex'],
                 phenotype=files['pedigree_phenotype'],
                 node_type=files['pedigree_node_type'],
                 sample_index=files['pedigree_sample_index'],
                 num_genotyped=files['pedigree_num_genotyped'][0])

    # Genotypes and haplotypes
    g = GenotypeFactory.new_instance('genotype', files['genotype_data'], files['genotype_snp'])
    h = GenotypeFactory.new_instance('haplotype', files['haplotype_data'], files['haplotype_snp'],
                                     qc=MISSING)
    error = files['error']
    # The haplotype is created with qc=MISSING and the stored QC array is assigned afterwards
    h.qc = files['haplotype_qc']
    info = files['info'][0]

    # Read the frames entry once; every access to an NPZ archive member re-reads it from the file.
    # items() (rather than the Python-2-only iteritems()) yields the same (key, value) pairs.
    frame_dict = files['frames'][0]
    frames = Frames((k, w) for k, v in frame_dict.items() for w in v[0]) if frame_dict else None
    lam = files['lam']

    # Optional fields
    if 'genotype_map' in files.files:
        g.map = files['genotype_map']
    if 'haplotype_poo_phase' in files.files:
        h.poo_phase = files['haplotype_poo_phase']
    if 'haplotype_hap_type' in files.files:
        h.hap_type = files['haplotype_hap_type']

    return Problem(p, g, haplotype=h, error=error, info=info, frames=frames, lam=lam)
def test_create_from_mock_data(self):
    '''Build a genotype set from inline mock SNP records and genotype data, then check its
    size and the results of segment_intersect() on several base-pair intervals.'''
    # Mock SNP metadata: four records with base-pair positions 12, 34, 56 and 78
    field_names = ('chrom', 'snp', 'dist_cm', 'base_pair')
    field_formats = ('i2', 'S12', 'i8', 'i8')
    records = [(0, 'rs1', 0., 12),
               (0, 'rs2', 0., 34),
               (0, 'rs3', 0., 56),
               (0, 'rs4', 0., 78)]
    snp = np.array(records, dtype={'names': field_names, 'formats': field_formats})

    # Mock genotype data of shape (4, 1, 2)
    sample_id = [126251, 111161]
    data = np.array([[[1, 2]], [[2, 2]], [[1, 2]], [[1, 1]]])
    g = GenotypeFactory.new_instance('genotype', data, snp, sample_id)

    itu.assert_size_equals(g, 4, 1)
    assert_equal(4, g.num_snps, 'Incorrect number of SNPS')

    message = 'Wrong interval intersection'
    # This first check passes the computed value as the first argument
    assert_equal(g.segment_intersect([0, 40]), [0, 2], message)
    # The remaining checks pass the expected value first: (interval, expected index range)
    cases = (
        ([10, 40], [0, 2]),
        ([10, 60], [0, 3]),
        ([20, 60], [1, 3]),
        ([0, 100], [0, 4]),
        ([20, 100], [1, 4]),
    )
    for segment, expected in cases:
        assert_equal(expected, g.segment_intersect(segment), message)
def read_tabix(file_name, genotyped_id_file=None):
    '''Read a Haplotype object from an ITABIX CGI-imputed file.

    Line format: tab-delimited
    7849538 chr11 1909005 1909006 snp T C dbsnp.107:rs3817198 <genotypes>

    file_name         -- path of the file to load; it is read into memory in full.
    genotyped_id_file -- file passed to read_sample_id() to obtain the sample IDs. When
                         omitted, defaults to $OBER_DATA/hutt/hutt.3chipoverlap.clean.fam.
                         The environment variable is looked up at call time so that merely
                         importing this module does not require OBER_DATA to be set.

    Returns the object built by GenotypeFactory.new_instance('haplotype', ...).

    Raises KeyError if genotyped_id_file is omitted and OBER_DATA is not set.
    '''
    if genotyped_id_file is None:
        # Resolve the default before loading anything, so a missing variable fails fast
        genotyped_id_file = os.environ['OBER_DATA'] + '/hutt/hutt.3chipoverlap.clean.fam'

    # Load entire file into memory. It must fit, if we are to load it into a Genotype object.
    # NOTE(review): for a single-line file loadtxt returns a 1-D array, so the loops below
    # would iterate over fields rather than rows -- confirm whether that case can occur.
    d = np.loadtxt(file_name, str)

    # Read SNP metadata into a record array
    snp_dtype = [
        ('chrom', np.uint8),      # Chromosome # containing the SNP
        ('name', np.chararray),   # SNP name (e.g., 'rs...')
        ('dist_cm', float),       # Genetic position [CENTI-Morgans!!]; np.float was an alias of float
        ('base_pair', np.uint)    # Base pair position on chromosome
    ]
    # Column 1 is 'chr<N>', column 7 the SNP name, column 3 the base-pair position.
    # The genetic distance is not available in this format and is filled with 0.
    snp = np.array([(int(line[1][3:]), line[7], 0, int(line[3])) for line in d], dtype=snp_dtype)

    # Columns 8.. hold one entry per sample: character 0 is the haplotype type digit,
    # characters 1 and 2 are the two allele letters
    data = np.array([[(CGI_LETTER_TO_ALLELE[x[1]], CGI_LETTER_TO_ALLELE[x[2]]) for x in line[8:]]
                     for line in d])
    hap_type = np.array([[int(x[0]) for x in line[8:]] for line in d])
    sample_id = read_sample_id(genotyped_id_file)

    # Construct object
    return GenotypeFactory.new_instance('haplotype', data, snp, sample_id, hap_type=hap_type)
def read(self, clazz, **kwargs):
    """Load genotype data from a PLINK TPED (and optionally TFAM) file.

    If prefix is specified, will use prefix.tfam, prefix.tped input file names, unless tfam
    and/or tped are specified (with or without the prefix argument), in which case they
    override the prefix-based names.

    Keyword arguments:
        prefix    -- common PLINK file prefix (default None).
        tped      -- TPED file name; defaults to prefix + ".tped".
        tfam      -- TFAM file name; defaults to prefix + ".tfam". Only consulted when
                     load_ids is true.
        load_ids  -- if true (default), read the sample IDs from the second TFAM column.
        lazy_load -- if true, hand the TPED file name (instead of the loaded genotype
                     array) to the factory (default False).

    Returns the object built by GenotypeFactory.new_instance(clazz, ...).

    Raises ValueError if no TPED name can be determined, or if IDs are requested and no
    TFAM name can be determined.
    """
    # Read input arguments
    prefix = kwargs.get("prefix", None)
    load_ids = kwargs.get("load_ids", True)
    tped = kwargs.get("tped", None if prefix is None else (prefix + ".tped"))
    if tped is None:
        raise ValueError("Must specify plink file prefix and/or tped file name")
    tfam = None
    if load_ids:
        tfam = kwargs.get("tfam", None if prefix is None else (prefix + ".tfam"))
        if tfam is None:
            raise ValueError("If loading IDs, must specify plink file prefix and/or tfam file name")
    # lazily-load data or not fetch all of it
    lazy_load = kwargs.get("lazy_load", False)

    # Read TPED file in two sweeps.
    # See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#map
    # Read the first line in the file to determine the number of samples
    with open(tped, "r") as f:
        reader = csv.reader(f, delimiter=" ", skipinitialspace=True)
        # next() works on both Python 2.6+ and 3, unlike reader.next()
        line = next(reader)
        if line[-1] == "":
            # Trim last item in field list of this line if it is blank
            line = line[:-1]
        num_items = len(line)

    # Read SNP metadata into a record array
    snp_dtype = [
        ("chrom", np.uint8),      # Chromosome # containing the SNP
        ("name", np.chararray),   # SNP name (e.g., 'rs...')
        ("dist_cm", float),       # Genetic position [CENTI-Morgans!!]; np.float was an alias of float
        ("base_pair", np.uint),   # Base pair position on chromosome
    ]
    snp = np.loadtxt(tped, usecols=list(range(4)), dtype=snp_dtype)
    # Fix the special case of a single row, where loadtxt does not return a 1-D record array
    if snp.size == 1:
        snp = np.array([tuple(snp[key] for key, _ in snp_dtype)], dtype=snp_dtype)

    # Read Genotype data
    if lazy_load:
        # Only pass pointer to file, to be read into a data structure that supports lazy loading
        data = tped
    else:
        # Read Genotype data into array
        data = np.genfromtxt(tped, usecols=list(range(4, num_items)), dtype=np.byte)
        # Allele columns come in pairs; fold them into (snp, sample, 2). Floor division keeps
        # the dimensions integral regardless of the interpreter's division semantics.
        if np.size(snp) == 1:
            # A single SNP row is returned as a 1-D array
            data = data.reshape([1, data.shape[0] // 2, 2])
        else:
            data = data.reshape([data.shape[0], data.shape[1] // 2, 2])

    # Load TFAM data, use only study IDs (np.int was an alias of the builtin int)
    sample_id = np.genfromtxt(tfam, dtype=int)[:, 1] if load_ids else None

    # Construct object
    return GenotypeFactory.new_instance(clazz, data, snp, sample_id, lazy_load=lazy_load)
def iplex_to_genotype(path, t, suffix='_iPlex_HUTTERITE.txt'):
    '''Convert an iPlex data into a Genotype object that matches the ordering of SNPs and samples
    in an ImputationSet object t.

    path, suffix -- forwarded to __read_iplex() to load the iPlex data.
    t            -- ImputationSet providing the SNP metadata, pedigree, genotype letter map and
                    the shape of the output data array (t.imputed_data).

    Returns a new 'genotype' object whose data array is zero except for the rows of the iPlex
    SNPs. Side effect: when an allele swap is detected, the corresponding entry of
    t.genotype.map is overwritten in place; the returned object shares that same map.

    Progress information is printed to standard output.
    '''
    # Read iPlex data into a dictionary of dictionaries
    iplex = __read_iplex(path, suffix)

    # Match SNPs by name. This is a slow (m x n) operation where m=#iplex SNPs, n=#imputation_set SNPs,
    # but speed is not an issue here since m, n are in the tens to hundreds.
    # list(...) gives an indexable snapshot of the keys on both Python 2 and 3.
    iplex_snps = list(iplex)
    print(iplex_snps)
    snp_metadata = t.snp['name']
    # index of each IPLEX SNP in imputation_set: first entry whose name contains the iPlex name
    # (raises IndexError if there is no match)
    index = [np.where([iplex_snp in x for x in snp_metadata])[0][0] for iplex_snp in iplex_snps]

    # Match genotype IDs of each iplex SNP and imputation_set
    s = t.pedigree.genotyped_sample_id()
    data = np.zeros_like(t.imputed_data)
    print(t.genotype.map)
    num_snps = len(iplex_snps)
    print('num_snps %s' % (num_snps,))
    for i, (snp, snp_index) in enumerate(zip(iplex_snps, index)):
        iplex_data = iplex[snp]
        values = np.array(list(iplex_data.values()))
        letter = t.genotype.map[snp_index]
        print('#%2d iPlex SNP %-14s index in t %3d %s/%s' % (i, snp, snp_index, letter[0], letter[1]))

        # Reverse strand if needed
        if values[0][0] not in letter:
            print('Reversing strand letters of iPlex data (Dakota labeling uses standard dbSNP letters)')
            letter = ''.join(REVERSE_STRAND[x] for x in letter)
        allele = __allele_dict(letter)

        # Keep only the iPlex samples that are genotyped in t and recode their letters
        iplex_id = np.array(list(iplex_data.keys()))
        samples_in_t = np.in1d(iplex_id, s)
        sample_index = [t.pedigree.node_of[x] for x in iplex_id[samples_in_t]]
        recoded_genotypes = np.array([[allele[v[0]], allele[v[1]]] for v in values[samples_in_t]])

        f1, f2 = im.gt.allele_frequencies(recoded_genotypes)
        if f2 > 0.7:
            # Minor allele has a much larger frequency than the major allele, swap them
            print('Major-minor allele CGI letter swap detected, fixing (f1=%.2f, f2=%.2f)' % (f1, f2))
            im.gt.swap_alleles(recoded_genotypes)
            t.genotype.map[snp_index] = letter[1] + letter[0]

        # Insert data into the appropriate row of the target genotype object data array
        data[snp_index, sample_index, :] = recoded_genotypes

    g = GenotypeFactory.new_instance('genotype', data, t.snp, t.genotype.sample_id)
    g.map = t.genotype.map
    return g
def iplex_to_genotype(path, t, suffix='_iPlex_HUTTERITE.txt'):
    '''Convert an iPlex data into a Genotype object that matches the ordering of SNPs and samples
    in an ImputationSet object t.

    path, suffix -- forwarded to __read_iplex() to load the iPlex data.
    t            -- ImputationSet providing the SNP metadata, pedigree, genotype letter map and
                    the shape of the output data array (t.imputed_data).

    Returns a new 'genotype' object whose data array is zero except for the rows of the iPlex
    SNPs. Side effect: when an allele swap is detected, the corresponding entry of
    t.genotype.map is overwritten in place; the returned object shares that same map.

    Progress information is printed to standard output.
    '''
    # Read iPlex data into a dictionary of dictionaries
    iplex = __read_iplex(path, suffix)

    # Match SNPs by name. This is a slow (m x n) operation where m=#iplex SNPs, n=#imputation_set SNPs,
    # but speed is not an issue here since m, n are in the tens to hundreds.
    # Materialize the keys so they can be indexed and zipped on both Python 2 and 3.
    iplex_snps = list(iplex)
    print(iplex_snps)
    snp_metadata = t.snp['name']
    # index of each IPLEX SNP in imputation_set: first entry whose name contains the iPlex name
    # (raises IndexError if there is no match)
    index = [np.where([iplex_snp in x for x in snp_metadata])[0][0] for iplex_snp in iplex_snps]

    # Match genotype IDs of each iplex SNP and imputation_set
    s = t.pedigree.genotyped_sample_id()
    data = np.zeros_like(t.imputed_data)
    print(t.genotype.map)
    num_snps = len(iplex_snps)
    print('num_snps %s' % (num_snps,))
    for i, (snp, snp_index) in enumerate(zip(iplex_snps, index)):
        iplex_data = iplex[snp]
        values = np.array(list(iplex_data.values()))
        letter = t.genotype.map[snp_index]
        print('#%2d iPlex SNP %-14s index in t %3d %s/%s' % (i, snp, snp_index, letter[0], letter[1]))

        # Reverse strand if needed
        if values[0][0] not in letter:
            print('Reversing strand letters of iPlex data (Dakota labeling uses standard dbSNP letters)')
            letter = ''.join(REVERSE_STRAND[x] for x in letter)
        allele = __allele_dict(letter)

        # Restrict to the iPlex samples that are genotyped in t and recode their letters
        iplex_id = np.array(list(iplex_data.keys()))
        samples_in_t = np.in1d(iplex_id, s)
        sample_index = [t.pedigree.node_of[x] for x in iplex_id[samples_in_t]]
        recoded_genotypes = np.array([[allele[v[0]], allele[v[1]]] for v in values[samples_in_t]])

        f1, f2 = im.gt.allele_frequencies(recoded_genotypes)
        if f2 > 0.7:
            # Minor allele has a much larger frequency than the major allele, swap them
            print('Major-minor allele CGI letter swap detected, fixing (f1=%.2f, f2=%.2f)' % (f1, f2))
            im.gt.swap_alleles(recoded_genotypes)
            t.genotype.map[snp_index] = letter[1] + letter[0]

        # Insert data into the appropriate row of the target genotype object data array
        data[snp_index, sample_index, :] = recoded_genotypes

    g = GenotypeFactory.new_instance('genotype', data, t.snp, t.genotype.sample_id)
    g.map = t.genotype.map
    return g
def read(input_type, clazz, **kwargs):
    """Read a Genotype object of class clazz ('genotype'/'haplotype'/'problem') from file of the
    format 'input_type'. Supported formats: input_type='plink' (PLINK format); 'npz' (our NPZ
    format).

    For 'npz', the archive is taken from the 'file' keyword argument. For 'plink', all keyword
    arguments are forwarded to the PLINK reader.

    Raises ValueError for any other input_type.
    """
    if input_type == "npz":
        data = np.load(kwargs.get("file"))
        g = GenotypeFactory.new_instance(clazz, data["data"], data["snp"], sample_id=data["sample_id"])
        # If there exists a genetic map, load it. If not, don't. For backward-compatibility with older
        # Genotype npz files that didn't have the map yet
        if "map" in data.files:
            g.map = data["map"]
        if "poo_phase" in data.files:
            g.poo_phase = data["poo_phase"]
        return g
    elif input_type == "plink":
        return _plink_reader.read(clazz, **kwargs)
    else:
        # Report the offending argument (the original formatted the builtin `input` by mistake)
        raise ValueError("Unsupported genotype input type %s" % (input_type,))
def test_create_from_mock_data(self):
    '''Build a genotype set from inline mock SNP records and genotype data, then check its
    size and the results of segment_intersect() on several base-pair intervals.'''
    # Mock SNP metadata: four records with base-pair positions 12, 34, 56 and 78
    snp_dtype = {
        'names': ('chrom', 'snp', 'dist_cm', 'base_pair'),
        'formats': ('i2', 'S12', 'i8', 'i8'),
    }
    snp_records = [
        (0, 'rs1', 0., 12),
        (0, 'rs2', 0., 34),
        (0, 'rs3', 0., 56),
        (0, 'rs4', 0., 78),
    ]
    snp = np.array(snp_records, dtype=snp_dtype)

    # Mock genotype data of shape (4, 1, 2)
    sample_id = [126251, 111161]
    data = np.array([[[1, 2]], [[2, 2]], [[1, 2]], [[1, 1]]])
    g = GenotypeFactory.new_instance('genotype', data, snp, sample_id)

    itu.assert_size_equals(g, 4, 1)
    assert_equal(4, g.num_snps, 'Incorrect number of SNPS')

    failure_message = 'Wrong interval intersection'
    # This first check passes the computed value as the first argument
    assert_equal(g.segment_intersect([0, 40]), [0, 2], failure_message)
    # The remaining checks pass the expected value first: (expected index range, interval)
    expectations = [
        ([0, 2], [10, 40]),
        ([0, 3], [10, 60]),
        ([1, 3], [20, 60]),
        ([0, 4], [0, 100]),
        ([1, 4], [20, 100]),
    ]
    for expected, interval in expectations:
        assert_equal(expected, g.segment_intersect(interval), failure_message)
def read(input_type, clazz, **kwargs):
    '''Read a Genotype object of class clazz ('genotype'/'haplotype'/'problem') from file of the
    format 'input_type'. Supported formats: input_type='plink' (PLINK format); 'npz' (our NPZ
    format).

    For 'npz', the archive is taken from the 'file' keyword argument. For 'plink', all keyword
    arguments are forwarded to the PLINK reader.

    Raises ValueError for any other input_type.
    '''
    if input_type == 'npz':
        data = np.load(kwargs.get('file'))
        g = GenotypeFactory.new_instance(clazz, data['data'], data['snp'], sample_id=data['sample_id'])
        # If there exists a genetic map, load it. If not, don't. For backward-compatibility with older
        # Genotype npz files that didn't have the map yet
        if 'map' in data.files:
            g.map = data['map']
        if 'poo_phase' in data.files:
            g.poo_phase = data['poo_phase']
        return g
    elif input_type == 'plink':
        return _plink_reader.read(clazz, **kwargs)
    else:
        # Report the offending argument (the original formatted the builtin `input` by mistake)
        raise ValueError('Unsupported genotype input type %s' % (input_type,))
def read_tabix(file_name, genotyped_id_file=None):
    """Read a Haplotype object from an ITABIX CGI-imputed file.

    Line format: tab-delimited
    7849538 chr11 1909005 1909006 snp T C dbsnp.107:rs3817198 <genotypes>

    file_name         -- path of the file to load; it is read into memory in full.
    genotyped_id_file -- file passed to read_sample_id() to obtain the sample IDs. When
                         omitted, defaults to $OBER_DATA/hutt/hutt.3chipoverlap.clean.fam.
                         The environment variable is looked up at call time so that merely
                         importing this module does not require OBER_DATA to be set.

    Returns the object built by GenotypeFactory.new_instance("haplotype", ...).

    Raises KeyError if genotyped_id_file is omitted and OBER_DATA is not set.
    """
    if genotyped_id_file is None:
        # Resolve the default before loading anything, so a missing variable fails fast
        genotyped_id_file = os.environ["OBER_DATA"] + "/hutt/hutt.3chipoverlap.clean.fam"

    # Load entire file into memory. It must fit, if we are to load it into a Genotype object.
    # NOTE(review): for a single-line file loadtxt returns a 1-D array, so the loops below
    # would iterate over fields rather than rows -- confirm whether that case can occur.
    d = np.loadtxt(file_name, str)

    # Read SNP metadata into a record array
    snp_dtype = [
        ("chrom", np.uint8),      # Chromosome # containing the SNP
        ("name", np.chararray),   # SNP name (e.g., 'rs...')
        ("dist_cm", float),       # Genetic position [CENTI-Morgans!!]; np.float was an alias of float
        ("base_pair", np.uint),   # Base pair position on chromosome
    ]
    # Column 1 is 'chr<N>', column 7 the SNP name, column 3 the base-pair position.
    # The genetic distance is not available in this format and is filled with 0.
    snp = np.array([(int(line[1][3:]), line[7], 0, int(line[3])) for line in d], dtype=snp_dtype)

    # Columns 8.. hold one entry per sample: character 0 is the haplotype type digit,
    # characters 1 and 2 are the two allele letters
    data = np.array([[(CGI_LETTER_TO_ALLELE[x[1]], CGI_LETTER_TO_ALLELE[x[2]]) for x in line[8:]]
                     for line in d])
    hap_type = np.array([[int(x[0]) for x in line[8:]] for line in d])
    sample_id = read_sample_id(genotyped_id_file)

    # Construct object
    return GenotypeFactory.new_instance("haplotype", data, snp, sample_id, hap_type=hap_type)
def read(self, clazz, **kwargs):
    '''Load genotype data from a PLINK TPED (and optionally TFAM) file.

    If prefix is specified, will use prefix.tfam, prefix.tped input file names, unless tfam
    and/or tped are specified (with or without the prefix argument), in which case they
    override the prefix-based names.

    Keyword arguments:
        prefix    -- common PLINK file prefix (default None).
        tped      -- TPED file name; defaults to prefix + '.tped'.
        tfam      -- TFAM file name; defaults to prefix + '.tfam'. Only consulted when
                     load_ids is true.
        load_ids  -- if true (default), read the sample IDs from the second TFAM column.
        lazy_load -- if true, hand the TPED file name (instead of the loaded genotype
                     array) to the factory (default False).

    Returns the object built by GenotypeFactory.new_instance(clazz, ...).

    Raises ValueError if no TPED name can be determined, or if IDs are requested and no
    TFAM name can be determined.
    '''
    # Read input arguments
    prefix = kwargs.get('prefix', None)
    load_ids = kwargs.get('load_ids', True)
    tped = kwargs.get('tped', None if prefix is None else (prefix + '.tped'))
    if tped is None:
        raise ValueError('Must specify plink file prefix and/or tped file name')
    tfam = None
    if load_ids:
        tfam = kwargs.get('tfam', None if prefix is None else (prefix + '.tfam'))
        if tfam is None:
            raise ValueError('If loading IDs, must specify plink file prefix and/or tfam file name')
    # lazily-load data or not fetch all of it
    lazy_load = kwargs.get('lazy_load', False)

    # Read TPED file in two sweeps.
    # See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#map
    # Read the first line in the file to determine the number of samples
    with open(tped, 'r') as f:
        reader = csv.reader(f, delimiter=' ', skipinitialspace=True)
        # next() works on both Python 2.6+ and 3, unlike reader.next()
        line = next(reader)
        if line[-1] == '':
            # Trim last item in field list of this line if it is blank
            line = line[:-1]
        num_items = len(line)

    # Read SNP metadata into a record array
    snp_dtype = [
        ('chrom', np.uint8),      # Chromosome # containing the SNP
        ('name', np.chararray),   # SNP name (e.g., 'rs...')
        ('dist_cm', float),       # Genetic position [CENTI-Morgans!!]; np.float was an alias of float
        ('base_pair', np.uint)    # Base pair position on chromosome
    ]
    snp = np.loadtxt(tped, usecols=list(range(4)), dtype=snp_dtype)
    # Fix the special case of a single row, where loadtxt does not return a 1-D record array
    if snp.size == 1:
        snp = np.array([tuple(snp[key] for key, _ in snp_dtype)], dtype=snp_dtype)

    # Read Genotype data
    if lazy_load:
        # Only pass pointer to file, to be read into a data structure that supports lazy loading
        data = tped
    else:
        # Read Genotype data into array
        data = np.genfromtxt(tped, usecols=list(range(4, num_items)), dtype=np.byte)
        # Allele columns come in pairs; fold them into (snp, sample, 2). Floor division keeps
        # the dimensions integral regardless of the interpreter's division semantics.
        if np.size(snp) == 1:
            # A single SNP row is returned as a 1-D array
            data = data.reshape([1, data.shape[0] // 2, 2])
        else:
            data = data.reshape([data.shape[0], data.shape[1] // 2, 2])

    # Load TFAM data, use only study IDs (np.int was an alias of the builtin int)
    sample_id = np.genfromtxt(tfam, dtype=int)[:, 1] if load_ids else None

    # Construct object
    return GenotypeFactory.new_instance(clazz, data, snp, sample_id, lazy_load=lazy_load)