def too_slow_test_layout_hut(self): '''Test generating layout coordinates for the Hutterites pedigree. This test is more about speed, to see that we do it in reasonable time.''' p = io_pedigree.read(itu.HUTT_PED) positions = pedigree_plot_laplacian._layout_positions(p.graph) #@UnusedVariable # Save to file - slow '''
def test_marriage_graph_layout_positions(self):
    '''Check the sizes of the small test pedigree graph and of its marriage graph.

    Reads the pedigree in itu.SMALL_FILE, builds the extended graph with
    pedigree_plot_laplacian._marriage_graph() and compares node and edge counts
    of both graphs against fixed expected values. The layout-position comparison
    itself is currently disabled (see the commented-out code at the end).
    '''
    pedigree = io_pedigree.read(itu.SMALL_FILE)
    graph = pedigree.graph
    extended = pedigree_plot_laplacian._marriage_graph(graph)

    # Original pedigree graph
    assert_equal(graph.number_of_nodes(), 8, 'Wrong number of nodes')
    assert_equal(graph.number_of_edges(), 10, 'Wrong number of edges')

    # Extended (marriage) graph
    assert_equal(extended.number_of_nodes(), 12, 'Wrong number of nodes')
    assert_equal(extended.number_of_edges(), 13, 'Wrong number of edges')

    # Disabled layout check, kept for reference:
    # positions = pedigree_plot_laplacian._layout_positions(g, g_extended)
    # expected = {
    #     1: (-0.23562670914672229, 2), 2: (-0.063382627268225591, 3),
    #     3: (-0.23562670914672237, 3), 4: (0.1502736499569044, 1),
    #     5: (0.15027364995690434, 2), 6: (0.43352532974526942, 0),
    #     7: (0.43352532974526858, 0), 8: (-0.48586913569278134, 2),
    #     -1: (-0.054615070741569148, 2.5), -4: (0.31358843226722749, 0.5),
    #     -3: (-0.054615070741569086, 2.5), -2: (-0.35145106893398231, 1.5)
    # }
def read_pedigree_from_test_file(file_name, genotyped_id_file=None):
    '''Load a pedigree from a PLINK TFAM file and sanity-check it.

    The file is parsed twice: once as a raw integer table (only to count its
    rows) and once through io_pedigree.read(). The pedigree graph must have one
    node per table row and must be a directed acyclic graph.

    file_name         -- pedigree file to read.
    genotyped_id_file -- forwarded to io_pedigree.read() as genotyped_id_file.

    Returns the loaded pedigree object.
    '''
    raw_table = np.genfromtxt(file_name, np.dtype(int))
    pedigree = io_pedigree.read(file_name, genotyped_id_file=genotyped_id_file)

    graph = pedigree._graph
    expected_node_count = raw_table.shape[0]
    assert_equal(graph.number_of_nodes(), expected_node_count, 'Incorrect number of nodes')
    assert nx.is_directed_acyclic_graph(graph), 'Pedigree is not a DAG'
    return pedigree
def too_slow_test_layout_hut(self): '''Test generating layout coordinates for the Hutterites pedigree. This test is more about speed, to see that we do it in reasonable time.''' p = io_pedigree.read(itu.HUTT_PED) positions = pedigree_plot_laplacian._layout_positions( p.graph) #@UnusedVariable # Save to file - slow '''
def test_save_load_pedigree_plink(self):
    '''Check that saving and loading a pedigree object from file preserves the original object.

    The template pedigree from itu.Templates.pedigree_hut() is written to a
    temporary file, read back from the same file handle, and compared with the
    original for equality.
    '''
    p = itu.Templates.pedigree_hut()
    # The context manager closes the temporary file even when writing or
    # reading raises, so a failing round trip no longer leaks the handle.
    with tempfile.TemporaryFile() as out_file:
        io_pedigree.write(p, out_file)
        # Rewind so that the read below starts at the beginning of what was just written.
        out_file.seek(0)
        p2 = io_pedigree.read(out_file, genotyped_id_file=itu.GENOTYPE_SAMPLE + '.tfam')
    assert_equal(p, p2, 'Saving and loading did not restore the original pedigree')
def test_marriage_graph(self):
    '''Check the sizes of the HUTT_PED pedigree graph and of its marriage graph.

    Builds the extended graph with pedigree_plot_laplacian._marriage_graph() and
    compares node and edge counts of both graphs against fixed expected values.
    '''
    pedigree = io_pedigree.read(itu.HUTT_PED)
    graph = pedigree.graph
    extended = pedigree_plot_laplacian._marriage_graph(graph)

    # (counter, expected value, failure message), checked in this order.
    checks = (
        (graph.number_of_nodes, 3671, 'Wrong number of nodes'),
        (graph.number_of_edges, 7200, 'Wrong number of edges'),
        (extended.number_of_nodes, 4661, 'Wrong number of nodes'),
        (extended.number_of_edges, 5580, 'Wrong number of edges'),
    )
    for counter, expected, message in checks:
        assert_equal(counter(), expected, message)
def read(self, file_name, genotyped_id_file=None):
    '''Load pedigree from file in old format.

    file_name         -- pedigree text file; parsed by io_pedigree.read() and
                         then a second time as a raw integer table.
    genotyped_id_file -- forwarded unchanged to io_pedigree.read().

    Returns a PedigreeOldStudy wrapping the loaded pedigree together with a
    dictionary that maps column 1 of the file to column 6 (the extra
    old-study-specific column; presumably sample ID -> generation -- confirm).
    '''
    p = io_pedigree.read(file_name, genotyped_id_file)
    # Load data from text file a second time to read the old-study-specific-column.
    # Not efficient.
    # genfromtxt() returns a 1-D array for a single-row file; force two
    # dimensions so that the column selection below works for one sample too.
    data = np.atleast_2d(np.genfromtxt(file_name, np.dtype(int)))
    # Each (column 1, column 6) row becomes one key/value pair.
    old_generation = dict(data[:, [1, 6]])
    # Wrap by old pedigree object
    return PedigreeOldStudy(p, old_generation)
def test_lca_small(self):
    '''Test lowest common ancestor computation in a small pedigree.

    Covers three cases on the pedigree in itu.SMALL_FILE: direct siblings,
    a more distant pair (including the paths from the returned ancestor down
    to each node), and a pair for which no common ancestor is expected.
    '''
    pedigree = io_pedigree.read(itu.SMALL_FILE)

    # Direct siblings
    self.__compute_and_check_lca(pedigree, 6, 7, [4, 5], 2)

    # Far siblings
    u, v = 6, 8
    ancestor = self.__compute_and_check_lca(pedigree, u, v, [3], 3)
    path_checks = (
        (u, [3, 5, 6], 'Wrong shorted path from ancestor to node u'),
        (v, [3, 8], 'Wrong shorted path from ancestor to node v'),
    )
    for node, expected_path, message in path_checks:
        assert_equal(shortest_path(pedigree.graph, ancestor, node), expected_path, message)

    # No common ancestor exists
    self.__compute_and_check_lca(pedigree, 1, 2, [None], Infinity)
def test_marriage_graph_layout_positions(self):
    '''Test the graph sizes that feed the extended graph layout.

    Loads the pedigree in itu.SMALL_FILE, derives its marriage graph via
    pedigree_plot_laplacian._marriage_graph() and verifies the node and edge
    counts of both graphs. The comparison of the actual layout positions is
    disabled; the reference values are kept in the trailing comment.
    '''
    small_pedigree = io_pedigree.read(itu.SMALL_FILE)
    g = small_pedigree.graph
    g_extended = pedigree_plot_laplacian._marriage_graph(g)

    # (counter, expected value, failure message), checked in this order.
    size_checks = (
        (g.number_of_nodes, 8, 'Wrong number of nodes'),
        (g.number_of_edges, 10, 'Wrong number of edges'),
        (g_extended.number_of_nodes, 12, 'Wrong number of nodes'),
        (g_extended.number_of_edges, 13, 'Wrong number of edges'),
    )
    for counter, expected, message in size_checks:
        assert_equal(counter(), expected, message)

    # Disabled layout check, kept for reference:
    # positions = pedigree_plot_laplacian._layout_positions(g, g_extended)
    # expected = {1: (-0.23562670914672229, 2), 2: (-0.063382627268225591, 3),
    #             3: (-0.23562670914672237, 3), 4: (0.1502736499569044, 1),
    #             5: (0.15027364995690434, 2), 6: (0.43352532974526942, 0),
    #             7: (0.43352532974526858, 0), 8: (-0.48586913569278134, 2),
    #             -1: (-0.054615070741569148, 2.5), -4: (0.31358843226722749, 0.5),
    #             -3: (-0.054615070741569086, 2.5), -2: (-0.35145106893398231, 1.5)}
def test_families(self):
    '''Test family computation.

    Counts the items produced by pt.families() for the HUTT_PED pedigree graph
    and compares the count with the expected number.
    '''
    pedigree = io_pedigree.read(itu.HUTT_PED)
    family_count = sum(1 for _ in pt.families(pedigree.graph))
    assert_equal(family_count, 990, 'Wrong number of pedigree families')
-------------------------------------------------- Main program -------------------------------------------------- ''' options, args = __parse_command_line_args(sys.argv) var_file_prefix, input_file, output_file = args genotype_filter = __GENOTYPE_FILTER[options.genotype_filter] genotype_cleaner = __GENOTYPE_CLEANER[options.genotype_filter] allele_start_index = 1 if options.type == 'imputed' else 0 try: # Read location list snp_dao = SnpDao(options.db_url) input_file = sys.stdin if args[1] == '-' else open(input_file, 'rb') # Sample IDs are read from the pedigree, and must match the genotype files' ordering pedigree = io_pedigree.read(options.pedigree_file, options.genotype_id_file) node_of = pedigree.node_of sample_id = np.loadtxt( options.id_file, dtype=np.int, ndmin=1) if options.id_file else pedigree._sample_id N = pedigree.num_genotyped sample_index = np.array( filter(lambda x: x is not None and x < N, map(node_of.get, sample_id)) ) # Filter FINDIVs that are not in the imputed FINDIV set out = open(output_file, 'wb') # Print header line if options.output_format == 'matrix': out.write('\t'.join([ 'variant', 'chromosome', 'bp_start', 'bp_stop', 'variant_type',
def test_lca_hut(self):
    '''Test lowest common ancestor computation in a large pedigree.

    Uses the HUTT_PED pedigree and one fixed pair of nodes; the expected
    ancestor list and the expected numeric value are passed to the shared
    checking helper.
    '''
    pedigree = io_pedigree.read(itu.HUTT_PED)
    u, v = 169512, 170362
    expected_ancestors = [8551]
    self.__compute_and_check_lca(pedigree, u, v, expected_ancestors, 9)
-------------------------------------------------- Main program -------------------------------------------------- ''' options, args = __parse_command_line_args(sys.argv) var_file_prefix, input_file, output_file = args genotype_filter = __GENOTYPE_FILTER[options.genotype_filter] genotype_cleaner = __GENOTYPE_CLEANER[options.genotype_filter] allele_start_index = 1 if options.type == 'imputed' else 0 try: # Read location list snp_dao = SnpDao(options.db_url) input_file = sys.stdin if args[1] == '-' else open(input_file, 'rb') # Sample IDs are read from the pedigree, and must match the genotype files' ordering pedigree = io_pedigree.read(options.pedigree_file, options.genotype_id_file) node_of = pedigree.node_of sample_id = np.loadtxt(options.id_file, dtype=np.int, ndmin=1) if options.id_file else pedigree._sample_id N = pedigree.num_genotyped sample_index = np.array(filter(lambda x: x is not None and x < N, map(node_of.get, sample_id))) # Filter FINDIVs that are not in the imputed FINDIV set out = open(output_file, 'wb') # Print header line if options.output_format == 'matrix': out.write('\t'.join(['variant', 'chromosome', 'bp_start', 'bp_stop', 'variant_type', 'ref_allele', 'minor_allele'] + map(str, sample_id)) + '\n') # Extract data using tabix for each location. A location may be a range of bps and may # correspond to multiple output lines. snp_count = [0, 0, 0, 0, 0] # #found variants; #not-found variants; #multiply-matching variants; #nameless variants # print list(enumerate(it.chain.from_iterable(__parse_line(line, snp_dao, debug=options.debug) # for line in (line.rstrip('\n').rstrip('\r')
def read_plink(**kwargs):
    '''Load a problem from the following PLINK files:

    Default           Override Option      Data                                 Format
    ======================================================================================
    prefix.pdg.tfam   pedigree             Pedigree adjacency                   PLINK TFAM (genotyped+nongenotyped samples)
    prefix.tfam       pedigree_genotyped   Genotyped sample pedigree            PLINK TFAM corresponding to prefix.tped
                                           (sub-graph of the pedigree)
    prefix.tped       genotype             Genotype data                        PLINK TPED
    prefix.hap.tped   haplotype*           Haplotype data                       PLINK TPED
    prefix.err        error**              Genotype errors flagged              Integer array (snps x samples)
    prefix.info       info                 Problem info                         pickle (binary)
    prefix.frm        frames               LD-independent SNP frames            text file
    prefix.lam        lam***               Haplotype est. recombination rate    text file

    *   - hap data not loaded if this option is None.
    **  - errors set to 0 if this file is not found or this option is set to None.
    *** - data not loaded if this file is not found.

    Keyword arguments: 'prefix' (default ''), 'verbose' (default False) and any
    of the override options listed above. A default file name is derived from
    the prefix only when the prefix is non-empty; otherwise an option that is
    not passed explicitly is None.

    Returns the loaded Problem object.

    Raises ValueError if any of pedigree, pedigree_genotyped or genotype is None,
    i.e. neither a prefix nor the explicit option was given.
    '''
    # Read input options
    verbose = kwargs.get('verbose', False)
    prefix = kwargs.get('prefix', '')
    overrideable_option = lambda name, default: kwargs.get(name, default if prefix else None)
    pedigree = overrideable_option('pedigree', prefix + '.pdg.tfam')
    pedigree_genotyped = overrideable_option('pedigree_genotyped', prefix + '.tfam')
    genotype = overrideable_option('genotype', prefix + '.tped')
    haplotype = overrideable_option('haplotype', prefix + '.hap.tped')
    error_file = overrideable_option('error', prefix + '.err')
    info = overrideable_option('info', prefix + '.info')
    # Each mandatory option is tested on its own. The previous check compared a
    # list literal with None, which is always true, so it could never fail.
    # The error file is deliberately not required: it may be None (see ** above).
    if any(option is None for option in (pedigree, pedigree_genotyped, genotype)):
        raise ValueError('Must specify a prefix or pedigree, pedigree_genotyped, genotype files')
    frames_file = overrideable_option('frames', prefix + '.frm')
    lam_file = overrideable_option('lam', prefix + '.lam')

    # Load data. print(...) with a single argument behaves the same under
    # Python 2 and Python 3.
    print_location = lambda x: x if x else '-'
    if verbose:
        print('Reading pedigree from %s, %s ...'
              % (print_location(pedigree), print_location(pedigree_genotyped),))
    p = io_pedigree.read(pedigree, genotyped_id_file=pedigree_genotyped)
    if verbose:
        print('Reading genotype data from %s ...' % (print_location(genotype),))
    g = io_genotype.read('plink', 'genotype', tped=genotype, load_ids=False)
    if verbose:
        print('Reading haplotype data from %s ...' % (print_location(haplotype),))
    h = io_genotype.read('plink', 'haplotype', tped=haplotype, load_ids=False) if haplotype else None
    if verbose:
        print('Reading error data from %s ...' % (print_location(error_file),))
    # A missing error file is tolerated; None is handed to Problem in that case.
    error = np.loadtxt(error_file) if error_file and os.path.isfile(error_file) else None
    if verbose:
        print('Reading frame data from %s ...' % (print_location(frames_file),))
    frames = db_gene.snp.ld_graph.read_frames(frames_file) if frames_file else None
    # A missing recombination-rate file is tolerated as well.
    lam = np.loadtxt(lam_file) if lam_file and os.path.isfile(lam_file) else None

    # info = ProblemInfo(p, g) if info is None else info
    problem = Problem(p, g, haplotype=h, error=error, frames=frames, lam=lam)
    # The pickled problem info is only loaded when haplotype data was requested.
    if haplotype and info:
        if verbose:
            print('Reading problem info from %s ...' % (info,))
        # NOTE(review): pickle.load() can execute arbitrary code; only point
        # 'info' at files from a trusted source.
        with open(info, 'rb') as info_stream:
            problem.info = pickle.load(info_stream)
    return problem
def read_plink(**kwargs):
    '''Load a problem from the following PLINK files:

    Default           Override Option      Data                                 Format
    ======================================================================================
    prefix.pdg.tfam   pedigree             Pedigree adjacency                   PLINK TFAM (genotyped+nongenotyped samples)
    prefix.tfam       pedigree_genotyped   Genotyped sample pedigree            PLINK TFAM corresponding to prefix.tped
                                           (sub-graph of the pedigree)
    prefix.tped       genotype             Genotype data                        PLINK TPED
    prefix.hap.tped   haplotype*           Haplotype data                       PLINK TPED
    prefix.err        error**              Genotype errors flagged              Integer array (snps x samples)
    prefix.info       info                 Problem info                         pickle (binary)
    prefix.frm        frames               LD-independent SNP frames            text file
    prefix.lam        lam***               Haplotype est. recombination rate    text file

    *   - hap data not loaded if this option is None.
    **  - errors set to 0 if this file is not found or this option is set to None.
    *** - data not loaded if this file is not found.

    Keyword arguments: 'prefix' (default ''), 'verbose' (default False) and any
    of the override options listed above. A default file name is derived from
    the prefix only when the prefix is non-empty; otherwise an option that is
    not passed explicitly is None.

    Returns the loaded Problem object.

    Raises ValueError if any of pedigree, pedigree_genotyped or genotype is None,
    i.e. neither a prefix nor the explicit option was given.
    '''
    # Read input options
    verbose = kwargs.get('verbose', False)
    prefix = kwargs.get('prefix', '')
    overrideable_option = lambda name, default: kwargs.get(
        name, default if prefix else None)
    pedigree = overrideable_option('pedigree', prefix + '.pdg.tfam')
    pedigree_genotyped = overrideable_option('pedigree_genotyped',
                                             prefix + '.tfam')
    genotype = overrideable_option('genotype', prefix + '.tped')
    haplotype = overrideable_option('haplotype', prefix + '.hap.tped')
    error_file = overrideable_option('error', prefix + '.err')
    info = overrideable_option('info', prefix + '.info')
    # Validate the mandatory inputs one by one. The earlier form compared a
    # list literal with None, which is always true, so the check never fired.
    # The error file is left out on purpose: it may be None (see ** above).
    mandatory = (pedigree, pedigree_genotyped, genotype)
    if any(option is None for option in mandatory):
        raise ValueError(
            'Must specify a prefix or pedigree, pedigree_genotyped, genotype files'
        )
    frames_file = overrideable_option('frames', prefix + '.frm')
    lam_file = overrideable_option('lam', prefix + '.lam')

    # Load data. print(...) with a single argument behaves the same under
    # Python 2 and Python 3.
    print_location = lambda x: x if x else '-'
    if verbose:
        print('Reading pedigree from %s, %s ...' % (
            print_location(pedigree),
            print_location(pedigree_genotyped),
        ))
    p = io_pedigree.read(pedigree, genotyped_id_file=pedigree_genotyped)
    if verbose:
        print('Reading genotype data from %s ...' % (
            print_location(genotype), ))
    g = io_genotype.read('plink', 'genotype', tped=genotype, load_ids=False)
    if verbose:
        print('Reading haplotype data from %s ...' % (
            print_location(haplotype), ))
    h = io_genotype.read('plink', 'haplotype', tped=haplotype,
                         load_ids=False) if haplotype else None
    if verbose:
        print('Reading error data from %s ...' % (
            print_location(error_file), ))
    # A missing error file is tolerated; None is handed to Problem in that case.
    error = np.loadtxt(
        error_file) if error_file and os.path.isfile(error_file) else None
    if verbose:
        print('Reading frame data from %s ...' % (
            print_location(frames_file), ))
    frames = db_gene.snp.ld_graph.read_frames(
        frames_file) if frames_file else None
    # A missing recombination-rate file is tolerated as well.
    lam = np.loadtxt(
        lam_file) if lam_file and os.path.isfile(lam_file) else None

    # info = ProblemInfo(p, g) if info is None else info
    problem = Problem(p, g, haplotype=h, error=error, frames=frames, lam=lam)
    # The pickled problem info is only loaded when haplotype data was requested.
    if haplotype and info:
        if verbose:
            print('Reading problem info from %s ...' % (info, ))
        # NOTE(review): pickle.load() can execute arbitrary code; only point
        # 'info' at files from a trusted source.
        with open(info, 'rb') as info_stream:
            problem.info = pickle.load(info_stream)
    return problem