Ejemplo n.º 1
0
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem, clearing out part of its genotype data.

        Exactly one of 'fraction' and 'test_index' must be specified:

        - fraction: a random portion of the genotype data is cleared via clear_random_portion().
        - test_index = 'hap': data is read from problem.h (haplotype array). The entire array
          is considered as a test array, but nothing is zeroed out. Useful for phasing result stats.
        - any other test_index: these specific test indices are cleared via clear_index().

        Raises ValueError if both or neither of 'fraction' and 'test_index' are given.'''
        # '^' binds tighter than 'not', so this reads not(A xor B): reject both-set and none-set.
        if not (fraction is not None) ^ (test_index is not None):
            raise ValueError('Must specify fraction or test_index')

        # Only a real string may select 'hap' mode. Comparing an index array against 'hap'
        # with == may yield an elementwise result whose truth value is ambiguous.
        hap_mode = isinstance(test_index, (str, type(u''))) and test_index == 'hap'

        # Create a working copy of the problem. Only the data is copied.
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h

        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data,
                                                  fraction)
        elif hap_mode:
            # Don't clear anything; call everything a test index.
            h = problem.h
            i = tuple(
                util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            # Note: in this mode g_orig and h refer to the caller's problem, not the working copy.
            self.g_orig = problem.g
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            # Fraction of test entries relative to the full (shape[0] x shape[1]) array.
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] *
                                                 self.h.shape[1])

        # i is an index tuple; the size of its first component is the number of test entries.
        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        self.__recode_single_genotype = None
Ejemplo n.º 2
0
 def problem_hut():
     '''Return the hutterites data set problem, loading it on first use.

     The loaded problem is kept in Templates.PROBLEM_HUT (it is large), so
     later calls return the stored object instead of reading it again.'''
     # Guard clause: a previously loaded problem is returned as-is.
     if Templates.PROBLEM_HUT:
         return Templates.PROBLEM_HUT

     hut_pedigree = Templates.pedigree_hut()
     hut_genotype = io_genotype.read(
         'plink', 'genotype', prefix=GENOTYPE_SAMPLE, load_ids=False)
     Templates.PROBLEM_HUT = Problem(hut_pedigree, hut_genotype)
     return Templates.PROBLEM_HUT
Ejemplo n.º 3
0
def pipeline_validation_experiment(location_file, true_type, true_location, pedigree, debug=False, remove_partial_calls=False):
    '''Load the 'true' genotypes from an external source and impute them.

    Genotypes are extracted from 'location_file' and wrapped in an ImputationSet, which is
    passed to the reader of the true genotypes stored at 'true_location'. A Problem built
    from the pedigree and the true genotypes is then run through impute_problem().

    Parameters:
        location_file - passed to extract_genotypes().
        true_type - format of the true genotypes; only 'iplex' is supported.
        true_location - location of the true genotype data, passed to the iplex reader.
        pedigree - pedigree used for both the imputation set and the problem.
        debug, remove_partial_calls - forwarded to impute_problem().

    Returns the pair returned by impute_problem().

    Raises ValueError if 'true_type' is not a supported format.'''
    # Validate the format before any data is loaded, so a bad argument fails immediately.
    if true_type != 'iplex':
        raise ValueError("Unsupported true genotype format '%s'" % (true_type,))

    g = extract_genotypes(location_file)
    t = ImputationSet(pedigree, g)
    # Historical example location: os.environ['OBER'] + '/data/impute/rare/to_livne_20121205'
    true_genotype = im.imputation.reader.iplex_to_genotype(true_location, t)

    problem = Problem(pedigree, true_genotype)
    p, t = impute_problem(problem, debug=debug, remove_partial_calls=remove_partial_calls)
    return p, t
Ejemplo n.º 4
0
 def setUp(self):
     '''Prepare the trio test problem, the phaser under test and the expected solution.'''
     unittest.TestCase.setUp(self)

     # The way to load a pedigree in conjunction with a genotype set is to recode
     # its sample IDs to consecutive for easier access by phasers.
     trio_prefix = itu.GENOTYPE_TRIO
     trio_pedigree_file = itu.GENOTYPE_TRIO + '.tfam'
     self.problem = io.read_plink(prefix=trio_prefix,
                                  haplotype=None,
                                  pedigree=trio_pedigree_file)
     self.phaser = trivial_phaser()

     # Expected results: same pedigree, genotypes read from the solution data set.
     pedigree = self.problem.pedigree
     expected_genotype = io_genotype.read('plink',
                                          'genotype',
                                          prefix=itu.GENOTYPE_TRIO_SOLUTION)
     self.solution = Problem(pedigree, expected_genotype)
Ejemplo n.º 5
0
Archivo: io.py Proyecto: orenlivne/ober
def read_npz(in_file):
    '''Read problem from NPZ file. in_file may be a file name or an open
    file descriptor.

    The archive must contain the pedigree_*, genotype_*, haplotype_*, error, info, frames
    and lam entries read below; genotype_map, haplotype_poo_phase and haplotype_hap_type
    are optional. Returns a Problem built from those entries.'''

    files = np.load(in_file)
    try:
        # Rebuild the pedigree graph and its annotations.
        graph = nx.DiGraph()
        graph.add_nodes_from(files['pedigree_nodes'])
        graph.add_edges_from(files['pedigree_graph'][0])
        p = Pedigree(graph,
                     sample_id=files['pedigree_sample_id'],
                     sex=files['pedigree_sex'],
                     phenotype=files['pedigree_phenotype'],
                     node_type=files['pedigree_node_type'],
                     sample_index=files['pedigree_sample_index'],
                     num_genotyped=files['pedigree_num_genotyped'][0])
        g = GenotypeFactory.new_instance('genotype', files['genotype_data'],
                                         files['genotype_snp'])
        # The haplotype set is created with a placeholder qc, which is then
        # overwritten by the stored QC array.
        h = GenotypeFactory.new_instance('haplotype',
                                         files['haplotype_data'],
                                         files['haplotype_snp'],
                                         qc=MISSING)
        error = files['error']
        h.qc = files['haplotype_qc']
        info = files['info'][0]

        # Load the stored frames object once; each archive access re-reads the entry.
        # presumably a mapping from a key to a sequence whose first element lists the
        # frames -- TODO confirm against the writer.
        stored_frames = files['frames'][0]
        frames = Frames((k, w) for k, v in stored_frames.iteritems()
                        for w in v[0]) if stored_frames else None
        lam = files['lam']

        # Optional fields
        if 'genotype_map' in files.files:
            g.map = files['genotype_map']
        if 'haplotype_poo_phase' in files.files:
            h.poo_phase = files['haplotype_poo_phase']
        if 'haplotype_hap_type' in files.files:
            h.hap_type = files['haplotype_hap_type']
    finally:
        # Everything needed has been read into memory, so release the archive handle
        # even if loading failed part-way.
        files.close()

    return Problem(p,
                   g,
                   haplotype=h,
                   error=error,
                   info=info,
                   frames=frames,
                   lam=lam)
Ejemplo n.º 6
0
Archivo: io.py Proyecto: orenlivne/ober
def read_plink(**kwargs):
    '''Load a problem from the following PLINK files:
    
        Default          Override Option    Data                                Format
        ======================================================================================
        prefix.pdg.tfam  pedigree           Pedigree adjacency                  PLINK TFAM
                                            (genotyped+nongenotyped samples)
        prefix.tfam      pedigree_genotyped Genotyped sample pedigree
                                            (sub-graph of the pedigree)         PLINK TFAM
                                            corresponding to prefix.tped
        prefix.tped      genotype           Genotype data                       PLINK TPED
        prefix.hap.tped  haplotype*         Haplotype data                      PLINK TPED 
        prefix.err       error**            Genotype errors flagged             Integer array (snps x samples) 
        prefix.info      info               Problem info                        pickle (binary)
        prefix.frm       frames             LD-independent SNP frames           text file
        prefix.lam       lam***             Haplotype est. recombination rate   text file
        
        * - hap data not loaded if this option is None.
        ** - errors set to 0 if this file is not found or this option is set to None.
        *** - data not loaded if this file is not found.

    Other keyword options:
        prefix  - common file name prefix used to derive the defaults above (default: '').
        verbose - if true, print a progress line before each file is read (default: False).

    Returns the loaded Problem. Problem info is read only if both a haplotype location and
    an info location are set.

    Raises ValueError if no pedigree, pedigree_genotyped or genotype location can be
    determined from the prefix or the override options.
    '''

    # Read input options
    verbose = kwargs.get('verbose', False)
    prefix = kwargs.get('prefix', '')

    def overrideable_option(name, default):
        # An explicitly passed option always wins; the prefix-derived default only
        # applies when a non-empty prefix was given.
        return kwargs.get(name, default if prefix else None)

    def print_location(x):
        # Placeholder shown in progress messages for unset locations.
        return x if x else '-'

    pedigree = overrideable_option('pedigree', prefix + '.pdg.tfam')
    pedigree_genotyped = overrideable_option('pedigree_genotyped',
                                             prefix + '.tfam')
    genotype = overrideable_option('genotype', prefix + '.tped')
    haplotype = overrideable_option('haplotype', prefix + '.hap.tped')
    error_file = overrideable_option('error', prefix + '.err')
    info = overrideable_option('info', prefix + '.info')
    frames_file = overrideable_option('frames', prefix + '.frm')
    lam_file = overrideable_option('lam', prefix + '.lam')

    # Each mandatory location is checked individually. The error file is deliberately
    # not required: the table above allows setting it to None (errors are then unset).
    required = (pedigree, pedigree_genotyped, genotype)
    if any(location is None for location in required):
        raise ValueError(
            'Must specify a prefix or pedigree, pedigree_genotyped, genotype files'
        )

    # Load data. Single-argument print(...) calls behave the same under Python 2 and 3.
    if verbose:
        print('Reading pedigree from %s, %s ...' % (
            print_location(pedigree),
            print_location(pedigree_genotyped),
        ))
    p = io_pedigree.read(pedigree, genotyped_id_file=pedigree_genotyped)

    if verbose:
        print('Reading genotype data from %s ...' % (
            print_location(genotype), ))
    g = io_genotype.read('plink', 'genotype', tped=genotype, load_ids=False)

    if verbose:
        print('Reading haplotype data from %s ...' % (
            print_location(haplotype), ))
    h = io_genotype.read('plink', 'haplotype', tped=haplotype,
                         load_ids=False) if haplotype else None

    if verbose:
        print('Reading error data from %s ...' % (print_location(error_file), ))
    # A missing error file is not an error; the problem is then built with error=None.
    error = np.loadtxt(
        error_file) if error_file and os.path.isfile(error_file) else None

    if verbose:
        print('Reading frame data from %s ...' % (
            print_location(frames_file), ))
    frames = db_gene.snp.ld_graph.read_frames(
        frames_file) if frames_file else None

    # A missing lam file is tolerated in the same way as a missing error file.
    lam = np.loadtxt(
        lam_file) if lam_file and os.path.isfile(lam_file) else None

    # info = ProblemInfo(p, g) if info is None else info
    problem = Problem(p, g, haplotype=h, error=error, frames=frames, lam=lam)
    if haplotype and info:
        if verbose:
            print('Reading problem info from %s ...' % (info, ))
        # NOTE: unpickling can execute arbitrary code; only load info files from a
        # trusted source.
        with open(info, 'rb') as info_in:
            problem.info = pickle.load(info_in)
    return problem