Example #1
0
def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
    '''Read files that may be split per chromosome (annot, ref_ld, w_ld).

    Exactly one source is used; ``not_chr_arg`` wins when both are given.

    Args:
        chr_arg: prefix(es) of chromosome-split files; passed to ``_splitp``
            and parsed with ``parsefunc(files, _N_CHR, **kwargs)``.
        not_chr_arg: prefix(es) of unsplit files; passed to ``_splitp`` and
            parsed with ``parsefunc(files, **kwargs)``.
        log: object exposing ``log(msg)``; receives progress/error messages.
        noun: human-readable name of what is being read (used in messages).
        parsefunc: callable doing the actual parsing.
        **kwargs: forwarded unchanged to ``parsefunc``.

    Returns:
        Whatever ``parsefunc`` returns.

    Raises:
        ValueError: if neither ``chr_arg`` nor ``not_chr_arg`` is given, or
            re-raised from ``parsefunc`` after logging an error line.
    '''
    if not (not_chr_arg or chr_arg):
        # Without this guard neither branch below would bind `out`, and the
        # final `return out` would die with an opaque UnboundLocalError.
        raise ValueError('No {N} specified.'.format(N=noun))

    try:
        if not_chr_arg:
            log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
            out = parsefunc(_splitp(not_chr_arg), **kwargs)
        else:
            # Only used for the log line: shows the chromosome range in the
            # place where the chromosome number is substituted.
            f = ps.sub_chr(chr_arg, '[1-22]')
            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
            out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
    except ValueError:
        log.log('Error parsing {N}.'.format(N=noun))
        # Bare raise keeps the original traceback (``raise e`` drops it on
        # Python 2).
        raise

    return out
Example #2
0
def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
    '''Read files that may be split per chromosome (annot, ref_ld, w_ld).

    Exactly one source is used; ``not_chr_arg`` wins when both are given.

    Args:
        chr_arg: prefix(es) of chromosome-split files; passed to ``_splitp``
            and parsed with ``parsefunc(files, _N_CHR, **kwargs)``.
        not_chr_arg: prefix(es) of unsplit files; passed to ``_splitp`` and
            parsed with ``parsefunc(files, **kwargs)``.
        log: object exposing ``log(msg)``; receives progress/error messages.
        noun: human-readable name of what is being read (used in messages).
        parsefunc: callable doing the actual parsing.
        **kwargs: forwarded unchanged to ``parsefunc``.

    Returns:
        Whatever ``parsefunc`` returns.

    Raises:
        ValueError: if neither ``chr_arg`` nor ``not_chr_arg`` is given, or
            re-raised from ``parsefunc`` after logging an error line.
    '''
    if not (not_chr_arg or chr_arg):
        # Without this guard neither branch below would bind `out`, and the
        # final `return out` would die with an opaque UnboundLocalError.
        raise ValueError('No {N} specified.'.format(N=noun))

    try:
        if not_chr_arg:
            log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
            out = parsefunc(_splitp(not_chr_arg), **kwargs)
        else:
            # Only used for the log line: shows the chromosome range in the
            # place where the chromosome number is substituted.
            f = ps.sub_chr(chr_arg, '[1-22]')
            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
            out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
    except ValueError:
        log.log('Error parsing {N}.'.format(N=noun))
        # Bare raise keeps the original traceback (``raise e`` drops it on
        # Python 2).
        raise

    return out
Example #3
0
def _read_header(args):
    '''Read the column header of an .expscore file.

    When ``args.exp_chr`` is set, the header is taken from the chromosome-1
    file (``ps.sub_chr(args.exp_chr, 1)``); otherwise from ``args.exp``.

    Returns:
        ``(indices, cnames, cgroups)`` as produced by ``ps.filter_columns``,
        with the SNP/CHR/BP/CM entries removed from ``cnames``.
    '''
    # Both layouts share the same pipeline; only the path prefix differs.
    prefix = ps.sub_chr(args.exp_chr, 1) if args.exp_chr else args.exp
    path = prefix + '.expscore'

    # comp[0] is appended to the path and comp[1] is handed to
    # filter_columns as-is (presumably compression suffix / mode -- verify
    # against ps.which_compression).
    comp = ps.which_compression(path)
    indices, cnames, cgroups = ps.filter_columns(
        path + comp[0], comp[1], fsuffix='expscore', args=args)

    id_columns = ['SNP', 'CHR', 'BP', 'CM']
    cnames = [name for name in cnames if name not in id_columns]

    return indices, cnames, cgroups
Example #4
0
def _read_multiple_chr_split_files(chr_arg_array, not_chr_arg_array, log, noun,
                                   parsefunc, **kwargs):
    '''Read several file sets that may be split per chromosome.

    Every entry of the chosen array is expanded with ``_splitp`` and the
    results are concatenated into a single list handed to ``parsefunc``.
    ``not_chr_arg_array`` wins when both arrays are given.

    Args:
        chr_arg_array: iterable of prefixes of chromosome-split files; parsed
            with ``parsefunc(files, _N_CHR, **kwargs)``.
        not_chr_arg_array: iterable of prefixes of unsplit files; parsed with
            ``parsefunc(files, **kwargs)``.
        log: object exposing ``log(msg)``; receives progress/error messages.
        noun: human-readable name of what is being read (used in messages).
        parsefunc: callable doing the actual parsing.
        **kwargs: forwarded unchanged to ``parsefunc``.

    Returns:
        Whatever ``parsefunc`` returns.

    Raises:
        ValueError: if both arrays are empty/None, or re-raised from
            ``parsefunc`` after logging an error line.
    '''
    if not (not_chr_arg_array or chr_arg_array):
        # Without this guard neither branch below would bind `out`, and the
        # final `return out` would die with an opaque UnboundLocalError.
        raise ValueError('No {N} specified.'.format(N=noun))

    try:
        fh_list = []
        if not_chr_arg_array:
            for not_chr_arg in not_chr_arg_array:
                log.log('Reading {N} from {F} ...'.format(F=not_chr_arg,
                                                          N=noun))
                fh_list += _splitp(not_chr_arg)
            # Was `fh_lit`, an undefined name that raised NameError.
            out = parsefunc(fh_list, **kwargs)
        else:
            for chr_arg in chr_arg_array:
                # Only used for the log line.
                f = ps.sub_chr(chr_arg, '[1-22]')
                log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
                fh_list += _splitp(chr_arg)
            out = parsefunc(fh_list, _N_CHR, **kwargs)
    except ValueError:
        log.log('Error parsing {N}.'.format(N=noun))
        # Bare raise keeps the original traceback (``raise e`` drops it on
        # Python 2).
        raise

    return out
Example #5
0
def _read_nonneg_constraints(args, log, snp_names, ref_ld_cnames):
    '''Build the annotation matrix used for non-negativity constraints.

    For every prefix obtained from ``--ref_ld_chr`` (one file per chromosome)
    or ``--ref_ld`` (a single file), the matching ``.annot_small.gz`` file is
    read, falling back to ``.annot.gz`` when the small variant does not exist.
    The first four columns of each file become the row index. Files belonging
    to one prefix are stacked row-wise; the per-prefix frames are then joined
    on that index, and duplicate rows are dropped.

    Args:
        args: parsed options; reads ``ref_ld_chr``, ``ref_ld``, ``anno``
            (optional comma-separated list of annotation columns to keep),
            ``intercept_h2`` and ``no_intercept``.
        log: object exposing ``log(msg)``.
        snp_names: currently unused; kept for interface compatibility.
        ref_ld_cnames: LD-score column names; each name, truncated at its
            first ``'L2'`` if present, must equal the corresponding
            annotation column.

    Returns:
        A float DataFrame with one row per distinct constraint and one column
        per annotation, plus an all-zero ``'intercept'`` column when
        ``args.intercept_h2 is None`` and ``args.no_intercept`` is false.

    Raises:
        AssertionError: if no reference LD prefix was given, if annotation
            column names repeat across prefixes, or if the annotation columns
            do not match ``ref_ld_cnames``.
        ValueError: re-raised after logging an error line.
    '''
    # The builtin ``reduce`` exists only on Python 2; functools.reduce is
    # available on both Python 2.6+ and Python 3.
    from functools import reduce

    assert args.ref_ld_chr or args.ref_ld is not None, 'non-negative constraints must be used with --ref_ld_chr or --ref_ld'
    noun = 'annot matrix'

    try:
        # Resolve the list of file prefixes to read.
        split_by_chr = args.ref_ld_chr is not None
        if split_by_chr:
            f = ps.sub_chr(args.ref_ld_chr, '[1-22]')
            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
            fh_list = _splitp(args.ref_ld_chr)
        else:
            f = args.ref_ld
            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
            fh_list = _splitp(args.ref_ld)

        # Loop-invariant: the user-selected annotation subset, if any.
        annotations = args.anno.split(',') if args.anno is not None else None

        df_annotations_list = []
        columns_list = []
        for fh in fh_list:
            if split_by_chr:
                chroms = range(1, _N_CHR + 1)
                flist_small = [ps.sub_chr(fh, chrom) + '.annot_small.gz'
                               for chrom in chroms]
                flist_notsmall = [ps.sub_chr(fh, chrom) + '.annot.gz'
                                  for chrom in chroms]
            else:
                flist_small = [fh + '.annot_small.gz']
                flist_notsmall = [fh + '.annot.gz']

            df_annotations_list_fh = []
            for small_name, full_name in zip(flist_small, flist_notsmall):
                # Prefer the small annotation file; fall back to the full one.
                fname = small_name if os.path.exists(small_name) else full_name
                # sep=r'\s+' is the supported spelling of the deprecated
                # delim_whitespace=True.
                df_annotations_chr = pd.read_csv(fname, sep=r'\s+')
                # The first four columns identify the row; they become the
                # index so that only annotation columns remain as data.
                df_annotations_chr.set_index(
                    df_annotations_chr.columns[:4].tolist(),
                    drop=True, inplace=True)

                if annotations is not None:
                    # Keep only requested annotations, in file order; names
                    # missing from the file are silently ignored.
                    keep = [c for c in df_annotations_chr.columns
                            if c in annotations]
                    df_annotations_chr = df_annotations_chr.loc[:, keep]

                df_annotations_list_fh.append(df_annotations_chr)

            df_fh = pd.concat(df_annotations_list_fh, axis=0)
            columns_list += df_fh.columns.tolist()
            df_annotations_list.append(df_fh)

        assert len(np.unique(columns_list)) == len(columns_list), 'duplicate columns exist in different annotation files'

        # Join the per-prefix frames side by side on the shared row index.
        df_annotations = reduce(
            lambda left, right: pd.merge(left, right,
                                         left_index=True, right_index=True),
            df_annotations_list)
        del df_annotations_list  # release the per-prefix frames early
        df_annotations.drop_duplicates(inplace=True)
        log.log('%d constraints remained after removing duplicate constraints'%(df_annotations.shape[0]))

        # Annotation names are the LD-score column names truncated at 'L2'.
        annotation_names = [(c[:c.index('L2')] if ('L2' in c) else c)
                            for c in ref_ld_cnames]
        assert (df_annotations.columns == annotation_names).all()

        # Add an all-zero intercept column when the intercept is estimated.
        if (args.intercept_h2 is None and not args.no_intercept):
            df_annotations['intercept'] = np.zeros(df_annotations.shape[0])

        # np.float was only an alias of the builtin float and has been
        # removed from NumPy; the builtin yields the same float64 dtype.
        return df_annotations.astype(float)

    except ValueError:
        log.log('Error parsing {N}.'.format(N=noun))
        # Bare raise keeps the original traceback (``raise e`` drops it on
        # Python 2).
        raise