Example #1
0
def potatoSimulate(args):
    """Load both genotype databases and pass them on to ``simulateSNPs``.

    ``args`` is a mapping that must provide the keys ``hdf5File``,
    ``hdf5accFile``, ``AccID``, ``numSNPs``, ``outFile`` and ``err_rate``.
    """
    log.info("loading database files")
    # The generator is consumed left to right, so the files are opened in the
    # same order as the keys are listed.
    snp_db, acc_db = (
        genotype.load_hdf5_genotype_data(args[key])
        for key in ('hdf5File', 'hdf5accFile')
    )
    log.info("done!")
    simulateSNPs(
        snp_db,
        acc_db,
        args['AccID'],
        args['numSNPs'],
        args['outFile'],
        args['err_rate'],
    )
    log.info("finished!")
Example #2
0
def potatoCrossIdentifier(args):
    """Parse the input SNP file, load both databases and run ``crossIdentifier``.

    ``args`` is a mapping that must provide the keys ``inFile``, ``logDebug``,
    ``hdf5File``, ``hdf5accFile``, ``binLen`` and ``outFile``.
    """
    parsed = snpmatch.parseInput(inFile=args['inFile'],
                                 logDebug=args['logDebug'])
    # The genotype strings (third element) are parsed but not needed here.
    snpCHR, snpPOS, _snpGT, snpWEI, DPmean = parsed

    log.info("loading genotype files!")
    snp_db = genotype.load_hdf5_genotype_data(args['hdf5File'])
    acc_db = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
    log.info("done!")

    log.info("running cross identifier!")
    crossIdentifier(
        args['binLen'], snpCHR, snpPOS, snpWEI, DPmean,
        snp_db, acc_db, args['outFile'],
    )
    log.info("finished!")
Example #3
0
def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outFile):
  """Score every database accession against a sample's SNP calls.

  For each chromosome listed in the database, the sample positions that also
  occur in the database are looked up.  In chunks of ``chunk_size`` SNPs, an
  accession's score grows by the sample weight of whichever genotype code it
  carries at a SNP: code 0 uses column 0 of ``snpWEI``, code 2 uses column 1
  and code 1 uses column 2.  Database values below zero count as
  uninformative for that accession.

  Args:
    snpCHR: per-SNP chromosome identifiers of the sample (compared with the
      integer chromosome ids of the database).
    snpPOS: per-SNP positions of the sample.
    snpGT: per-SNP genotype values of the sample, forwarded to
      ``getHeterozygosity`` for the matched SNPs.
    snpWEI: per-SNP weights, indexed as ``[snp, 0..2]``.
    DPmean: forwarded unchanged to ``print_out_table``.
    hdf5File: path of the HDF5 database that is scored against.
    hdf5accFile: path of the second HDF5 database.  It is loaded (so a bad
      path still fails early, as before) but not otherwise used here.
    outFile: prefix for the output files; a falsy value means ``"genotyper"``.

  Returns:
    Tuple ``(ScoreList, NumInfoSites)`` with one entry per accession.

  Side effects:
    Writes ``<outFile>.scores.txt`` and ``<outFile>.matches.json``.
  """
  # Resolve the default prefix before its first use.  Previously the scores
  # file name was built first, so ``outFile=None`` raised a TypeError and an
  # empty prefix produced a hidden ".scores.txt" next to
  # "genotyper.matches.json".
  if not outFile:
    outFile = "genotyper"
  NumSNPs = len(snpCHR)
  log.info("loading database files")
  GenotypeData = genotype.load_hdf5_genotype_data(hdf5File)
  GenotypeData_acc = genotype.load_hdf5_genotype_data(hdf5accFile)
  log.info("done!")
  num_lines = len(GenotypeData.accessions)
  ScoreList = np.zeros(num_lines, dtype="float")
  NumInfoSites = np.zeros(num_lines, dtype="uint32")
  NumMatSNPs = 0
  overlappedInds = np.zeros(0, dtype=int)
  chunk_size = 1000
  # (database genotype code, column of snpWEI holding its weight)
  code_to_weight_col = ((0, 0), (2, 1), (1, 2))
  for i in np.array(GenotypeData.chrs, dtype=int):
    perchrTarPos = np.where(snpCHR == i)[0]
    perchrtarSNPpos = snpPOS[perchrTarPos]
    log.info("Analysing chromosome %s positions", i)
    start = GenotypeData.chr_regions[i-1][0]
    end = GenotypeData.chr_regions[i-1][1]
    chrpositions = GenotypeData.positions[start:end]
    # Row indices into the database (offset by the chromosome start) and the
    # corresponding indices into this chromosome's sample SNPs.
    matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start
    matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0]
    matchedTarWei = snpWEI[perchrTarPos[matchedTarInd],]
    overlappedInds = np.append(overlappedInds, perchrTarPos[matchedTarInd])
    NumMatSNPs = NumMatSNPs + len(matchedAccInd)
    for j in range(0, len(matchedAccInd), chunk_size):
      t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:]
      chunkWei = matchedTarWei[j:j+chunk_size]
      for code, col in code_to_weight_col:
        # Weight each SNP row by the sample's weight for this code and add up
        # per accession (column).
        matches = (t1001SNPs == code)
        ScoreList = ScoreList + np.sum(matches * chunkWei[:, col, np.newaxis], axis=0)
      # Number of informative sites: chunk size minus the negative entries of
      # each accession.  Summing over axis 0 keeps the result 1-D for chunks
      # of any size; the former single-SNP branch added a (1, n) mask and
      # turned NumInfoSites into a 2-D array.
      NumInfoSites = NumInfoSites + len(chunkWei) - np.sum(t1001SNPs < 0, axis=0)
    log.info("Done analysing %s positions", NumMatSNPs)
  log.info("writing score file!")
  overlap = float(NumMatSNPs)/NumSNPs
  print_out_table(outFile + '.scores.txt',GenotypeData.accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
  print_topHits(outFile + ".matches.json", GenotypeData.accessions, ScoreList, NumInfoSites, overlap, NumMatSNPs)
  getHeterozygosity(snpGT[overlappedInds], outFile + ".matches.json")
  return (ScoreList, NumInfoSites)
Example #4
0
def main():
    """Compute the IBS kinship matrix of a genotype file and save it.

    Expects exactly four command-line arguments, in order: input genotype
    HDF5 file, scaling flag (``'1'`` scales the matrix, anything else leaves
    it unscaled), output kinship HDF5 file, output kinship CSV file.  The log
    file name is the HDF5 output name plus ``.log``.
    """
    cli_args = sys.argv[1:]
    geno_in, scale_kinship, kinship_out_hdf5, kinship_out_csv = cli_args

    logger = LoggerFactory.get_logger(kinship_out_hdf5 + '.log')
    LoggerFactory.log_command(logger, cli_args)

    # Read the genotype data and keep a handle on its accession list.
    geno = genotype.load_hdf5_genotype_data(geno_in)
    accessions = geno.accessions
    logger.info('Finished reading SNP from %s', geno_in)

    logger.info('Start calculating kinship')
    K = geno.get_ibs_kinship_matrix()
    should_scale = (scale_kinship == '1')
    logger.info('Scaling' if should_scale else 'NOT scaling')
    if should_scale:
        K = kinship.scale_k(K)

    # Persist the matrix in both output formats.
    logger.info('Saving kinship to HDF5 file %s', kinship_out_hdf5)
    kinship.save_kinship_to_file(kinship_out_hdf5, K, accessions,
                                 geno.num_snps)
    logger.info('Saving kinship to CSV file %s', kinship_out_csv)
    save_kinship_in_text_format(kinship_out_csv, K, accessions)

    logger.info('Done!')
def _load_genotype_(folder,genotype_id):
    """Load the binary HDF5 genotype stored under ``folder/<genotype_id>``.

    The file is expected at
    ``<folder>/<genotype_id>/all_chromosomes_binary.hdf5``.

    Raises:
        Exception: if that file does not exist.
    """
    genotype_dir = os.path.join(folder, str(genotype_id))
    candidate = os.path.join(genotype_dir,
                             'all_chromosomes_%s.hdf5' % 'binary')
    # Guard clause: fail first, load only when the file is really there.
    if not os.path.isfile(candidate):
        raise Exception('No Genotype files in %s folder were found.' % genotype_dir)
    return genotype.load_hdf5_genotype_data(candidate)
    def on_post(self, req, resp,genotype_id,chr,position):
        """Compute LD around ``position`` and store it in ``req.context['result']``.

        The genotype is read from
        ``<self.storage_path>/<genotype_id>/all_chromosomes_binary.hdf5``.
        The query parameter ``num_snps`` defaults to 250, and the accession
        list comes from ``req.context['doc']`` (empty list when absent).
        The response status is set to HTTP 200.
        """
        position = int(position)
        hdf5_path = '%s/%s/all_chromosomes_binary.hdf5' % (self.storage_path,genotype_id)
        genotypeData = genotype.load_hdf5_genotype_data(hdf5_path)
        num_snps = int(req.params.get('num_snps',250))
        accessions = req.context.get('doc',[])

        raw_ld = ld.calculate_ld_for_region(genotypeData,accessions,chr,position,num_snps=num_snps)
        # filter nan
        req.context['result'] = _replace_NaN(raw_ld)
        resp.status = falcon.HTTP_200
def main():
    """Subset a genotype file to listed accessions and filter SNPs by MAF.

    Expects exactly five command-line arguments, in order: input genotype
    HDF5 file, CSV file with the accessions to keep (first column of every
    row), lower MAF bound, upper MAF bound, output genotype HDF5 file.  The
    log file name is the output name plus ``.log``.
    """
    geno_in, acc_in, maf_lb, maf_ub, geno_out = sys.argv[1:]

    logger = LoggerFactory.get_logger(geno_out + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])

    maf_lb, maf_ub = float(maf_lb), float(maf_ub)

    ## Import genotype data
    geno = genotype.load_hdf5_genotype_data(geno_in)
    SNP_acc = geno.accessions
    logger.info('Finished reading SNP from %s', geno_in)

    ## accession subset
    # csv.reader yields one *list* per row.  The former ``list(reader)`` was
    # therefore a list of lists, and ``acc in file_acc`` could never be true
    # for a string accession.  Collect the first column into a set instead.
    # NOTE(review): 'rb' is the Python 2 csv convention used by this script;
    # Python 3 needs text mode (``open(acc_in, newline='')``).
    with open(acc_in, 'rb') as f:
        file_acc = set(row[0] for row in csv.reader(f) if row)
    logger.info('Finished reading accession subset from %s', acc_in)

    ## get common accessions in the same order for genotype and accession subset
    acc_common = [acc for acc in SNP_acc if acc in file_acc]

    ## filtering
    logger.info(
        'Start subsetting accessions and filtering SNPs by MAF >%f and <=%f',
        maf_lb, maf_ub)
    # First position of each accession in the genotype file.  One pass over
    # the accessions replaces a ``.index`` call per accession and also works
    # when ``SNP_acc`` is not a list.
    first_ix = {}
    for ix, acc in enumerate(SNP_acc):
        first_ix.setdefault(acc, ix)
    geno.filter_accessions_ix([first_ix[acc] for acc in acc_common])
    (num_snps, num_removed) = filter_maf_snps(geno, maf_lb, maf_ub)
    logger.info('Removed %d from %d SNPs', num_removed, num_snps)
    logger.info('Number of SNPs remaining %d', geno.num_snps)

    logger.info('Start writing filtered genotype file to %s', geno_out)
    geno.save_as_hdf5(geno_out)
    logger.info('Finished')

    logger.info('Done!')
Example #8
0
                     type="string")
# -r/--rareAlleleFreq: allele-frequency threshold, stored as
# ``options.allelFreq`` (default 0.05).
inOptions.add_option("-r",
                     "--rareAlleleFreq",
                     dest="allelFreq",
                     help="Allele frequency to consider as rare allele",
                     default=0.05,
                     type="float")

#inOptions.add_option("-s", "--error_rate", dest="error", help="Maximum score which is considered to be for top hit accession", default=0.98, type="float")

(options, args) = inOptions.parse_args()

logging.basicConfig(format='%(levelname)s:%(asctime)s:  %(message)s',
                    level=logging.DEBUG)

# Load the SNP database named by the hdf5File option and count its accessions.
GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File)
NumAcc = len(GenotypeData.accessions)

# Iterate over the SNPs in chunks of 1000.
snps = GenotypeData.get_snps_iterator(is_chunked=True, chunk_size=1000)
chunk_i = 0
# Two per-accession accumulators, zero-initialised; how they are filled is
# not visible in this excerpt.
NumRareAllele = numpy.zeros(NumAcc)
InfoPOS = numpy.zeros(NumAcc)
logging.info("Starting the calculation")
for snp in snps:

    chunk_i = chunk_i + 1
    snps_array = numpy.array(snp)
    # Work on an independent copy so both recodings start from the raw chunk.
    info_array = numpy.copy(snps_array)
    # In the genotype copy, -1 entries become 0
    # (presumably -1 marks a missing call; verify against the database).
    snps_array[snps_array == -1] = 0
    # In the info copy, 0 becomes 1 first and only then -1 becomes 0, so the
    # result is 0 exactly where the raw value was -1.  The order matters.
    info_array[info_array == 0] = 1
    info_array[info_array == -1] = 0
                     type="float")
# -i/--input_vcf: path of the VCF file to read (options.inFile).
inOptions.add_option("-i",
                     "--input_vcf",
                     dest="inFile",
                     help="Input VCF file",
                     type="string")
# -o/--output: path of the score output file (options.outFile).
inOptions.add_option("-o",
                     "--output",
                     dest="outFile",
                     help="Output file with the probability scores",
                     type="string")
# Defaults for the ``error`` and ``qual`` destinations; the options that
# declare them are outside this excerpt.
inOptions.set_defaults(error=0.001, qual=100)
(options, args) = inOptions.parse_args()

# NOTE(review): this handle is not closed anywhere in the visible code.
inputVCFfile = open(options.inFile, 'r')
# NOTE(review): the database path is hard-coded to one user's scratch space.
GenotypeData = genotype.load_hdf5_genotype_data(
    '/lustre/scratch/users/rahul.pisupati/all_chromosomes_binary.hdf5')

# One score slot per database accession, plus counters for the VCF scan.
ScoreList = numpy.zeros(len(GenotypeData.accessions))
NumSNP = 0
CheckStatus = 0
# Reads the whole file into memory; ``[0:]`` selects every line.
for vcfLine in inputVCFfile.readlines()[0:]:
    # Skip header/meta lines, which start with '#'.
    if (vcfLine[0][0] != '#'):
        # Keep records whose 6th field exceeds options.qual and whose 4th and
        # 5th fields are single characters (QUAL, REF and ALT in the standard
        # VCF column layout).
        if (float(vcfLine.split()[5]) > options.qual
                and len(vcfLine.split()[3]) == 1
                and len(vcfLine.split()[4]) == 1):
            # Look up the record by its first field with "Chr" removed and
            # its second field (chromosome and position in the standard VCF
            # layout).
            dataSNPlist = getAllele(GenotypeData,
                                    vcfLine.split()[0].replace("Chr", ""),
                                    vcfLine.split()[1])
            NumSNP += 1
            # The body of this check is beyond this excerpt; presumably '0'
            # is getAllele's "not found" marker -- TODO confirm.
            if (dataSNPlist != '0'):
Example #10
0
def crossGenotyper(args):
    """Assign each genomic bin of a cross to parent 1, parent 2 or both.

    Inputs, all taken from the ``args`` mapping:
      * ``inFile``      - SNP calls of the sample (e.g. a GATK VCF, possibly
                          filtered), parsed by ``snpmatch.parseInput``
      * ``logDebug``    - forwarded to ``snpmatch.parseInput``
      * ``parents``     - ``"<parent1>x<parent2>"`` accession names
      * ``hdf5accFile`` - SNP matrix (HDF5 file) containing both parents
      * ``binLen``      - bin length forwarded to ``getBins``
      * ``outFile``     - path of the tab-separated result file

    One line is written per bin: bin number, number of sample SNPs whose
    genotype code minus parent 1's code is zero, number of SNPs compared,
    the call (``0``, ``1``, ``0.5`` or ``NA``) and the p-value or ``NA``.
    Sample genotype code 2 is recoded to 4 before the subtraction, so it never
    equals either parent's 0/1 code.
    """
    (snpCHR, snpPOS, snpGT, snpWEI,
     DPmean) = snpmatch.parseInput(inFile=args['inFile'],
                                   logDebug=args['logDebug'])
    parents = args['parents']
    ## need to filter the SNPs present in C and M
    log.info("loading HDF5 file")
    GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
    ## die if either parents are not in the dataset
    # Only the failures a bad ``parents`` value produces are caught: no match
    # or no "x" separator (IndexError), or a non-string value
    # (AttributeError).  A bare ``except`` would also hide unrelated errors.
    try:
        indP1 = np.where(
            GenotypeData_acc.accessions == parents.split("x")[0])[0][0]
        indP2 = np.where(
            GenotypeData_acc.accessions == parents.split("x")[1])[0][0]
    except (IndexError, AttributeError):
        snpmatch.die("parents are not in the dataset")
    snpsP1 = GenotypeData_acc.snps[:, indP1]
    snpsP2 = GenotypeData_acc.snps[:, indP2]
    # identifying the segregating SNPs between the accessions
    # only selecting 0 or 1
    segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0)
                          & (snpsP1 < 2) & (snpsP2 < 2))[0]
    log.info("number of segregating snps between parents: %s", len(segSNPsind))
    (ChrBins, PosBins) = getBins(GenotypeData_acc, args['binLen'])
    log.info("number of bins: %s", len(ChrBins))
    # ``with`` closes the output file even if a bin raises unexpectedly.
    with open(args['outFile'], 'w') as outfile:
        for i in range(len(PosBins)):
            # PosBins holds the number of database SNPs per bin, so a bin
            # covers database rows [start, end).
            start = np.sum(PosBins[0:i])
            end = start + PosBins[i]
            # first snp positions which are segregating and are in this window
            reqPOSind = segSNPsind[np.where((segSNPsind < end)
                                            & (segSNPsind >= start))[0]]
            reqPOS = GenotypeData_acc.positions[reqPOSind]
            perchrTarPosind = np.where(snpCHR == ChrBins[i])[0]
            perchrTarPos = snpPOS[perchrTarPosind]
            matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS,
                                                       perchrTarPos))[0]]
            matchedTarInd = perchrTarPosind[np.where(
                np.in1d(perchrTarPos, reqPOS))[0]]
            matchedTarGTs = snpGT[matchedTarInd]
            # Best effort per bin: any failure is recorded as an all-NA row.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit through.
            try:
                TarGTs = snpmatch.parseGT(matchedTarGTs)
                TarGTs[np.where(TarGTs == 2)[0]] = 4
                genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
                genP1no = len(np.where(genP1 == 0)[0])
                numCompared = len(genP1)
                if numCompared > 0:
                    # One-sided tests: are at least 80% of the SNPs
                    # parent-1-like, or at least 80% not?
                    # NOTE(review): scipy.stats.binom_test was removed in
                    # SciPy 1.12; binomtest(...).pvalue replaces it.
                    pValP1 = st.binom_test(genP1no,
                                           numCompared,
                                           0.8,
                                           alternative="greater")
                    pValP2 = st.binom_test(numCompared - genP1no,
                                           numCompared,
                                           0.8,
                                           alternative="greater")
                    fracP1 = float(genP1no) / numCompared
                    if pValP1 < 0.05:
                        outfile.write("%s\t%s\t%s\t0\t%s\n" %
                                      (i + 1, genP1no, numCompared, pValP1))
                    elif pValP2 < 0.05:
                        outfile.write("%s\t%s\t%s\t1\t%s\n" %
                                      (i + 1, genP1no, numCompared, pValP2))
                    elif fracP1 >= 0.8 or fracP1 <= 0.2:
                        # Lopsided but not significant: no call.
                        outfile.write("%s\t%s\t%s\tNA\tNA\n" %
                                      (i + 1, genP1no, numCompared))
                    else:
                        outfile.write("%s\t%s\t%s\t0.5\tNA\n" %
                                      (i + 1, genP1no, numCompared))
                else:
                    outfile.write("%s\t%s\t%s\tNA\tNA\n" %
                                  (i + 1, genP1no, numCompared))
            except Exception:
                outfile.write("%s\tNA\tNA\tNA\tNA\n" % (i + 1))
            if i % 10 == 0:
                log.info("progress: %s windows", i + 10)
    log.info("done!")
Example #11
0
def geno():
    """Load the genotype data file that lives directly under ``resource_path``."""
    hdf5_path = '%s/all_chromosomes_binary.hdf5' % resource_path
    return genotype.load_hdf5_genotype_data(hdf5_path)
Example #12
0
def main():
    """Run the LMM association test for one slice of genes.

    Expects exactly seven command-line arguments, in order: genotype HDF5
    file, kinship HDF5 file, tab-separated RNA table (genes x accessions),
    tab-separated covariate table, first gene index, end gene index
    (exclusive, used as a Python slice bound), output file.  The log file
    name is the output name plus ``.log``.

    Accession names in the genotype and kinship files are prefixed with
    ``'X'`` before they are matched against the RNA table's column names.
    """
    (geno_hdf5, kinship_hdf5, RNA_csv, COV_file, RNA_start, RNA_end,
     out_file) = sys.argv[1:]
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    logger = LoggerFactory.get_logger(out_file + '.log',
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])

    def match(wanted, pool):
        """Return, per item of ``wanted``, its first index in ``pool`` or None."""
        # One dictionary pass instead of a linear ``.index`` scan per item.
        first_ix = {}
        for ix, name in enumerate(pool):
            first_ix.setdefault(name, ix)
        return [first_ix.get(name) for name in wanted]

    ## Import genotype data
    logger.info('Start reading SNP from %s', geno_hdf5)
    geno = genotype.load_hdf5_genotype_data(geno_hdf5)
    SNP_accx = ['X' + acc for acc in geno.accessions]
    logger.info('Finished reading SNP from %s', geno_hdf5)

    ## Import phenotype data
    # This message used to say "Finished" before the file had been read.
    logger.info('Start reading RNA from %s', RNA_csv)
    RNA_df = pd.read_csv(RNA_csv, sep='\t', header=0,
                         index_col=0)  # genes x accessions
    RNA_accx = list(RNA_df.columns.values)
    RNA_genes = list(RNA_df.index)
    logger.info('Finished reading RNA from %s', RNA_csv)

    ## get common accessions in the same order for genotype, phenotype and phenotype covariates
    logger.info('Consolidate accessions from genotype and RNA file')
    accx_common = [accx for accx in SNP_accx if accx in RNA_accx]
    # Column selection plus ``.values`` replaces DataFrame.as_matrix, which
    # has been removed from pandas.
    RNA = RNA_df[accx_common].values.T  # accession x genes
    geno.filter_accessions_ix(match(accx_common, SNP_accx))
    logger.info(
        'Number of accessions: genotype file %d, RNA file %d, common %d',
        len(SNP_accx), len(RNA_accx), len(accx_common))

    logger.info('Start building SNP matrix in memory')
    snps = np.vstack(geno.get_snps_iterator(is_chunked=True))
    snps = snps.T.astype(int)
    logger.info('Finished')

    logger.info('Start loading kinship matrix from %s', kinship_hdf5)
    load_k = kinship.load_kinship_from_file(kinship_hdf5, scaled=False)
    K0 = load_k['k'].astype(float)
    K_accx = ['X' + acc for acc in load_k['accessions']]
    # Reorder the kinship rows and columns to the common accession order.
    K_ix = match(accx_common, K_accx)
    K = K0[np.ix_(K_ix, K_ix)]
    logger.info('Finished')

    logger.info('Start loading covariance from %s', COV_file)
    COV_df = pd.read_csv(COV_file, sep='\t', header=0,
                         index_col=0)  # cov x accessions
    # Label-based row selection; ``.loc``/``.values`` replace the removed
    # ``.ix``/``as_matrix``.
    COV = COV_df.loc[accx_common].values
    logger.info('Finished')

    logger.info('Start association testing: RNA start %d, RNA end %d',
                RNA_start, RNA_end)

    run_lmm_chunk(snps, RNA, COV, RNA_start, RNA_end,
                  list(RNA_genes[RNA_start:RNA_end]), K, out_file)
Example #13
0
def main():
    """Run the marginal LMM test over the first 5000 genes for covariate set k=4.

    All inputs and outputs are hard-coded paths below.  Steps: load the
    imputed SNP matrix, load the expression table, restrict both to their
    common accessions, filter SNPs by MAF (0.01 to 0.5), obtain the kinship
    matrix (computed and saved only when ``'calc_kinship'`` is in
    ``step_list``, otherwise loaded from ``kinship_maf1.hdf5``), then call
    ``run_lmm`` once per gene, writing ``<gene>.csv`` under
    ``<out_dir>/gNorm_k<k>``.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    # create console handler and set level to debug
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # Optional steps to run; add 'calc_kinship' to recompute the kinship.
    step_list = [""]
    # Upper bound on the number of genes tested per covariate set.
    max_genes = 5000

    def match(wanted, pool):
        """Return, per item of ``wanted``, its first index in ``pool`` or None."""
        first_ix = {}
        for ix, name in enumerate(pool):
            first_ix.setdefault(name, ix)
        return [first_ix.get(name) for name in wanted]

    ## Output directory
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/marginal_test_cov_01'
    logger.info('Out dir %s', out_dir)

    ## Import genotype data
    SNP_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/SNP_matrix_imputed_hdf5/1001_SNP_MATRIX/imputed_snps_binary.hdf5'
    geno = genotype.load_hdf5_genotype_data(SNP_file)
    SNP_accx = ['X' + acc for acc in geno.accessions]
    logger.info('Finished reading SNP from %s', SNP_file)

    ## Import phenotype data
    RNA_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-filtered01.txt'
    RNA_df = pd.read_csv(RNA_file, sep='\t', header=0,
                         index_col=0)  # genes x accessions
    RNA_accx = list(RNA_df.columns.values)
    RNA_genes = list(RNA_df.index)
    logger.info('Finished reading RNA from %s', RNA_file)

    ## get common accessions in the same order for genotype, phenotype and phenotype covariates
    accx_common = [accx for accx in SNP_accx if accx in RNA_accx]
    # Column selection plus ``.values`` replaces DataFrame.as_matrix, which
    # has been removed from pandas.
    RNA = RNA_df[accx_common].values.T  # accession x genes

    ## filtering
    geno.filter_accessions_ix(match(accx_common, SNP_accx))
    filter_maf_snps(geno, 0.01, 0.5)

    snps = np.vstack(geno.get_snps_iterator(is_chunked=True))
    snps = snps.T.astype(int)
    logger.info('Finished filtering SNPs')

    kinship_file = os.path.join(out_dir, 'kinship_maf1.hdf5')
    if ('calc_kinship' in step_list):
        K = geno.get_ibd_kinship_matrix()
        scaledK = kinship.scale_k(K).astype(float)
        # The unscaled matrix is what gets stored; scaling is redone on load.
        kinship.save_kinship_to_file(kinship_file, K, geno.accessions,
                                     geno.num_snps)
    else:
        load_k = kinship.load_kinship_from_file(kinship_file, scaled=False)
        K = load_k['k'].astype(float)
        scaledK = kinship.scale_k(K)
    logger.info('Done with kinship file')

    logger.info('Start testing')
    for k in range(4, 5):

        ## Import phenotype covariates
        COV_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-gNorm_W_k%d.txt' % (
            k)
        COV_df = pd.read_csv(COV_file, sep='\t', header=0,
                             index_col=0)  # cov x accessions
        # Label-based row selection; ``.loc``/``.values`` replace the removed
        # ``.ix``/``as_matrix``.
        COV = COV_df.loc[accx_common].values
        logger.info('Cov file %s', COV_file)

        out_dir_k = os.path.join(out_dir, 'gNorm_k%d' % k)
        if not os.path.exists(out_dir_k):
            os.makedirs(out_dir_k)
        # Stop at the last available gene: a fixed range(0, 5000) raised
        # IndexError for tables with fewer genes.
        for i in range(0, min(max_genes, len(RNA_genes))):
            logger.debug('Gene %d', i)
            run_lmm(snps, RNA, COV, i, scaledK,
                    os.path.join(out_dir_k, '%s.csv' % RNA_genes[i]))
from pygwas.core import genotype


#__________________________________________
# Command-line interface: every option is a string-valued path.
inOptions = OptionParser()
inOptions.add_option("-p", "--pos_file", dest="posFile", help="Position file removing the header from VCF", type="string")
inOptions.add_option("-t", "--file_num_snps", dest="file_num_snps", help="Output from the CalculateSNPseachAcc.py script", type="string")
inOptions.add_option("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file", type="string")
inOptions.add_option("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file", type="string")
inOptions.add_option("-o", "--output", dest="outFile", help="Output file with the probability scores", type="string")
inOptions.add_option("-r", "--refScore", dest="refScore", help="Output for refined score", type="string")

(options, args) = inOptions.parse_args()


# Load both SNP databases named on the command line.
GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File)
GenotypeData_acc = genotype.load_hdf5_genotype_data(options.hdf5accFile)

# Read the header-less position table: column 0 is compared with the
# chromosome number and column 1 with the database positions below.
targetSNPs = pandas.read_table(options.posFile, header=None)
NumSNPs = len(targetSNPs)
# One score slot per database accession.
ScoreList = numpy.zeros(len(GenotypeData.accessions))
# Database row indices of all matched SNPs, collected across chromosomes.
TotMatchedSNPind = numpy.zeros(0, dtype="uint32")
NumMatSNPs = 0
# NOTE(review): chromosomes are hard-coded as 1..5.
for i in range(1,6):
  # Target positions on chromosome i.
  perchrtarSNPpos = targetSNPs[1][numpy.where(targetSNPs[0] == i)[0]]
  # chr_regions[i-1] gives the [start, end) slice of this chromosome in the
  # database position array.
  start = GenotypeData.chr_regions[i-1][0]
  end = GenotypeData.chr_regions[i-1][1]
  chrpositions = GenotypeData.positions[start:end]
  # Database positions that are also target positions, as global row indices.
  matchedSNPind = numpy.where(numpy.in1d(chrpositions, perchrtarSNPpos))[0] + start
  TotMatchedSNPind = numpy.append(TotMatchedSNPind, matchedSNPind)
Example #15
0
def _alignParentGTs(unionPositions, parentPositions, parentGTs):
    """Spread one parent's genotypes over the union position grid.

    Returns an int8 array with one entry per ``unionPositions`` element: the
    parsed genotype where the parent has a call at that position, -1
    elsewhere.  ``unionPositions`` must be sorted and contain every entry of
    ``parentPositions`` (true for ``np.union1d`` output).
    """
    aligned = np.repeat(-1, len(unionPositions)).astype('int8')
    if len(parentPositions) > 0:
        # Every parent position exists in the sorted union, so searchsorted
        # returns its exact slot even if the parent's calls are unsorted.
        slots = np.searchsorted(unionPositions, parentPositions)
        aligned[slots] = parsers.parseGT(parentGTs)
    return aligned


def crossGenotyper(args):
    """Collect both parents' genotypes and run ``crossGenotypeWindows``.

    Inputs, all taken from the ``args`` mapping:
      * ``inFile``      - SNP calls of the cross (e.g. a GATK VCF)
      * ``parents``     - with ``father`` set: VCF/BED file of parent 1;
                          otherwise ``"<parent1>x<parent2>"`` accession names
      * ``father``      - VCF/BED file of parent 2, or None
      * ``hdf5accFile`` - SNP matrix (HDF5 file), required when ``father``
                          is None
      * ``binLen``, ``outFile``, ``logDebug`` - forwarded unchanged

    In file mode the two parents are merged per chromosome onto the union of
    their positions; a parent without a call at a position gets -1 there.
    """
    log.info("loading genotype data for parents")
    if args['father'] is not None:
        log.info("input files: %s and %s" % (args['parents'], args['father']))
        # Both files are required.  The former ``not A and B`` only fired
        # when the first file was missing while the second existed.
        if not (os.path.isfile(args['parents'])
                and os.path.isfile(args['father'])):
            die("either of the input files do not exists, please provide VCF/BED file for parent genotype information"
                )
        (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI,
         p1DPmean) = parsers.parseInput(inFile=args['parents'],
                                        logDebug=args['logDebug'])
        (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI,
         p2DPmean) = parsers.parseInput(inFile=args['father'],
                                        logDebug=args['logDebug'])
        commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
        commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
        commonSNPsPOS = np.zeros(0, dtype=int)
        snpsP1 = np.zeros(0, dtype='int8')
        snpsP2 = np.zeros(0, dtype='int8')
        for i in commonCHRs_ids:
            perchrP1inds = np.where(p1snpCHR == i)[0]
            perchrP2inds = np.where(p2snpCHR == i)[0]
            perchrPositions = np.union1d(p1snpPOS[perchrP1inds],
                                         p2snpPOS[perchrP2inds])
            commonSNPsCHR = np.append(commonSNPsCHR,
                                      np.repeat(i, len(perchrPositions)))
            commonSNPsPOS = np.append(commonSNPsPOS, perchrPositions)
            # Keep snpsP1/snpsP2 aligned with commonSNPsPOS.  Previously the
            # -1-filled per-chromosome arrays were built but never used, and
            # the genotypes were read with per-chromosome offsets applied to
            # the whole-genome arrays.
            snpsP1 = np.append(
                snpsP1,
                _alignParentGTs(perchrPositions, p1snpPOS[perchrP1inds],
                                p1snpGT[perchrP1inds]))
            snpsP2 = np.append(
                snpsP2,
                _alignParentGTs(perchrPositions, p2snpPOS[perchrP2inds],
                                p2snpGT[perchrP2inds]))
        log.info("done!")
    else:
        parents = args['parents']
        ## need to filter the SNPs present in C and M
        if not args['hdf5accFile']:
            snpmatch.die("needed a HDF5 genotype file and not specified")
        log.info("loading HDF5 file")
        g_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
        ## die if either parents are not in the dataset
        # Only the failures a bad ``parents`` value produces are caught: no
        # match or no "x" separator (IndexError), or a non-string value
        # (AttributeError).
        try:
            indP1 = np.where(g_acc.accessions == parents.split("x")[0])[0][0]
            indP2 = np.where(g_acc.accessions == parents.split("x")[1])[0][0]
        except (IndexError, AttributeError):
            snpmatch.die("parents are not in the dataset")
        snpsP1 = g_acc.snps[:, indP1]
        snpsP2 = g_acc.snps[:, indP2]
        commonSNPsCHR = np.array(g_acc.chromosomes)
        commonSNPsPOS = np.array(g_acc.positions)
        log.info("done!")
    log.info("running cross genotyper")
    crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2,
                         args['inFile'], args['binLen'], args['outFile'],
                         args['logDebug'])
#!/usr/bin/python
import sys
#These are the modules that are needed for this script
# module load numpy
# module use /net/gmi.oeaw.ac.at/software/shared/nordborg_common/modulefiles/
# module load pygwas

import numpy
from pygwas.core import genotype

# Path of the HDF5 SNP matrix, given as the only command-line argument.
hdf5file = sys.argv[1]

GenotypeData = genotype.load_hdf5_genotype_data(hdf5file)

# Print, for every accession, the number of SNPs whose value is 1.
# Takes a really long time: each accession reads a full column of the matrix.
# Alternative counts tried earlier: all non-zero entries
# (numpy.count_nonzero), or entries equal to 1 or -1.
for i, accession in enumerate(GenotypeData.accessions):
    outScore = len(numpy.where(GenotypeData.snps[:, i] == 1)[0])
    # sys.stdout.write works on Python 2 and 3; the former ``print a, "\t", b``
    # statement is a SyntaxError on Python 3.  The format keeps the old
    # output: Python 2's print put a space after the accession and none after
    # the tab.
    sys.stdout.write("%s \t%s\n" % (accession, outScore))
Example #17
0
def geno():
    """Return the genotype data read from ``resource_path``'s binary HDF5 file."""
    genotype_file = '%s/all_chromosomes_binary.hdf5' % resource_path
    loaded = genotype.load_hdf5_genotype_data(genotype_file)
    return loaded