def test_roh_mhmm_100pct(self):
    """All-zero genotype calls: expect a single run covering positions 1-100."""
    positions = [1, 10, 50, 100]
    genotypes = np.zeros((4, 2), dtype=np.int16)

    result = allel.roh_mhmm(genotypes, positions, contig_size=100)
    roh_table, roh_fraction = result

    # Columns of the expected row: start / stop / length / is_marginal.
    expected_rows = np.array([[1, 100, 100, True]], dtype=object)
    aeq(roh_table.values, expected_rows)
    assert roh_fraction == 1.0
def test_roh_mhmm_0pct(self):
    """One non-zero allele at the third variant: expect no runs and fraction 0."""
    positions = [1, 10, 50, 100]
    genotypes = np.zeros((4, 2), dtype=np.int16)
    # Set a single allele of the third variant to 1 so the two alleles differ.
    genotypes[2, 0] = 1

    result = allel.roh_mhmm(genotypes, positions, contig_size=100)
    roh_table, roh_fraction = result

    n_runs = roh_table.shape[0]
    assert n_runs == 0
    assert roh_fraction == 0.0
def _roh_to_dict(roh_df, fraction):
    """Convert one sample's ``allel.roh_mhmm`` result into a JSON-friendly dict.

    Args:
        roh_df: Table of runs with at least the columns ``start``, ``stop``
            and ``is_marginal`` (first element of the ``roh_mhmm`` result).
        fraction: Second element of the ``roh_mhmm`` result.

    Returns:
        A dict holding ``fraction`` under the key ``'confidence'`` plus one
        entry per run, keyed by the run's row number, each with ``start``,
        ``stop`` and ``is_marginal``.
    """
    # NOTE(review): scikit-allel documents the second return value as the
    # fraction of the contig in ROH, so the key name 'confidence' may be
    # misleading. It is kept unchanged so the output schema stays the same.
    summary = {'confidence': float(fraction)}
    columns = zip(roh_df['start'], roh_df['stop'], roh_df['is_marginal'])
    for j, (start, stop, is_marginal) in enumerate(columns):
        # Cast to built-in types: numpy scalars are not JSON serializable.
        summary[j] = {
            'start': int(start),
            'stop': int(stop),
            'is_marginal': bool(is_marginal),
        }
    return summary


def main():
    """Find runs of homozygosity for every sample of a VCF and dump them as JSON.

    Command line: ``<script> vcf-file-name output-file``. With any other number
    of arguments the usage string is printed and the process exits with
    status 1.

    The output file contains one entry per sample that has at least one run,
    keyed by the sample's index in the VCF (``json.dump`` writes the integer
    keys as strings). Samples without any run are omitted.
    """
    if len(sys.argv) != 3:
        print('vcf-file-name output-file')
        sys.exit(1)
    vcf_file_name, output_file = sys.argv[1:3]

    logger.debug('reading %s', vcf_file_name)
    vcf = allel.read_vcf(vcf_file_name, fields=[
        'samples', 'calldata/GT', 'variants/ALT', 'variants/CHROM',
        'variants/FILTER_PASS', 'variants/ID', 'variants/POS',
        'variants/QUAL', 'variants/REF', 'variants/INFO',
    ])

    logger.debug('reading data into genotype array')
    geno_array = allel.GenotypeArray(vcf['calldata/GT'])

    logger.debug('finding runs of homozygosity')
    num_variants = len(geno_array)
    num_samples = len(vcf['samples'])
    # NOTE(review): variant indices (0..n-1) are passed as positions, so the
    # reported start/stop values are in index units; 'variants/POS' is loaded
    # above but never used. Confirm this is intended.
    pos_array = np.arange(num_variants)
    # Every position is treated as accessible.
    is_accessible = np.ones(num_variants, dtype=bool)

    runs_of_homozygosity = {}
    for i in range(num_samples):
        result = allel.roh_mhmm(geno_array[:, i], pos_array,
                                is_accessible=is_accessible)
        logger.debug('roh = %s', result)
        roh_df, fraction = result
        if roh_df.empty:
            continue
        runs_of_homozygosity[i] = _roh_to_dict(roh_df, fraction)

    logger.debug('writing to %s', output_file)
    # The context manager closes the file; no explicit close() is needed.
    with open(output_file, 'w') as f:
        json.dump(runs_of_homozygosity, f)