def __call__(self, row: str) -> gencodeContainer:
    """Parse one tab-delimited row into a ``gencodeContainer``.

    Column 0 has every "chr" substring removed, columns 3 and 4 are cast to
    int, and column 6 is cut at its first "." (presumably dropping a version
    suffix -- confirm against the input format). All columns are then passed
    positionally to ``gencodeContainer``.
    """
    columns: ty.List[ty.Any] = row.split("\t")
    columns[0] = columns[0].replace("chr", "")
    for idx in (3, 4):
        columns[idx] = int(columns[idx])
    columns[6] = columns[6].partition(".")[0]
    return gencodeContainer(*columns)
def test_int_returns_int_subclass(self): class BadInt: def __int__(self): return True class BadInt2(builtins.int): def __int__(self): return True # class TruncReturnsBadInt: # def __trunc__(self): # return BadInt() # # class TruncReturnsIntSubclass: # def __trunc__(self): # return True bad_int = BadInt() with self.assertWarns(DeprecationWarning): n = int(bad_int) self.assertEqual(n, 1) bad_int = BadInt2() with self.assertWarns(DeprecationWarning): n = int(bad_int) self.assertEqual(n, 1)
def __call__(self, row: str) -> CIContainer:
    """Parse one tab-delimited credible-set row into a ``CIContainer``.

    Columns in the raw credible_sets data:
      phenotype_id: this corresponds to genes in gene expression data
                    and both gene and credible interval in Txrevise data
      variant_id: in chrom_pos_ref_alt format; we don't use this
      chr
      pos
      ref
      alt
      cs_id: this is simply {phenotype_id}_{cs_index}
      cs_index: credible set label, either L1 or L2
      finemapped_region: a range for region tested, in chrom:start-end format
      pip: generated using SuSie
      z: z-score
      cs_min_r2
      cs_avg_r2
      cs_size: credible set size, i.e. the number of variants contained in
               this credible set
      posterior_mean: posterior effect size
      posterior_sd: posterior standard deviation
      cs_log10bf: log10 of the Bayes Factor for this credible set
    Extra columns added by joining main QTL data:
      ma_samples, maf, pvalue, beta, se, type, ac, an, r2,
      mol_trait_obj_id, gid, median_tpm, rsid

    The index numbers used below count from the study column (index 0).
    """
    columns: ty.List[ty.Any] = row.split("\t")
    if self.study and self.tissue:
        # Tissue-and-study-specific files lack the study and tissue columns;
        # prepend them so indices line up with the all-tissue file.
        columns = [self.study, self.tissue, *columns]

    # (index, converter) pairs, applied in this exact order.
    core_casts = (
        (5, int),     # pos
        (11, float),  # pip
        (12, float),  # z
        (13, float),  # cs_min_r2
        (14, float),  # cs_avg_r2
        (15, int),    # cs_size
        (16, float),  # posterior_mean
        (17, float),  # posterior_sd
        (18, float),  # cs_log10bf
    )
    joined_casts = (
        (19, int),    # ma_samples
        (20, float),  # maf
        (21, float),  # pvalue
        (22, float),  # beta
        (23, float),  # se
        (25, int),    # ac
        (26, int),    # an
        (30, float),  # median_tpm
    )

    for idx, cast in core_casts:
        columns[idx] = cast(columns[idx])

    # Extra fields are only present in the joined file.
    if len(columns) > 19:
        for idx, cast in joined_casts:
            columns[idx] = cast(columns[idx])

    return CIContainer(*columns)
def test_keyword_args(self): # Test invoking int() using keyword arguments. self.assertEqual(int(x=1.2), 1) self.assertEqual(int('100', base=2), 4) self.assertEqual(int(x='100', base=2), 4) self.assertRaises(TypeError, int, base=10) self.assertRaises(TypeError, int, base=0)
def test_intconversion(self): # Test __int__() class ClassicMissingMethods: pass self.assertRaises(TypeError, int, ClassicMissingMethods()) class MissingMethods(object): pass self.assertRaises(TypeError, int, MissingMethods()) class Foo0: def __int__(self): return 42 self.assertEqual(int(Foo0()), 42) class Classic: pass for base in (object, Classic): class IntOverridesTrunc(base): def __int__(self): return 42 def __trunc__(self): return -12 self.assertEqual(int(IntOverridesTrunc()), 42)
def test_int_returns_int_subclass(self): class BadInt: def __int__(self): return True class BadInt2(builtins.int): def __int__(self): return True # class TruncReturnsBadInt: # def __trunc__(self): # return BadInt() # # class TruncReturnsIntSubclass: # def __trunc__(self): # return True bad_int = BadInt() with self.assertWarns(DeprecationWarning): n = int(bad_int) self.assertEqual(n, 1) self.assertIs(type(n), builtins.int) bad_int = BadInt2() with self.assertWarns(DeprecationWarning): n = int(bad_int) self.assertEqual(n, 1) self.assertIs(type(n), builtins.int)
def test_int_buffer(self):
    """int() of a two-byte buffer window starting at offset 1 parses as 23."""
    # Only the tail after the window differs between payloads.
    payloads = ('123', '123\x00', '123 ', '123A', '1234')
    with test_support.check_py3k_warnings():
        for payload in payloads:
            window = buffer(payload, 1, 2)
            self.assertEqual(int(window), 23)
def check(s, base=None):
    """Assert that int(s[, base]) raises ValueError with the standard text.

    ``self`` comes from the enclosing scope. When ``base`` is None, int()
    is called with one argument and the message is expected to name base 10.
    """
    call_args = (s,) if base is None else (s, base)
    reported_base = 10 if base is None else base
    with self.assertRaises(ValueError,
                           msg="int(%r, %r)" % (s, base)) as cm:
        int(*call_args)
    expected = "invalid literal for int() with base %d: %r" % (reported_base, s)
    self.assertEqual(cm.exception.args[0], expected)
def line_parser(row) -> ty.Tuple[str, int, str, str, int]:
    """Parse one whitespace-delimited row (new dbSNP format, builds 152+).

    Returns ``(chrom, pos, ref, alt, rsid)``. The first column is a RefSeq
    id with a version; the part before the first "." is looked up in
    ``VERSIONLESS_CHROMS`` to get a human-readable chromosome name. The rsid
    is taken from the INFO column (index 7) via ``RSID_CAPTURE`` rather than
    from the id column, in case the id column is ambiguous.
    """
    columns = row.split()
    accession = columns[0].partition('.')[0]
    chrom = VERSIONLESS_CHROMS[accession]
    pos = int(columns[1])
    ref, alt = columns[3], columns[4]
    info_match = RSID_CAPTURE.search(columns[7])
    rsid = int(info_match.group(1))
    return chrom, pos, ref, alt, rsid
def test_int_base_indexable(self): class MyIndexable(object): def __init__(self, value): self.value = value def __index__(self): return self.value # Check out of range bases. for base in 2**100, -2**100, 1, 37: with self.assertRaises(ValueError): int('43', base) # Check in-range bases. self.assertEqual(int('101', base=MyIndexable(2)), 5) self.assertEqual(int('101', base=MyIndexable(10)), 101) self.assertEqual(int('101', base=MyIndexable(36)), 1 + 36**2)
def test_non_numeric_input_types(self): # Test possible non-numeric types for the argument x, including # subclasses of the explicitly documented accepted types. class CustomStr(str): pass class CustomBytes(bytes): pass class CustomByteArray(bytearray): pass values = [b'100', bytearray(b'100'), CustomStr('100'), CustomBytes(b'100'), CustomByteArray(b'100')] for x in values: msg = 'x has type %s' % type(x).__name__ self.assertEqual(int(x), 100, msg=msg) self.assertEqual(int(x, 2), 4, msg=msg)
def _import_reference_thread_tped(self, i):
    """Import chromosome ``i`` of a gzipped .tped reference into a snpdb.

    Reads ``<refData>.chr<i>.tped.gz`` and, for each row whose id (column 1)
    starts with "rs" and whose genotype vector is not constant, inserts
    ``{pos: [rsid, MAF rounded to 3 places, genotype]}`` into the db opened
    at ``<refData>.chr<i>``. Always returns True.
    """
    prefix = self._refData + '.chr' + str(i)
    with gzip.open(prefix + '.tped.gz', 'rt') as f:
        db = snpdb.db()
        db.open(prefix)

        for line in f:
            cols = line.split()  # [chr, rid, irrelevant, pos, genotype...]
            if cols[1][0:2] != 'rs':
                continue

            # Shift the allele codes down by one, then sum consecutive pairs.
            dephased = np.array(cols[4:], dtype='b') - 1
            n_pairs = len(dephased) // 2
            genotype = np.sum(dephased.reshape((n_pairs, 2)), axis=1)

            # PLINK uses 1 for minor allele -> Convert to minor allele count
            # (swap 0 and 2, using -1 as a temporary marker).
            genotype[genotype == 2] = -1
            genotype[genotype == 0] = 2
            genotype[genotype == -1] = 0

            mean = np.mean(genotype)
            if np.std(genotype) == 0:
                continue  # constant genotype vector: nothing to store

            # Compute MAF
            half_mean = mean / 2.
            MAF = 1.0 - half_mean if half_mean > 0.5 else half_mean

            # Store
            db.insert({int(cols[3]): [cols[1], round(MAF, 3), genotype]})

        db.close()
    return True
def test_non_numeric_input_types(self): # Test possible non-numeric types for the argument x, including # subclasses of the explicitly documented accepted types. class CustomStr(str): pass class CustomBytes(bytes): pass class CustomByteArray(bytearray): pass factories = [ bytes, bytearray, lambda b: CustomStr(b.decode()), CustomBytes, CustomByteArray, memoryview, ] try: from array import array except ImportError: pass else: factories.append(lambda b: array('B', b)) for f in factories: x = f(b'100') with self.subTest(type(x)): self.assertEqual(int(x), 100) if isinstance(x, (str, bytes, bytearray)): self.assertEqual(int(x, 2), 4) else: msg = "can't convert non-string" with self.assertRaisesRegex(TypeError, msg): int(x, 2) with self.assertRaisesRegex(ValueError, 'invalid literal'): int(f(b'A' * 0x10))
def test_valid_non_numeric_input_types_for_x(self):
    # Test possible valid non-numeric types for x, including subclasses
    # of the allowed built-in types.
    #
    # NOTE: this block is Python 2 code (buffer, unicode, basestring,
    # "except TypeError, err", assertRaisesRegexp).
    class CustomStr(str):
        pass

    class CustomByteArray(bytearray):
        pass

    factories = [str, bytearray, CustomStr, CustomByteArray, buffer]

    if have_unicode:
        class CustomUnicode(unicode):
            pass
        factories += [unicode, CustomUnicode]

    for f in factories:
        # Constructing the value is wrapped in the py3k-warning context
        # (quiet=True: no warning is required to appear).
        with test_support.check_py3k_warnings(quiet=True):
            x = f('100')
        msg = 'x has value %s and type %s' % (x, type(x).__name__)
        try:
            self.assertEqual(int(x), 100, msg=msg)
            # The explicit-base form is only asserted for real strings.
            if isinstance(x, basestring):
                self.assertEqual(int(x, 2), 4, msg=msg)
        except TypeError, err:
            # Re-raise as a failure that names the offending type.
            raise AssertionError('For %s got TypeError: %s' %
                                 (type(x).__name__, err))
        if not isinstance(x, basestring):
            # Non-string inputs must be rejected when a base is given.
            errmsg = "can't convert non-string"
            with self.assertRaisesRegexp(TypeError, errmsg, msg=msg):
                int(x, 2)
        # Sixteen 'A' characters are not a valid base-10 literal.
        errmsg = 'invalid literal'
        with self.assertRaisesRegexp(ValueError, errmsg, msg=msg), \
             test_support.check_py3k_warnings(quiet=True):
            int(f('A' * 0x10))
def test_int_subclass_with_int(self): class MyInt(builtins.int): def __int__(self): return 42 class BadInt(builtins.int): def __int__(self): return 42.0 my_int = MyInt(7) self.assertEqual(my_int, 7) self.assertEqual(int(my_int), 42) self.assertRaises(TypeError, int, BadInt())
def test_int_returns_int_subclass(self): class BadInt: def __int__(self): return True class BadInt2(builtins.int): def __int__(self): return True # class TruncReturnsBadInt: # def __trunc__(self): # return BadInt() # # class TruncReturnsIntSubclass: # def __trunc__(self): # return True bad_int = BadInt() n = int(bad_int) self.assertEqual(n, 1) bad_int = BadInt2() n = int(bad_int) self.assertEqual(n, 1)
def test_non_numeric_input_types(self): # Test possible non-numeric types for the argument x, including # subclasses of the explicitly documented accepted types. class CustomStr(str): pass class CustomBytes(bytes): pass class CustomByteArray(bytearray): pass values = [ b'100', bytearray(b'100'), CustomStr('100'), CustomBytes(b'100'), CustomByteArray(b'100') ] for x in values: msg = 'x has type %s' % type(x).__name__ self.assertEqual(int(x), 100, msg=msg) self.assertEqual(int(x, 2), 4, msg=msg)
def variant_parser(row: str) -> VariantContainer:
    """Turn one tab-delimited text row into a ``VariantContainer``.

    This parser is the piece tied to the file format: it locates the columns
    and converts text to numeric data where appropriate, so it must change
    if the file format changes. It could accept configuration in the future
    (e.g. different column numbers for differently arranged files).

    Exactly three columns are cleaned up: "chr" is removed from column 0
    (chrom), column 1 (pos) becomes an int and column 10 (pvalue_nominal)
    becomes a float. Every column is then passed positionally to the
    container.
    """
    columns = row.split('\t')

    # (index, converter) pairs, applied in order. Revise if the data
    # format changes!
    converters = (
        (0, lambda text: text.replace('chr', '')),  # chrom
        (1, int),                                   # pos
        (10, float),                                # pvalue_nominal
    )
    for index, convert in converters:
        columns[index] = convert(columns[index])

    return VariantContainer(*columns)
def test_intconversion(self):
    # Test __int__()
    #
    # NOTE: this block is Python 2 code (__builtin__, 42L literals, and
    # classes declared without a base).
    class ClassicMissingMethods:
        pass
    # A class declared without a base and lacking __int__ is expected to
    # fail with AttributeError here ...
    self.assertRaises(AttributeError, int, ClassicMissingMethods())

    class MissingMethods(object):
        pass
    # ... while an object-derived class is expected to fail with TypeError.
    self.assertRaises(TypeError, int, MissingMethods())

    # Foo0..Foo5 vary the base class and the type returned by __int__.
    class Foo0:
        def __int__(self):
            return 42

    class Foo1(object):
        def __int__(self):
            return 42

    class Foo2(__builtin__.int):
        def __int__(self):
            return 42

    class Foo3(__builtin__.int):
        def __int__(self):
            return self

    class Foo4(__builtin__.int):
        def __int__(self):
            return 42L

    class Foo5(__builtin__.int):
        def __int__(self):
            return 42.

    self.assertEqual(int(Foo0()), 42)
    self.assertEqual(int(Foo1()), 42)
    self.assertEqual(int(Foo2()), 42)
    # Foo3 returns itself, an int subclass instance constructed as 0.
    self.assertEqual(int(Foo3()), 0)
    self.assertEqual(int(Foo4()), 42L)
    # A float returned from __int__ is rejected.
    self.assertRaises(TypeError, int, Foo5())

    class Classic:
        pass

    # __int__ wins over __trunc__ for both base classes.
    for base in (object, Classic):
        class IntOverridesTrunc(base):
            def __int__(self):
                return 42
            def __trunc__(self):
                return -12
        self.assertEqual(int(IntOverridesTrunc()), 42)
def test_underscores(self):
    """Underscore handling in int() for literals and explicit bases.

    Literals containing any of ".eEjJ" are skipped; the remaining ones from
    the shared VALID/INVALID lists are checked with base 0.
    """
    skip_marks = set('.eEjJ')

    def integer_only(literals):
        return [lit for lit in literals if not skip_marks.intersection(lit)]

    for lit in integer_only(VALID_UNDERSCORE_LITERALS):
        parsed = int(lit, 0)
        # eval() is applied to the test suite's own literal list only.
        self.assertEqual(parsed, eval(lit))
        self.assertEqual(int(lit, 0), int(lit.replace('_', ''), 0))

    for lit in integer_only(INVALID_UNDERSCORE_LITERALS):
        with self.assertRaises(ValueError):
            int(lit, 0)

    # Additional test cases with bases != 0, only for the constructor:
    self.assertEqual(int("1_00", 3), 9)
    self.assertEqual(int("0_100"), 100)  # not valid as a literal!
    self.assertEqual(int(b"1_00"), 100)  # byte underscore
    for malformed in ("_100", "+_100", "1__00", "100_"):
        self.assertRaises(ValueError, int, malformed)
def test_int_base_limits(self): """Testing the supported limits of the int() base parameter.""" self.assertEqual(int('0', 5), 0) with self.assertRaises(ValueError): int('0', 1) with self.assertRaises(ValueError): int('0', 37) with self.assertRaises(ValueError): int('0', -909) # An old magic value base from Python 2. with self.assertRaises(ValueError): int('0', base=0-(2**234)) with self.assertRaises(ValueError): int('0', base=2**234) # Bases 2 through 36 are supported. for base in range(2,37): self.assertEqual(int('0', base=base), 0)
def test_int_memoryview(self): self.assertEqual(int(memoryview(b'123')[1:3]), 23) self.assertEqual(int(memoryview(b'123\x00')[1:3]), 23) self.assertEqual(int(memoryview(b'123 ')[1:3]), 23) self.assertEqual(int(memoryview(b'123A')[1:3]), 23) self.assertEqual(int(memoryview(b'1234')[1:3]), 23)
def test_small_ints(self):
    """Parsing '10' / '-1' yields objects identical to the literals."""
    cases = [('10', 10), ('-1', -1)]
    if have_unicode:
        cases += [(u'10', 10), (u'-1', -1)]
    for text, expected in cases:
        self.assertIs(int(text), expected)
def test_int_base_bad_types(self): """Not integer types are not valid bases; issue16772.""" with self.assertRaises(TypeError): int('0', 5.5) with self.assertRaises(TypeError): int('0', 5.0)
def test_no_args(self): self.assertEqual(int(), 0)
def _import_reference_thread_vcf(self, i, keepfile, qualityT, SNPonly):
    """Import chromosome ``i`` of a gzipped VCF reference into a snpdb.

    Parameters (as used by the code below):
      i         -- chromosome label; used to build
                   ``<refData>.chr<i>.vcf.gz`` and the db path
                   ``<refData>.chr<i>``
      keepfile  -- optional path; the first tab-separated column of every
                   line is a sample name to keep (None keeps all samples)
      qualityT  -- optional threshold applied to the QUAL column (index 5)
                   for rows whose FILTER column (index 6) is not 'PASS'
      SNPonly   -- if truthy, skip rows/alleles longer than one character

    For every retained non-reference allele one record
    ``{pos: [id, MAF, per-sample allele counts, alt allele, ref allele]}``
    is inserted. Always returns True.

    NOTE(review): the nesting below was reconstructed from
    whitespace-collapsed text -- confirm against the upstream file.
    """
    # Load filter info
    keep = set([])
    if keepfile is not None:
        f = open(keepfile, 'r')
        for line in f:
            # NOTE(review): the line is not stripped, so a single-column
            # keep file would store names with a trailing newline -- confirm.
            S = line.split("\t")[0]
            keep.add(S)
        f.close()

    sampleMap = {}

    # Load
    with gzip.open(self._refData + '.chr' + str(i) + '.vcf.gz', 'rt') as f:
        # Find header
        for line in f:
            # Detect infos and headers
            if line[:2] == "##":
                continue

            # Detect sample names
            if line[:2] == "#C":
                data = line.split("\t")
                tmp = data[9:]  # sample columns start at index 9
                for j in range(0, len(tmp)):
                    if (keepfile is None) or (tmp[j] in keep):
                        # Maps column offset -> sample name.
                        sampleMap[j] = tmp[j]
                # Stop the header scan; the same file iterator continues
                # with the data lines in the main loop below.
                break

        sampleKeys = list(sampleMap.keys())

        db = snpdb.db()
        db.open(self._refData + '.chr' + str(i))

        # Main data import loop
        for line in f:
            # Data line
            data = line.split("\t")

            # Get GT pos (index of 'GT' within the FORMAT column, index 8)
            tmp = data[8].split(":")
            GT = -1
            for j in range(0, len(tmp)):
                if tmp[j] == 'GT':
                    GT = j
                    break

            # Checks: skip rows without a GT entry, rows whose id does not
            # start with 'rs', and non-PASS rows below the quality threshold.
            # NOTE(review): int(data[5]) raises ValueError for a
            # non-integer QUAL string -- confirm inputs.
            if (GT == -1) or (data[2][:2] != 'rs') or (
                    data[6] != 'PASS' and qualityT is not None and
                    (int(data[5]) < qualityT)):
                continue

            # Read genotype
            genotypes = data[9:]

            # Infer alternate alleles (pos 0: ref allele)
            alleles = [data[3]]
            alleles.extend(data[4].split(","))

            if SNPonly and (len(data[3]) > 1):
                continue

            # Per-allele occurrence counter over the kept samples.
            counter = np.zeros(len(alleles), dtype='int')

            # Only read samples in sampleMap
            genomap = {}
            for j in range(0, len(sampleKeys)):
                geno = genotypes[sampleKeys[j]].split(":")[GT]

                # Ignore half-calls
                # NOTE(review): geno[0]/geno[2] read single characters at
                # fixed offsets, which presumes a three-character call such
                # as "0|1" -- confirm.
                if geno[0] != "." and geno[2] != ".":
                    counter[int(geno[0])] += 1
                    counter[int(geno[2])] += 1

                genomap[sampleKeys[j]] = geno

            # Reference allele
            refp = 0

            SC = np.argsort(counter)  # Sort alleles count

            for p in SC:
                if p != refp:
                    if SNPonly and len(alleles[p]) > 1:
                        continue

                    minp = str(p)
                    # Per-sample count (0, 1 or 2) of allele p.
                    gd = np.zeros(len(sampleKeys), dtype='B')

                    for j in range(0, len(sampleKeys)):
                        #geno = genotypes[sampleKeys[j]].split(":")[GT]
                        geno = genomap[sampleKeys[j]]

                        # Ignore half-calls
                        if geno[0] != '.' and geno[2] != '.':
                            if geno[0] == minp:
                                gd[j] += 1

                            if geno[2] == minp:
                                gd[j] += 1

                    # Compute MAF
                    MAF = np.mean(gd) / 2.

                    if (MAF > 0.5):
                        MAF = 1.0 - MAF

                    T = [data[2], MAF, gd, alleles[p], alleles[refp]]  # Stores alt and ref allele

                    # Records are keyed by position (column 1).
                    db.insert({int(data[1]): T})

        # Explicit close of the handle also managed by the enclosing with.
        f.close()
        db.close()

    return True
def process_file(CHR, file, snpdb):
    """Annotate a gzipped VCF with SNP ids taken from a gzipped SNP database.

    Args:
        CHR: chromosome label as it appears in column 0 of ``snpdb`` and as
            the prefix of the positional ids to be replaced ("<CHR>:...").
        file: path of the input VCF. The last seven characters are cut off
            (a ".vcf.gz" suffix is assumed) and ".snpid.vcf.gz" is appended
            to form the output path.
        snpdb: path of the gzipped, tab-separated database. The code assumes
            it is ordered by position within ``CHR``, as is ``file``.

    Header lines ("#...") are copied unchanged. Rows whose id already starts
    with "rs" are copied unchanged. Other rows are written only when the
    database holds a record at the same position with the same REF and an
    ALT list containing the row's ALT; the first "<CHR>:..." token in the
    row is then replaced by the database id. Two summary lines are printed.

    Compared with the previous implementation, reaching the end of the
    database (or finding no record for ``CHR``) no longer raises
    IndexError, and the id pattern is a raw string.
    """
    def read_record(handle):
        """Return the next db line split into at most 6 columns, or None at EOF."""
        raw = handle.readline()
        return raw.split('\t', 5) if raw else None

    def is_at(record, position):
        """True when ``record`` exists, lies on CHR and sits at ``position``."""
        return (record is not None and record[0] == CHR
                and int(record[1]) == position)

    c = 0  # rows written to the output (already-rs plus matched)
    t = 0  # data rows seen in the input
    id_pattern = re.compile(CHR + r":(\w|:)+")

    with gzip.open(file[:-7] + ".snpid.vcf.gz", 'wt') as h:
        with gzip.open(snpdb, 'rt') as g:
            # Search for the first database record on CHR.
            dbline = None
            for raw in g:
                if raw[0] != '#' and raw.split('\t')[0] == CHR:
                    dbline = raw.split('\t', 5)
                    break

            # Database records of the most recently matched position; kept
            # across rows so several input rows at one position share it.
            CACHE = None

            with gzip.open(file, 'rt') as f:
                for line in f:
                    if line[0] == '#':
                        # Copy header line to new file
                        h.write(line)
                        continue

                    line_split = line.split('\t', 5)
                    t += 1

                    if line_split[2][:2] == 'rs':
                        # Already carries an rs id: store line as is.
                        h.write(line)
                        c += 1
                        continue

                    ln = int(line_split[1])

                    # Seek line position in db
                    while (dbline is not None and dbline[0] == CHR
                           and int(dbline[1]) < ln):
                        dbline = read_record(g)

                    # Read all SNPs at same position
                    if is_at(dbline, ln):
                        CACHE = [dbline]
                        dbline = read_record(g)
                        while is_at(dbline, ln):
                            CACHE.append(dbline)
                            dbline = read_record(g)

                    if CACHE is not None and int(CACHE[0][1]) == ln:
                        R = line_split[3]
                        A = line_split[4]
                        for C in CACHE:
                            if R == C[3] and A in C[4].split(","):
                                # Store with the positional id replaced.
                                h.write(id_pattern.sub(C[2], line, count=1))
                                c += 1
                                break

    print("# SNPs in original ref data ( CHR", CHR, "):", t)
    print("# SNPs matched:", c)
def inner(line):
    """Parse one delimited text line into a ``container`` record.

    This is the closure that does the actual parsing; the column indices
    (``_chrom_col``, ``_pos_col``, ...), ``delimiter`` and the flags it
    reads come from the enclosing scope.

    Returns:
        ``container(chrom, pos, rsid, ref, alt, log_pval, beta,
        stderr_beta, alt_allele_freq)``.

    Raises:
        exceptions.LineParseException: for any failure while parsing; the
        offending line is attached and the original error is chained.
    """
    try:
        fields = line.strip().split(delimiter)
        if len(fields) == 1:
            raise exceptions.LineParseException(
                'Unable to split line into separate fields. This line may have a missing or incorrect delimiter.'
            )

        # Fetch values
        ref = None
        alt = None
        if _marker_col is not None:
            chrom, pos, ref, alt = utils.parse_marker(fields[_marker_col])
        else:
            chrom = fields[_chrom_col]
            pos = fields[_pos_col]

        if chrom.startswith('chr'):
            chrom = chrom[3:]

        chrom = chrom.upper()

        # Explicit columns will override a value from the marker, by design
        if _ref_col is not None:
            ref = fields[_ref_col]

        if _alt_col is not None:
            alt = fields[_alt_col]

        pval = fields[_pvalue_col]

        # Some optional fields
        rsid = None
        beta = None
        stderr_beta = None
        alt_allele_freq = None
        allele_count = None
        n_samples = None

        if _rsid_col is not None:
            rsid = fields[_rsid_col]
            if rsid in MISSING_VALUES:
                rsid = None
            elif not rsid.startswith('rs'):
                rsid = 'rs' + rsid

        if _beta_col is not None:
            beta = fields[_beta_col]

        if _stderr_col is not None:
            stderr_beta = fields[_stderr_col]

        if _allele_freq_col is not None:
            alt_allele_freq = fields[_allele_freq_col]

        if _allele_count_col is not None:
            allele_count = fields[_allele_count_col]
            n_samples = fields[_n_samples_col]

        # Perform type coercion
        log_pval = utils.parse_pval_to_log(pval, is_neg_log=_is_neg_log_pvalue)

        try:
            pos = int(pos)
        except ValueError:
            # Some programs seem to write long positions using scientific notation, which int cannot handle
            try:
                pos = int(float(pos))
            except ValueError:
                # If we still can't parse, it's probably bad data
                raise exceptions.LineParseException(
                    'Positions should be specified as integers. Could not parse value: {}'
                    .format(pos))

        if beta is not None:
            beta = None if beta in MISSING_VALUES else float(beta)

        if stderr_beta is not None:
            stderr_beta = None if stderr_beta in MISSING_VALUES else float(
                stderr_beta)

        # Compare against None: a plain truthiness test would treat a
        # column at index 0 as "not configured" and skip frequency parsing.
        if _allele_freq_col is not None or _allele_count_col is not None:
            alt_allele_freq = utils.parse_allele_frequency(
                freq=alt_allele_freq,
                allele_count=allele_count,
                n_samples=n_samples,
                is_alt_effect=_is_alt_effect)

        # Some old GWAS files simply won't provide ref or alt information, and the parser will need to do without
        if ref in MISSING_VALUES:
            ref = None

        if isinstance(ref, str):
            ref = ref.upper()

        if alt in MISSING_VALUES:
            alt = None

        if isinstance(alt, str):
            alt = alt.upper()

        result = container(chrom, pos, rsid, ref, alt, log_pval, beta,
                           stderr_beta, alt_allele_freq)
    except Exception as e:
        # Normalise every failure to a LineParseException carrying the line.
        raise exceptions.LineParseException(str(e), line=line) from e
    return result
def test_basic(self):
    """Core int() behaviour: numbers, strings, prefixes and all bases.

    The long runs of near-identical assertions are expressed as tables;
    the inputs and their order are unchanged.
    """
    # Representations of 2**32 and 2**32 + 1 in bases 2..36 (index 0 is
    # base 2).
    two_pow_32 = (
        '100000000000000000000000000000000', '102002022201221111211',
        '10000000000000000', '32244002423141', '1550104015504',
        '211301422354', '40000000000', '12068657454', '4294967296',
        '1904440554', '9ba461594', '535a79889', '2ca5b7464', '1a20dcd81',
        '100000000', 'a7ffda91', '704he7g4', '4f5aff66', '3723ai4g',
        '281d55i4', '1fj8b184', '1606k7ic', 'mb994ag', 'hek2mgl',
        'dnchbnm', 'b28jpdm', '8pfgih4', '76beigg', '5qmcpqg', '4q0jto4',
        '4000000', '3aokq94', '2qhxjli', '2br45qb', '1z141z4',
    )
    two_pow_32_plus_1 = (
        '100000000000000000000000000000001', '102002022201221111212',
        '10000000000000001', '32244002423142', '1550104015505',
        '211301422355', '40000000001', '12068657455', '4294967297',
        '1904440555', '9ba461595', '535a7988a', '2ca5b7465', '1a20dcd82',
        '100000001', 'a7ffda92', '704he7g5', '4f5aff67', '3723ai4h',
        '281d55i5', '1fj8b185', '1606k7id', 'mb994ah', 'hek2mgm',
        'dnchbnn', 'b28jpdn', '8pfgih5', '76beigh', '5qmcpqh', '4q0jto5',
        '4000001', '3aokq95', '2qhxjlj', '2br45qc', '1z141z5',
    )

    # Plain conversions; floats truncate towards zero.
    simple_cases = (
        (314, 314), (3.14, 3),
        (-3.14, -3), (3.9, 3), (-3.9, -3), (3.5, 3), (-3.5, -3),
        ("-3", -3), (" -3 ", -3),
        ("\N{EM SPACE}-3\N{EN SPACE}", -3),
    )
    for value, expected in simple_cases:
        self.assertEqual(int(value), expected)

    # Different base:
    self.assertEqual(int("10",16), 16)

    # Test conversion from strings and various anomalies
    for s, v in L:
        for sign in ("", "+", "-"):
            vv = -v if (sign == "-" and v is not ValueError) else v
            for prefix in ("", " ", "\t", " \t\t "):
                ss = prefix + sign + s
                try:
                    self.assertEqual(int(ss), vv)
                except ValueError:
                    pass

    s = repr(-1-sys.maxsize)
    x = int(s)
    self.assertEqual(x+1, -sys.maxsize)
    self.assertIsInstance(x, builtins.int)
    # should return int
    self.assertEqual(int(s[1:]), sys.maxsize+1)

    # should return int
    for huge in (1e100, -1e100):
        x = int(huge)
        self.assertIsInstance(x, builtins.int)

    # SF bug 434186:  0x80000000/2 != 0x80000000>>1.
    # Worked by accident in Windows release build, but failed in debug build.
    # Failed in all Linux builds.
    x = -1-sys.maxsize
    self.assertEqual(x >> 1, x//2)

    x = int('1' * 600)
    self.assertIsInstance(x, builtins.int)

    self.assertRaises(TypeError, int, 1, 12)

    self.assertEqual(int('0o123', 0), 83)
    self.assertEqual(int('0x123', 16), 291)

    # Bug 1679: "0x" is not a valid hex literal
    bare_prefixes = (("0x", 16), ("0x", 0), ("0o", 8), ("0o", 0),
                     ("0b", 2), ("0b", 0))
    for text, base in bare_prefixes:
        self.assertRaises(ValueError, int, text, base)

    # SF bug 1334662: int(string, base) wrong answers
    # Various representations of 2**32 evaluated to 0
    # rather than 2**32 in previous versions
    for base, digits in enumerate(two_pow_32, 2):
        self.assertEqual(int(digits, base), 4294967296)

    # tests with base 0
    # this fails on 3.0, but in 2.x the old octal syntax is allowed
    base_zero_cases = (
        (' 0o123 ', 83), (' 0o123 ', 83), ('000', 0), ('0o123', 83),
        ('0x123', 291), ('0b100', 4), (' 0O123 ', 83), (' 0X123 ', 291),
        (' 0B100 ', 4),
    )
    for text, expected in base_zero_cases:
        self.assertEqual(int(text, 0), expected)

    # without base still base 10
    self.assertEqual(int('0123'), 123)
    self.assertEqual(int('0123', 10), 123)

    # tests with prefix and base != 0
    prefixed_cases = (
        ('0x123', 16, 291), ('0o123', 8, 83), ('0b100', 2, 4),
        ('0X123', 16, 291), ('0O123', 8, 83), ('0B100', 2, 4),
    )
    for text, base, expected in prefixed_cases:
        self.assertEqual(int(text, base), expected)

    # the code has special checks for the first character after the
    #  type prefix
    bad_after_prefix = (
        ('0b2', 2), ('0b02', 2), ('0B2', 2), ('0B02', 2),
        ('0o8', 8), ('0o08', 8), ('0O8', 8), ('0O08', 8),
        ('0xg', 16), ('0x0g', 16), ('0Xg', 16), ('0X0g', 16),
    )
    for text, base in bad_after_prefix:
        self.assertRaises(ValueError, int, text, base)

    # SF bug 1334662: int(string, base) wrong answers
    # Checks for proper evaluation of 2**32 + 1
    for base, digits in enumerate(two_pow_32_plus_1, 2):
        self.assertEqual(int(digits, base), 4294967297)
def test_small_ints(self): # Bug #3236: Return small longs from PyLong_FromString self.assertIs(int('10'), 10) self.assertIs(int('-1'), -1) self.assertIs(int(b'10'), 10) self.assertIs(int(b'-1'), -1)
def test_basic(self): self.assertEqual(int(314), 314) self.assertEqual(int(3.14), 3) self.assertEqual(int(314L), 314) # Check that conversion from float truncates towards zero self.assertEqual(int(-3.14), -3) self.assertEqual(int(3.9), 3) self.assertEqual(int(-3.9), -3) self.assertEqual(int(3.5), 3) self.assertEqual(int(-3.5), -3) # Different base: self.assertEqual(int("10",16), 16L) if have_unicode: self.assertEqual(int(unicode("10"),16), 16L) # Test conversion from strings and various anomalies for s, v in L: for sign in "", "+", "-": for prefix in "", " ", "\t", " \t\t ": ss = prefix + sign + s vv = v if sign == "-" and v is not ValueError: vv = -v try: self.assertEqual(int(ss), vv) except v: pass s = repr(-1-sys.maxint) x = int(s) self.assertEqual(x+1, -sys.maxint) self.assertIsInstance(x, __builtin__.int) # should return long self.assertEqual(int(s[1:]), sys.maxint+1) # should return long x = int(1e100) self.assertIsInstance(x, long) x = int(-1e100) self.assertIsInstance(x, long) # SF bug 434186: 0x80000000/2 != 0x80000000>>1. # Worked by accident in Windows release build, but failed in debug build. # Failed in all Linux builds. 
x = -1-sys.maxint self.assertEqual(x >> 1, x//2) self.assertRaises(ValueError, int, '123\0') self.assertRaises(ValueError, int, '53', 40) # SF bug 1545497: embedded NULs were not detected with # explicit base self.assertRaises(ValueError, int, '123\0', 10) self.assertRaises(ValueError, int, '123\x00 245', 20) x = int('1' * 600) self.assertIsInstance(x, long) if have_unicode: x = int(unichr(0x661) * 600) self.assertIsInstance(x, long) self.assertRaises(TypeError, int, 1, 12) self.assertEqual(int('0123', 0), 83) self.assertEqual(int('0x123', 16), 291) # Bug 1679: "0x" is not a valid hex literal self.assertRaises(ValueError, int, "0x", 16) self.assertRaises(ValueError, int, "0x", 0) self.assertRaises(ValueError, int, "0o", 8) self.assertRaises(ValueError, int, "0o", 0) self.assertRaises(ValueError, int, "0b", 2) self.assertRaises(ValueError, int, "0b", 0) # SF bug 1334662: int(string, base) wrong answers # Various representations of 2**32 evaluated to 0 # rather than 2**32 in previous versions self.assertEqual(int('100000000000000000000000000000000', 2), 4294967296L) self.assertEqual(int('102002022201221111211', 3), 4294967296L) self.assertEqual(int('10000000000000000', 4), 4294967296L) self.assertEqual(int('32244002423141', 5), 4294967296L) self.assertEqual(int('1550104015504', 6), 4294967296L) self.assertEqual(int('211301422354', 7), 4294967296L) self.assertEqual(int('40000000000', 8), 4294967296L) self.assertEqual(int('12068657454', 9), 4294967296L) self.assertEqual(int('4294967296', 10), 4294967296L) self.assertEqual(int('1904440554', 11), 4294967296L) self.assertEqual(int('9ba461594', 12), 4294967296L) self.assertEqual(int('535a79889', 13), 4294967296L) self.assertEqual(int('2ca5b7464', 14), 4294967296L) self.assertEqual(int('1a20dcd81', 15), 4294967296L) self.assertEqual(int('100000000', 16), 4294967296L) self.assertEqual(int('a7ffda91', 17), 4294967296L) self.assertEqual(int('704he7g4', 18), 4294967296L) self.assertEqual(int('4f5aff66', 19), 4294967296L) 
self.assertEqual(int('3723ai4g', 20), 4294967296L) self.assertEqual(int('281d55i4', 21), 4294967296L) self.assertEqual(int('1fj8b184', 22), 4294967296L) self.assertEqual(int('1606k7ic', 23), 4294967296L) self.assertEqual(int('mb994ag', 24), 4294967296L) self.assertEqual(int('hek2mgl', 25), 4294967296L) self.assertEqual(int('dnchbnm', 26), 4294967296L) self.assertEqual(int('b28jpdm', 27), 4294967296L) self.assertEqual(int('8pfgih4', 28), 4294967296L) self.assertEqual(int('76beigg', 29), 4294967296L) self.assertEqual(int('5qmcpqg', 30), 4294967296L) self.assertEqual(int('4q0jto4', 31), 4294967296L) self.assertEqual(int('4000000', 32), 4294967296L) self.assertEqual(int('3aokq94', 33), 4294967296L) self.assertEqual(int('2qhxjli', 34), 4294967296L) self.assertEqual(int('2br45qb', 35), 4294967296L) self.assertEqual(int('1z141z4', 36), 4294967296L) # tests with base 0 # this fails on 3.0, but in 2.x the old octal syntax is allowed self.assertEqual(int(' 0123 ', 0), 83) self.assertEqual(int(' 0123 ', 0), 83) self.assertEqual(int('000', 0), 0) self.assertEqual(int('0o123', 0), 83) self.assertEqual(int('0x123', 0), 291) self.assertEqual(int('0b100', 0), 4) self.assertEqual(int(' 0O123 ', 0), 83) self.assertEqual(int(' 0X123 ', 0), 291) self.assertEqual(int(' 0B100 ', 0), 4) self.assertEqual(int('0', 0), 0) self.assertEqual(int('+0', 0), 0) self.assertEqual(int('-0', 0), 0) self.assertEqual(int('00', 0), 0) self.assertRaises(ValueError, int, '08', 0) self.assertRaises(ValueError, int, '-012395', 0) # without base still base 10 self.assertEqual(int('0123'), 123) self.assertEqual(int('0123', 10), 123) # tests with prefix and base != 0 self.assertEqual(int('0x123', 16), 291) self.assertEqual(int('0o123', 8), 83) self.assertEqual(int('0b100', 2), 4) self.assertEqual(int('0X123', 16), 291) self.assertEqual(int('0O123', 8), 83) self.assertEqual(int('0B100', 2), 4) # the code has special checks for the first character after the # type prefix self.assertRaises(ValueError, int, '0b2', 
2) self.assertRaises(ValueError, int, '0b02', 2) self.assertRaises(ValueError, int, '0B2', 2) self.assertRaises(ValueError, int, '0B02', 2) self.assertRaises(ValueError, int, '0o8', 8) self.assertRaises(ValueError, int, '0o08', 8) self.assertRaises(ValueError, int, '0O8', 8) self.assertRaises(ValueError, int, '0O08', 8) self.assertRaises(ValueError, int, '0xg', 16) self.assertRaises(ValueError, int, '0x0g', 16) self.assertRaises(ValueError, int, '0Xg', 16) self.assertRaises(ValueError, int, '0X0g', 16) # SF bug 1334662: int(string, base) wrong answers # Checks for proper evaluation of 2**32 + 1 self.assertEqual(int('100000000000000000000000000000001', 2), 4294967297L) self.assertEqual(int('102002022201221111212', 3), 4294967297L) self.assertEqual(int('10000000000000001', 4), 4294967297L) self.assertEqual(int('32244002423142', 5), 4294967297L) self.assertEqual(int('1550104015505', 6), 4294967297L) self.assertEqual(int('211301422355', 7), 4294967297L) self.assertEqual(int('40000000001', 8), 4294967297L) self.assertEqual(int('12068657455', 9), 4294967297L) self.assertEqual(int('4294967297', 10), 4294967297L) self.assertEqual(int('1904440555', 11), 4294967297L) self.assertEqual(int('9ba461595', 12), 4294967297L) self.assertEqual(int('535a7988a', 13), 4294967297L) self.assertEqual(int('2ca5b7465', 14), 4294967297L) self.assertEqual(int('1a20dcd82', 15), 4294967297L) self.assertEqual(int('100000001', 16), 4294967297L) self.assertEqual(int('a7ffda92', 17), 4294967297L) self.assertEqual(int('704he7g5', 18), 4294967297L) self.assertEqual(int('4f5aff67', 19), 4294967297L) self.assertEqual(int('3723ai4h', 20), 4294967297L) self.assertEqual(int('281d55i5', 21), 4294967297L) self.assertEqual(int('1fj8b185', 22), 4294967297L) self.assertEqual(int('1606k7id', 23), 4294967297L) self.assertEqual(int('mb994ah', 24), 4294967297L) self.assertEqual(int('hek2mgm', 25), 4294967297L) self.assertEqual(int('dnchbnn', 26), 4294967297L) self.assertEqual(int('b28jpdn', 27), 4294967297L) 
self.assertEqual(int('8pfgih5', 28), 4294967297L) self.assertEqual(int('76beigh', 29), 4294967297L) self.assertEqual(int('5qmcpqh', 30), 4294967297L) self.assertEqual(int('4q0jto5', 31), 4294967297L) self.assertEqual(int('4000001', 32), 4294967297L) self.assertEqual(int('3aokq95', 33), 4294967297L) self.assertEqual(int('2qhxjlj', 34), 4294967297L) self.assertEqual(int('2br45qc', 35), 4294967297L) self.assertEqual(int('1z141z5', 36), 4294967297L)
def __call__(self, row: str) -> VariantContainer:
    """
    Parse one tab-delimited line of association data into a VariantContainer.

    The parser is the piece tied to the file format, so this must change if
    the file format changes! It locates the fields, converts the text values
    into numeric data where appropriate, and appends several derived values
    (build, TSS distance/position, gene symbol, tissue system, transcript).

    It could accept configuration in the future, e.g. different column
    numbers if more than one file holds the same data arranged differently.

    :param row: A single line of the input file, with tab-separated fields.
    :return: A VariantContainer built from the parsed fields followed by the
        derived values, in that order.
    :raises ValueError: If a required numeric column cannot be converted, or
        if a non-null molecular_trait_id does not consist of exactly four
        dot-separated parts.
    :raises IndexError: If the row has fewer columns than expected.
    """
    fields: ty.List[ty.Any] = row.split("\t")
    # Revise if data format changes!
    # fields[1] = fields[1].replace("chr", "")  # chrom

    if self.tissue and self.study:
        # Tissue-and-study-specific files have two fewer columns (study and tissue),
        # and so the fields must be prepended to match the number of fields in the
        # all-tissue file
        tissuevar = self.tissue
        fields = [self.study, tissuevar] + fields
    else:
        tissuevar = fields[1]

    # Field numbers. See also:
    # https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/tabix/Columns.md
    # 0: study
    # 1: tissue
    # 2: molecular_trait_id
    #    for spliceQTLs, this looks like 'ENSG00000008128.grp_1.contained.ENST00000356200'
    # 3: chromosome
    # 4: position (int)
    # 5: ref
    # 6: alt
    # 7: variant (chr_pos_ref_alt)
    # 8: ma_samples (int)
    # 9: maf (float)
    # 10: pvalue (float)
    # 11: beta (float)
    # 12: se (float)
    # 13: type (SNP, INDEL, etc)
    # 14: ac (allele count) (int)
    # 15: an (total number of alleles = 2 * sample size) (int)
    # 16: r2 (float)
    # 17: molecular_trait_object_id
    #    for spliceQTLs, this looks like 'ENSG00000008128.contained'
    # 18: gene_id (ENSG#)
    # 19: median_tpm (float)
    # 20: rsid

    if self.datatype == "ge":
        # Gene expression rows carry no separate molecular trait; blank it out so
        # that no transcript is extracted below.
        fields[2] = None

    fields[4] = int(fields[4])  # pos
    fields[8] = int(fields[8])  # ma_samples
    fields[9] = float(fields[9])  # maf
    fields[10] = parser_utils.parse_pval_to_log(
        fields[10], is_neg_log=False
    )  # pvalue_nominal --> serialize as log
    fields[11] = float(fields[11])  # beta
    fields[12] = float(fields[12])  # stderr_beta
    fields[14] = int(fields[14])  # allele_count
    fields[15] = int(fields[15])  # total_number_of_alleles

    # r2 (16) and median_tpm (19) are optional: a value that cannot be parsed as
    # a float (e.g. "NA") is stored as None instead of aborting the whole row.
    # TODO: Make the "NA" -> None check more explicit
    for optional_index in (16, 19):
        try:
            fields[optional_index] = float(fields[optional_index])
        except ValueError:
            fields[optional_index] = None

    # Append build
    build = "GRCh38"

    # Gene ID without the part after the first "." (presumably a version suffix);
    # used as the lookup key for both the TSS table and the gene symbol table.
    gene_id = fields[18].split(".")[0]

    # Append tss_distance
    # NOTE(review): the sign of the stored TSS value is used as a multiplier, so
    # it presumably encodes strand -- confirm against how tss_dict is built.
    # Genes missing from tss_dict yield NaN for both distance and position.
    gene_tss = self.tss_dict.get(gene_id, float("nan"))
    tss_distance = math.copysign(1, gene_tss) * (fields[4] - abs(gene_tss))
    tss_position = -abs(gene_tss)

    # Append gene symbol
    geneSymbol = self.gene_json.get(gene_id, "Unknown_gene")

    # Add tissue grouping and sample size from GTEx
    # tissue_data = TISSUE_DATA.get(tissuevar, ("Unknown_Tissue", None))
    # fields.extend(tissue_data)

    # Append system information
    tissueSystem = TISSUES_TO_SYSTEMS.get(tissuevar, "Unknown")

    if fields[2] is not None:
        # The transcript is the last of exactly four dot-separated parts of
        # molecular_trait_id; any other shape raises ValueError on unpacking.
        (_, _, _, transcript) = fields[2].split(".")
    else:
        transcript = None

    fields.extend(
        [
            build,
            tss_distance,
            tss_position,
            geneSymbol,
            tissueSystem,
            transcript,
        ]
    )
    return VariantContainer(*fields)