def threshold(speeds, thresh=0, with_time=False):
    # Percent change in the number of samples exceeding the threshold,
    # relative to the original track (ex_def_thresh is a module-level global)
    count = 0
    if with_time:
        for entry in speeds:
            if entry[0] >= thresh:
                count += 1
    else:
        for entry in speeds:
            if entry >= thresh:
                count += 1
    return 100 * float(abs(count - ex_def_thresh)) / float(ex_def_thresh)
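# A worked example of the metric (hypothetical numbers): if 50 samples in the
# original track exceed the threshold (ex_def_thresh == 50) and 45 samples in
# the perturbed track do, the function returns 100 * |45 - 50| / 50 == 10.0,
# i.e. a 10% change in the number of threshold-exceeding samples.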
def _validate_numeric(col: int, data: ty.Iterator) -> bool:
    """Check whether every non-missing value in the given column parses as a float"""
    vals = [row[col] for row in data]
    cleaned_vals = [val for val in vals if val not in const.MISSING_VALUES]
    try:
        for v in cleaned_vals:
            float(v)
    except Exception:
        return False
    return True
def is_numeric(val: str) -> bool:
    """Check whether an unparsed string is a numeric value"""
    if val in MISSING_VALUES:
        return True
    try:
        float(val)
    except Exception:
        return False
    else:
        return True
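# A minimal sketch of the expected behaviour (assuming MISSING_VALUES contains
# tokens such as 'NA'; the exact contents live elsewhere in this module):
# is_numeric('1e-5')  -> True
# is_numeric('NA')    -> True   (missing values are considered parseable)
# is_numeric('chr1')  -> False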
def diff(min=0, max=10, iterations=30):
    # Perturb each speed by a random offset and measure the resulting path error
    average = 0
    error = 0
    no_results = 0
    for i in range(iterations):
        new_speeds = [x + random.uniform(min, max) for x in original_speeds]
        # Write the perturbed speeds back to the track database
        for j in range(len(original)):
            cur.execute('UPDATE speeds SET speed = %f WHERE time = %d'
                        % (new_speeds[j], original[j][1]))
        conn.commit()
        # Run the Elastic Pathing algorithm on the perturbed track
        result = fast_float(os.popen('ruby elastic_pathing.rb ./%s ./map.sq3'
                                     % file).read().split(' ')[0], default=-1)
        if result >= 0:
            average += result
            error += threshold(new_speeds, thresh=DEF_THRESH)
        else:
            no_results += 1
    average = 100 - (100 * float(average) / (dist * (iterations - no_results)))
    error /= float(iterations - no_results)
    restore_file()
    return (error, average)
def parse_pval_to_log(value: str, is_neg_log: bool = False) -> ty.Union[float, None]:
    """Parse a given number, and return the -log10 pvalue"""
    if value in MISSING_VALUES or value is None:
        return None
    val = float(value)
    if is_neg_log:  # Take as is
        return val
    # Regular pvalue: validate and convert
    if val < 0 or val > 1:
        raise ValueError('p value is not in the allowed range')
    # 0-values are explicitly allowed and will convert to infinity by design,
    # as they often indicate underflow errors in the input data.
    if val == 0:
        # Determine whether underflow is due to the source data, or value conversion
        if value == '0':
            # The source data is bad, so insert an obvious placeholder value
            return math.inf
        else:
            # h/t @welchr: aggressively turn the underflowing string value into -log10 via regex
            # Only do this if absolutely necessary, because it is a performance hit
            base, _, exponent = REGEX_PVAL.search(value).groups()
            base = float(base)
            if exponent != '':
                exp = float(exponent)
            else:
                exp = 0
            if base == 0:
                return math.inf
            return -(math.log10(base) + exp)
    else:
        return -math.log10(val)
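# A minimal usage sketch (hypothetical inputs). Ordinary p-values come back as
# -log10(p); with is_neg_log=True the value passes through unchanged; a string
# like '1e-500' that underflows float() is rescued by the regex path
# (assuming REGEX_PVAL captures base and exponent as used above):
# parse_pval_to_log('0.001')              -> 3.0
# parse_pval_to_log('3', is_neg_log=True) -> 3.0
# parse_pval_to_log('1e-500')             -> 500.0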
def shuffle_round_diff(skip_length=40, iterations=10, will_round=False, nearest=0,
                       will_diff=False, diff_min=0, diff_max=10):
    skips = [1]
    for i in range(1, len(original)):
        if i % skip_length == 0:
            skips.append(i)
    skips.append(len(original))
    errors = [0.0] * len(skips)
    # Get errors for shuffling with different intervals
    for k in range(len(skips)):
        # Repeat to minimize randomness
        error = 0
        no_results = 0
        for i in range(iterations):
            # Set shuffle offset
            offset = 0
            # Make a copy of the original speed values, rounding/diffing if needed
            if will_round and will_diff:
                print('Not Implemented')
            elif will_round:
                new_speeds = [round(x / nearest) * nearest for x in original_speeds]
            elif will_diff:
                new_speeds = [x + random.uniform(diff_min, diff_max) for x in original_speeds]
            else:
                new_speeds = original_speeds[:]
            # Shuffle values within each interval and save
            if skips[k] > 1:
                while offset < skips[-1]:
                    end = offset + skips[k]
                    if end > skips[-1]:
                        end = skips[-1]
                    shuffle_speeds = new_speeds[offset:end]
                    shuffle(shuffle_speeds)
                    new_speeds[offset:end] = shuffle_speeds
                    offset += skips[k]
            for j in range(skips[-1]):
                cur.execute('UPDATE speeds SET speed = %f WHERE time = %d'
                            % (new_speeds[j], original[j][1]))
            conn.commit()
            # Run Elastic Pathing algorithm
            result = fast_float(os.popen('ruby elastic_pathing.rb ./%s ./map.sq3'
                                         % file).read().split(' ')[0], default=-1)
            if result >= 0:
                error += result
            else:
                no_results += 1
            threshold(new_speeds, thresh=DEF_THRESH)  # Note: return value is discarded here
        errors[k] = 100 * float(error) / float(iterations - no_results) / dist
    restore_file()
    return (threshold(new_speeds, thresh=DEF_THRESH),
            [(skips[i], 100 - errors[i]) for i in range(len(errors))])
def norm_val(val, empty_as_null: bool) -> Union[bytes, int, float, None]:
    """Normalize a value"""
    if val is None:
        return None
    if fastnumbers.isfloat(val) or fastnumbers.isint(val):
        return fastnumbers.float(val)
    val = val.strip()
    if len(val) == 0 and empty_as_null:
        return None
    return val.encode("utf-8", "ignore")
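# A minimal usage sketch (hypothetical inputs): numeric strings come back as
# floats, blank strings optionally become None, and everything else becomes
# UTF-8 bytes.
# norm_val('3.14', empty_as_null=True) -> 3.14
# norm_val('   ', empty_as_null=True)  -> None
# norm_val('abc', empty_as_null=True)  -> b'abc'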
def filter_rows(row_indices, query_col_index, query_col_coords):
    # Yield the indices of rows that match the query on the given column.
    # The "ct" file stores one type code per column (read as bytes, since the
    # files are opened in binary mode); b"n" marks a numeric column.
    col_type = next(parse_data_values(query_col_index, max_column_type_length,
                                      [(query_col_index, 0, 1)], file_handles["ct"]))
    if col_type == b"n":
        # Numeric column: keep rows whose value is >= 0.1
        for row_index in row_indices:
            if fastnumbers.float(next(parse_data_values(row_index, line_length,
                                                        query_col_coords,
                                                        file_handles["data"]))) >= 0.1:
                yield row_index
    else:
        # Discrete column: keep rows whose value starts with A or ends with Z
        for row_index in row_indices:
            value = next(parse_data_values(row_index, line_length, query_col_coords,
                                           file_handles["data"]))
            if value.startswith(b"A") or value.endswith(b"Z"):
                yield row_index
def filter_row(line, col_coords, query_col_index):
    # Look up the stored type of the query column; b"n" marks a numeric column
    col_type = next(parse_data_values(query_col_index, max_column_type_length,
                                      [(query_col_index, 0, 1)], file_handles["ct"]))
    if col_type == b"n":
        for coords in col_coords:
            if fastnumbers.float(line[coords[1]:coords[2]].rstrip()) >= 0.1:
                yield coords
    else:
        for coords in col_coords:
            value = line[coords[1]:coords[2]].rstrip()
            if value.startswith(b"A") or value.endswith(b"Z"):
                yield coords
def variant_parser(row: str) -> VariantContainer:
    """
    This is a stub function that specifies how to parse a line. It could accept configuration in the future,
    eg diff column numbers if there was more than one file with the same data arranged in diff ways

    It does the work of finding the fields, and of turning the text file into numeric data where appropriate

    The parser is the piece tied to file format, so this must change if the file format changes!
    """
    fields = row.split('\t')
    # For now we clean up three fields exactly. Revise if data format changes!
    fields[0] = fields[0].replace('chr', '')  # chrom
    fields[1] = int(fields[1])  # pos
    fields[10] = float(fields[10])  # pvalue_nominal
    return VariantContainer(*fields)
def filter_rows_numeric(self, row_indices, the_filter, operator_dict, data_handle,
                        cc_handle, mccl, ll):
    if the_filter.operator not in operator_dict:
        raise Exception("Invalid operator: " + the_filter.operator)
    query_col_coords = list(parse_data_coords([the_filter.column_index], cc_handle, mccl))
    for row_index in row_indices:
        value = next(parse_data_values(row_index, ll, query_col_coords, data_handle)).rstrip()
        if value == b"":  # Is missing
            continue
        # See https://stackoverflow.com/questions/18591778/how-to-pass-an-operator-to-a-python-function
        if operator_dict[the_filter.operator](fastnumbers.float(value), the_filter.query_value):
            yield row_index
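# A minimal sketch of the operator_dict this method expects (an assumption
# based on the StackOverflow link above, not the project's actual table):
# comparison symbols mapped to functions from the standard operator module.
import operator

operator_dict = {
    "==": operator.eq,
    "!=": operator.ne,
    "<": operator.lt,
    "<=": operator.le,
    ">": operator.gt,
    ">=": operator.ge,
}
# e.g. operator_dict[">="](0.25, 0.1) -> True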
def parse_allele_frequency(*,
                           freq: str = None,
                           allele_count: str = None,
                           n_samples: str = None,
                           is_alt_effect: bool = True) -> ty.Union[float, None]:
    """
    Parse an allele frequency, OR convert counts to frequency.

    :param freq:
    :param allele_count:
    :param n_samples:
    :param is_alt_effect:
    :return:
    """
    # A single uber-func costs less than the performance penalty of calling two separate functions
    if freq is not None and allele_count is not None:
        raise exceptions.ConfigurationException(
            'Frequency and allele count options are mutually exclusive')

    if freq is None and (allele_count in MISSING_VALUES or n_samples in MISSING_VALUES):
        # Allele count parsing
        return None
    elif freq is None and allele_count is not None:
        allele_freq = int(allele_count) / int(n_samples) / 2  # 2 alleles per sample
    elif freq in MISSING_VALUES:
        # Frequency-based parsing
        return None
    else:
        allele_freq = float(freq)

    # No matter how the frequency is specified, this stuff is always done
    if allele_freq < 0 or allele_freq > 1:
        raise ValueError('Allele frequency is not in the allowed range')

    if not is_alt_effect:  # Orient the frequency to the alt allele
        return 1 - allele_freq
    else:
        return allele_freq
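# A minimal usage sketch (hypothetical values): the frequency may be given
# directly, or derived as allele_count / (2 * n_samples).
# parse_allele_frequency(freq='0.25')                        -> 0.25
# parse_allele_frequency(allele_count='50', n_samples='100') -> 0.25
# parse_allele_frequency(freq='0.25', is_alt_effect=False)   -> 0.75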
def __call__(self, row: str) -> VariantContainer:
    """
    This is a stub class that specifies how to parse a line. It could accept configuration in the future,
    eg diff column numbers if there was more than one file with the same data arranged in diff ways

    It does the work of finding the fields, and of turning the text file into numeric data where appropriate

    The parser is the piece tied to file format, so this must change if the file format changes!
    """
    fields: ty.List[ty.Any] = row.split("\t")
    # Revise if data format changes!
    # fields[1] = fields[1].replace("chr", "")  # chrom
    if self.tissue and self.study:
        # Tissue-and-study-specific files have two fewer columns (study and tissue),
        # and so those fields must be prepended to match the number of fields in the all-tissue file
        tissuevar = self.tissue
        fields = [self.study, tissuevar] + fields
    else:
        tissuevar = fields[1]
    # Field numbers. See also:
    # https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/tabix/Columns.md
    # 0: study
    # 1: tissue
    # 2: molecular_trait_id
    #    for spliceQTLs, this looks like 'ENSG00000008128.grp_1.contained.ENST00000356200'
    # 3: chromosome
    # 4: position (int)
    # 5: ref
    # 6: alt
    # 7: variant (chr_pos_ref_alt)
    # 8: ma_samples (int)
    # 9: maf (float)
    # 10: pvalue (float)
    # 11: beta (float)
    # 12: se (float)
    # 13: type (SNP, INDEL, etc)
    # 14: ac (allele count) (int)
    # 15: an (total number of alleles = 2 * sample size) (int)
    # 16: r2 (float)
    # 17: molecular_trait_object_id
    #     for spliceQTLs, this looks like 'ENSG00000008128.contained'
    # 18: gene_id (ENSG#)
    # 19: median_tpm (float)
    # 20: rsid
    if self.datatype == "ge":
        fields[2] = None
    fields[4] = int(fields[4])  # pos
    fields[8] = int(fields[8])  # ma_samples
    fields[9] = float(fields[9])  # maf
    fields[10] = parser_utils.parse_pval_to_log(
        fields[10], is_neg_log=False
    )  # pvalue_nominal --> serialize as log
    fields[11] = float(fields[11])  # beta
    fields[12] = float(fields[12])  # stderr_beta
    fields[14] = int(fields[14])  # allele_count
    fields[15] = int(fields[15])  # total_number_of_alleles
    try:
        fields[16] = float(fields[16])  # r2
    except ValueError:
        # TODO: Make the "NA" -> None check more explicit
        fields[16] = None
    fields[19] = float(fields[19])  # median_tpm  # FIXME: Handle NA case
    # Append build
    build = "GRCh38"
    # Append tss_distance
    gene_tss = self.tss_dict.get(fields[18].split(".")[0], float("nan"))
    tss_distance = math.copysign(1, gene_tss) * (fields[4] - abs(gene_tss))
    tss_position = -abs(gene_tss)
    # Append gene symbol
    geneSymbol = self.gene_json.get(fields[18].split(".")[0], "Unknown_gene")
    # Add tissue grouping and sample size from GTEx
    # tissue_data = TISSUE_DATA.get(tissuevar, ("Unknown_Tissue", None))
    # fields.extend(tissue_data)
    # Append system information
    tissueSystem = TISSUES_TO_SYSTEMS.get(tissuevar, "Unknown")
    if fields[2] is not None:
        (_, _, _, transcript) = fields[2].split(".")
    else:
        transcript = None
    fields.extend([
        build,
        tss_distance,
        tss_position,
        geneSymbol,
        tissueSystem,
        transcript,
    ])
    return VariantContainer(*fields)
import mmap
import sys

import fastnumbers

# getColIndicesToQuery is assumed to be defined elsewhere in this script

file_path = sys.argv[1]
col_names_file_path = sys.argv[2]
out_file_path = sys.argv[3]
discrete_query_col_index = int(sys.argv[4])
num_query_col_index = int(sys.argv[5])
memory_map = True

col_index_range = getColIndicesToQuery(col_names_file_path, memory_map)

with open(file_path, 'rb') as my_file:
    if memory_map:
        my_file = mmap.mmap(my_file.fileno(), 0, prot=mmap.PROT_READ)
    with open(out_file_path, 'wb') as out_file:
        header_items = my_file.readline().rstrip(b"\n").split(b"\t")
        out_file.write(b"\t".join([header_items[i] for i in col_index_range]) + b"\n")
        match_count = 0
        for line in iter(my_file.readline, b""):
            line_items = line.rstrip(b"\n").split(b"\t")
            discrete_value = line_items[discrete_query_col_index]
            num_value = fastnumbers.float(line_items[num_query_col_index])
            if (discrete_value.startswith(b"A") or discrete_value.endswith(b"Z")) and num_value >= 0.1:
                out_file.write(b"\t".join([line_items[i] for i in col_index_range]) + b"\n")
                match_count += 1
    my_file.close()
def inner(line):
    # Return a stateful closure that does the actual work of parsing
    try:
        fields = line.strip().split(delimiter)
        if len(fields) == 1:
            raise exceptions.LineParseException(
                'Unable to split line into separate fields. This line may have a missing or incorrect delimiter.')

        # Fetch values
        ref = None
        alt = None
        if _marker_col is not None:
            chrom, pos, ref, alt = utils.parse_marker(fields[_marker_col])
        else:
            chrom = fields[_chrom_col]
            pos = fields[_pos_col]

        if chrom.startswith('chr'):
            chrom = chrom[3:]
        chrom = chrom.upper()

        # Explicit columns will override a value from the marker, by design
        if _ref_col is not None:
            ref = fields[_ref_col]
        if _alt_col is not None:
            alt = fields[_alt_col]

        pval = fields[_pvalue_col]

        # Some optional fields
        rsid = None
        beta = None
        stderr_beta = None
        alt_allele_freq = None
        allele_count = None
        n_samples = None

        if _rsid_col is not None:
            rsid = fields[_rsid_col]
            if rsid in MISSING_VALUES:
                rsid = None
            elif not rsid.startswith('rs'):
                rsid = 'rs' + rsid
        if _beta_col is not None:
            beta = fields[_beta_col]
        if _stderr_col is not None:
            stderr_beta = fields[_stderr_col]
        if _allele_freq_col is not None:
            alt_allele_freq = fields[_allele_freq_col]
        if _allele_count_col is not None:
            allele_count = fields[_allele_count_col]
            n_samples = fields[_n_samples_col]

        # Perform type coercion
        log_pval = utils.parse_pval_to_log(pval, is_neg_log=_is_neg_log_pvalue)

        try:
            pos = int(pos)
        except ValueError:
            # Some programs seem to write long positions using scientific notation, which int cannot handle
            try:
                pos = int(float(pos))
            except ValueError:
                # If we still can't parse, it's probably bad data
                raise exceptions.LineParseException(
                    'Positions should be specified as integers. Could not parse value: {}'.format(pos))

        if beta is not None:
            beta = None if beta in MISSING_VALUES else float(beta)
        if stderr_beta is not None:
            stderr_beta = None if stderr_beta in MISSING_VALUES else float(stderr_beta)

        if _allele_freq_col or _allele_count_col:
            alt_allele_freq = utils.parse_allele_frequency(
                freq=alt_allele_freq,
                allele_count=allele_count,
                n_samples=n_samples,
                is_alt_effect=_is_alt_effect)

        # Some old GWAS files simply won't provide ref or alt information, and the parser will need to do without
        if ref in MISSING_VALUES:
            ref = None
        if isinstance(ref, str):
            ref = ref.upper()
        if alt in MISSING_VALUES:
            alt = None
        if isinstance(alt, str):
            alt = alt.upper()

        result = container(chrom, pos, rsid, ref, alt, log_pval, beta,
                           stderr_beta, alt_allele_freq)
    except Exception as e:
        raise exceptions.LineParseException(str(e), line=line)
    return result
def is_match(value, value_type):
    # Numeric columns (type code b"n") match when the value is >= 0.1; any
    # other column type is treated as discrete and matches when the value
    # starts with A or ends with Z.
    if value_type == b"n":
        return fastnumbers.float(value) >= 0.1
    else:
        return value.startswith(b"A") or value.endswith(b"Z")
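# A minimal usage sketch (hypothetical byte-string inputs):
# is_match(b"0.25", b"n")  -> True
# is_match(b"0.05", b"n")  -> False
# is_match(b"Apple", b"d") -> True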
import msgpack
import os
import random
import string
import sys

import fastnumbers

dimensions = int(sys.argv[1])
out_file_path = sys.argv[2]

random.seed(0)

acgt = ["A", "C", "G", "T"]
num_homo_ref = int(fastnumbers.float(dimensions) * 0.7)
num_het = int(fastnumbers.float(dimensions) * 0.2)
num_homo_alt = int(fastnumbers.float(dimensions) * 0.1)

# Build every ordered (ref, alt) pairing of distinct bases, with the
# corresponding homozygous-ref, heterozygous, and homozygous-alt genotypes
genotype_options = []
for ref_allele in acgt:
    for alt_allele in acgt:
        if ref_allele == alt_allele:
            continue
        homo_ref_gt = ref_allele * 2
        het_gt = ref_allele + alt_allele
        homo_alt_gt = alt_allele * 2
        genotype_options.append((homo_ref_gt, het_gt, homo_alt_gt))
num_genotype_options = len(genotype_options)
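# For illustration, the loops above generate the 12 ordered pairings of
# distinct bases; the first few entries are deterministic:
# genotype_options[:3] == [('AA', 'AC', 'CC'), ('AA', 'AG', 'GG'), ('AA', 'AT', 'TT')]
# num_genotype_options == 12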
def __call__(self, row: str) -> CIContainer:
    # Columns in the raw credible_sets data:
    # phenotype_id: this corresponds to genes in gene expression data,
    #   and to both gene and credible interval in Txrevise data
    # variant_id: in chrom_pos_ref_alt format; we don't use this
    # chr
    # pos
    # ref
    # alt
    # cs_id: this is simply {phenotype_id}_{cs_index}
    # cs_index: credible set label, either L1 or L2
    # finemapped_region: a range for the region tested, in chrom:start-end format
    # pip: generated using SuSiE
    # z: z-score
    # cs_min_r2
    # cs_avg_r2
    # cs_size: credible set size, i.e. the number of variants contained in this credible set
    # posterior_mean: posterior effect size
    # posterior_sd: posterior standard deviation
    # cs_log10bf: log10 of the Bayes Factor for this credible set
    ### Extra columns added by joining main QTL data ###
    # ma_samples
    # maf
    # pvalue
    # beta
    # se
    # type
    # ac
    # an
    # r2
    # mol_trait_obj_id
    # gid
    # median_tpm
    # rsid
    fields: ty.List[ty.Any] = row.split("\t")
    if self.study and self.tissue:
        # Tissue-and-study-specific files have two fewer columns (study and tissue),
        # and so those fields must be prepended to match the number of fields in the all-tissue file
        fields = [self.study, self.tissue] + fields
    fields[5] = int(fields[5])  # pos
    fields[11] = float(fields[11])  # pip
    fields[12] = float(fields[12])  # z
    fields[13] = float(fields[13])  # cs_min_r2
    fields[14] = float(fields[14])  # cs_avg_r2
    fields[15] = int(fields[15])  # cs_size
    fields[16] = float(fields[16])  # posterior_mean
    fields[17] = float(fields[17])  # posterior_sd
    fields[18] = float(fields[18])  # cs_log10bf
    # Extra fields from the joined file
    if len(fields) > 19:
        fields[19] = int(fields[19])  # ma_samples
        fields[20] = float(fields[20])  # maf
        fields[21] = float(fields[21])  # pvalue
        fields[22] = float(fields[22])  # beta
        fields[23] = float(fields[23])  # se
        fields[25] = int(fields[25])  # ac
        fields[26] = int(fields[26])  # an
        fields[30] = float(fields[30])  # median_tpm
    return CIContainer(*fields)
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import os
import random
import re
import sqlite3 as sql

# Track filename
file = 'track-5.sq3'
os.popen('cp central_new_jersey/P4/%s .' % file)

# Open and store file
conn = sql.connect(file)
cur = conn.cursor()
cur.execute('SELECT * FROM speeds')
original = cur.fetchall()
original_speeds = [i[0] for i in original]

# Distance travelled by the car
dist = float(os.popen('ruby elastic_pathing.rb ./%s ./map.sq3' % file).read().split(' ')[1])

# Default threshold
DEF_THRESH = 20

# Number of original speed samples exceeding the default threshold
ex_def_thresh = 0
for i in original_speeds:
    if i >= DEF_THRESH:
        ex_def_thresh += 1

# Run Elastic Pathing with shuffling (and rounding, and diff)
def shuffle_round_diff(skip_length=40, iterations=10, will_round=False, nearest=0,
                       will_diff=False, diff_min=0, diff_max=10):
    skips = [1]
    for i in range(1, len(original)):
        if i % skip_length == 0:
            skips.append(i)
    skips.append(len(original))