Example #1
def threshold(speeds, thresh=0, with_time=False):
	count = 0
	if with_time:
		for entry in speeds:
			if entry[0] >= thresh: count += 1
	else:
		for entry in speeds:
			if entry >= thresh: count += 1
	return 100*float(abs(count-ex_def_thresh))/float(ex_def_thresh)
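The return value is the percentage by which the above-threshold count of speeds deviates from the baseline count ex_def_thresh (a module-level global in the original script). A quick worked illustration with made-up numbers:

# Illustrative values only, not from the source data.
ex_def_thresh = 40   # samples above the threshold in the original trace
count = 34           # samples above the threshold after perturbation
print(100 * float(abs(count - ex_def_thresh)) / float(ex_def_thresh))  # 15.0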
Example #2
    def _validate_numeric(col: int, data: ty.Iterator) -> bool:
        vals = [row[col] for row in data]
        cleaned_vals = [val for val in vals if val not in const.MISSING_VALUES]
        try:
            for v in cleaned_vals:
                float(v)
        except Exception:
            return False
        return True
Example #3
def is_numeric(val: str) -> bool:
    """Check whether an unparsed string is a numeric value"""
    if val in MISSING_VALUES:
        return True

    try:
        float(val)
    except Exception:
        return False
    else:
        return True
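Illustrative calls, assuming the is_numeric definition above and a MISSING_VALUES sentinel set like the hypothetical one below:

MISSING_VALUES = {'', '.', 'NA', 'NaN'}  # stand-in for the module's real sentinel set

print(is_numeric('1.5e-3'))   # True
print(is_numeric('NA'))       # True -- missing markers count as acceptable
print(is_numeric('rs12345'))  # False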
Example #4
def diff(min=0, max=10, iterations=30):
	average = 0
	error = 0
	no_results = 0
	for i in range(iterations):
		new_speeds = [x + random.uniform(min, max) for x in original_speeds]
		for j in range(len(original)):
			cur.execute('UPDATE speeds SET speed = %f WHERE time = %d' % (new_speeds[j], original[j][1]))
		conn.commit()
		result = fast_float(os.popen('ruby elastic_pathing.rb ./%s ./map.sq3' % file).read().split(' ')[0], default=-1)
		if result >= 0:
			average += result
			error += threshold(new_speeds, thresh=DEF_THRESH)
		else: no_results += 1
	average = 100 - (100*float(average)/(dist*(iterations-no_results)))
	error /= float(iterations-no_results)
	restore_file()
	return (error, average)
Example #5
def parse_pval_to_log(value: str,
                      is_neg_log: bool = False) -> ty.Union[float, None]:
    """
    Parse a given number, and return the -log10 pvalue
    """
    if value in MISSING_VALUES or value is None:
        return None

    val = float(value)

    if is_neg_log:  # Take as is
        return val

    # Regular pvalue: validate and convert
    if val < 0 or val > 1:
        raise ValueError('p value is not in the allowed range')

    # 0-values are explicitly allowed and will convert to infinity by design, as they often indicate underflow errors
    #   in the input data.
    if val == 0:
        # Determine whether the underflow comes from the source data or from the value conversion
        if value == '0':
            # The source data is bad, so insert an obvious placeholder value
            return math.inf
        else:
            # h/t @welchr: aggressively turn the underflowing string value into -log10 via regex
            # Only do this if absolutely necessary, because it is a performance hit
            base, _, exponent = REGEX_PVAL.search(value).groups()
            base = float(base)

            if exponent != '':
                exp = float(exponent)
            else:
                exp = 0

            if base == 0:
                return math.inf

            return -(math.log10(float(base)) + float(exp))
    else:
        return -math.log10(val)
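For reference, a self-contained sketch of the core conversion this helper performs: ordinary p-values are serialized as -log10(p), so genome-wide significance (p = 5e-8) comes out near 7.3.

import math

for p in ('0.05', '5e-8', '1'):
    print(p, '->', round(-math.log10(float(p)), 3))
# 0.05 -> 1.301, 5e-8 -> 7.301, 1 -> -0.0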
Example #6
def shuffle_round_diff(skip_length=40, iterations=10, will_round=False, nearest=0, will_diff=False, diff_min=0, diff_max=10):
	skips = [1]
	for i in range(1, len(original)):
		if i % skip_length == 0: skips.append(i)
	skips.append(len(original))
	errors = [0.0] * len(skips)
	# Get errors for shuffling with different intervals
	for k in range(len(skips)):
		# Repeat to minimize randomness
		error = 0
		no_results = 0
		for i in range(iterations):
			# Set shuffle offset
			offset = 0
			# Make copy of original speed values
			# and round/diff if needed
			if will_round and will_diff: raise NotImplementedError('Rounding and diffing together is not supported')
			elif will_round: new_speeds = [round(x/nearest)*nearest for x in original_speeds]
			elif will_diff: new_speeds = [x + random.uniform(diff_min, diff_max) for x in original_speeds]
			else: new_speeds = original_speeds[:]
			# Shuffle values and save
			if skips[k] > 1:
				while offset < skips[-1]:
					end = offset + skips[k]
					if end > skips[-1]: end = skips[-1]
					shuffle_speeds = new_speeds[offset:end]
					shuffle(shuffle_speeds)
					new_speeds[offset:end] = shuffle_speeds
					offset += skips[k]
				for j in range(skips[-1]):
					cur.execute('UPDATE speeds SET speed = %f WHERE time = %d'\
					% (new_speeds[j], original[j][1]))
				conn.commit()
			# Run Elastic Pathing algorithm
			result = fast_float(os.popen('ruby elastic_pathing.rb ./%s ./map.sq3' % file).read().split(' ')[0], default=-1)
			if result >= 0: error += result
			else: no_results += 1
		threshold(new_speeds, thresh=DEF_THRESH)
		errors[k] = 100*float(error)/float(iterations-no_results)/dist
	restore_file()
	return (threshold(new_speeds, thresh=DEF_THRESH), [(skips[i], 100-errors[i]) for i in range(len(errors))])
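The perturbation above shuffles the speed trace only within windows of skips[k] samples, so larger windows destroy more of the local ordering. A stripped-down, self-contained sketch of that windowed shuffle (illustrative, not the original script):

import random

random.seed(0)
speeds = list(range(12))
window = 4
for start in range(0, len(speeds), window):
    chunk = speeds[start:start + window]
    random.shuffle(chunk)                  # permute values inside this window only
    speeds[start:start + window] = chunk
print(speeds)  # every value stays inside its original 4-element window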
Example #7
def norm_val(val, empty_as_null: bool) -> Union[bytes, int, float, None]:
    """Normalize a value"""
    if val is None:
        return None

    if fastnumbers.isfloat(val) or fastnumbers.isint(val):
        return fastnumbers.float(val)

    val = val.strip()
    if len(val) == 0 and empty_as_null:
        return None

    return val.encode("utf-8", "ignore")
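A rough usage sketch, assuming the norm_val definition above and the fastnumbers package: numeric-looking strings come back as floats, empty strings become None, and everything else is returned as UTF-8 bytes.

print(norm_val('3.5', empty_as_null=True))    # 3.5
print(norm_val('  ', empty_as_null=True))     # None (whitespace-only is treated as empty)
print(norm_val('hello', empty_as_null=True))  # b'hello'
print(norm_val(None, empty_as_null=True))     # None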
Example #8
def filter_rows(row_indices, query_col_index, query_col_coords):
    # Yield the indices of rows whose value in the query column passes the filter below.
    col_type = next(parse_data_values(query_col_index, max_column_type_length, [(query_col_index, 0, 1)], file_handles["ct"]))  # single-byte type code for the query column, read from the column-type ("ct") file

    if col_type == b"n":  # the query column holds numeric values (stored as bytes)
        for row_index in row_indices:
            if fastnumbers.float(next(parse_data_values(row_index, line_length, query_col_coords, file_handles["data"]))) >= 0.1:  # keep rows whose numeric value is at least 0.1
                yield row_index
    else:
        for row_index in row_indices:
            value = next(parse_data_values(row_index, line_length, query_col_coords, file_handles["data"]))  # raw bytes value for this row, read from the data file handle

            if value.startswith(b"A") or value.endswith(b"Z"):  # keep rows whose value starts with b"A" or ends with b"Z"
                yield row_index
Example #9
def filter_row(line, col_coords, query_col_index):
    col_type = next(
        parse_data_values(query_col_index, max_column_type_length,
                          [(query_col_index, 0, 1)], file_handles["ct"]))

    if col_type == b"n":
        for coords in col_coords:
            if fastnumbers.float(line[coords[1]:coords[2]].rstrip()) >= 0.1:
                yield coords
    else:
        for coords in col_coords:
            value = line[coords[1]:coords[2]].rstrip()
            if value.startswith(b"A") or value.endswith(b"Z"):
                yield coords
Example #10
def variant_parser(row: str) -> VariantContainer:
    """
    This is a stub parser that specifies how to parse a line. It could accept configuration in the future,
    e.g. different column numbers if more than one file held the same data arranged in different ways.

    It does the work of finding the fields, and of turning the text into numeric data where appropriate.

    The parser is the piece tied to the file format, so it must change if the file format changes!
    """

    fields = row.split('\t')
    # For now we clean up exactly three fields.
    # Revise if the data format changes!
    fields[0] = fields[0].replace('chr', '')  # chrom
    fields[1] = int(fields[1])  # pos
    fields[10] = float(fields[10])  # pvalue_nominal

    return VariantContainer(*fields)
Example #11
    def filter_rows_numeric(self, row_indices, the_filter, operator_dict,
                            data_handle, cc_handle, mccl, ll):
        if the_filter.operator not in operator_dict:
            raise Exception("Invalid operator: " + the_filter.operator)

        query_col_coords = list(
            parse_data_coords([the_filter.column_index], cc_handle, mccl))

        for row_index in row_indices:
            value = next(
                parse_data_values(row_index, ll, query_col_coords,
                                  data_handle)).rstrip()
            if value == b"":  # Is missing
                continue

            # See https://stackoverflow.com/questions/18591778/how-to-pass-an-operator-to-a-python-function
            if operator_dict[the_filter.operator](fastnumbers.float(value),
                                                  the_filter.query_value):
                yield row_index
Example #12
def parse_allele_frequency(
        *,
        freq: str = None,
        allele_count: str = None,
        n_samples: str = None,
        is_alt_effect: bool = True) -> ty.Union[float, None]:
    """
    Parse an allele frequency, OR convert counts to frequency.
    :param freq:
    :param allele_count:
    :param n_samples:
    :param is_alt_effect:
    :return:
    """
    if freq is not None and allele_count is not None:
        # A single uber-func is used because its extra complexity generally costs less than the performance penalty of calling two separate functions
        raise exceptions.ConfigurationException(
            'Frequency and allele count options are mutually exclusive')

    if freq is None and (allele_count in MISSING_VALUES or n_samples
                         in MISSING_VALUES):  # Allele count parsing
        return None
    elif freq is None and allele_count is not None:
        allele_freq = int(allele_count) / int(
            n_samples) / 2  # 2 alleles per sample
    elif freq in MISSING_VALUES:  # Frequency-based parsing
        return None
    else:
        allele_freq = float(freq)

    # No matter how the frequency is specified, this stuff is always done
    if allele_freq < 0 or allele_freq > 1:
        raise ValueError('Allele frequency is not in the allowed range')

    if not is_alt_effect:  # Orient the frequency to the alt allele
        return 1 - allele_freq
    else:
        return allele_freq
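A worked example of the count-based branch, with illustrative numbers: 150 copies of the allele observed across 1000 diploid samples gives 150 / 1000 / 2 = 0.075, and orienting to the other allele gives 1 - 0.075 = 0.925.

allele_count = 150   # copies of the allele observed
n_samples = 1000     # diploid samples, i.e. 2000 alleles total
allele_freq = int(allele_count) / int(n_samples) / 2
print(allele_freq)       # 0.075
print(1 - allele_freq)   # 0.925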
Example #13
    def __call__(self, row: str) -> VariantContainer:

        """
        This is a stub parser that specifies how to parse a line. It could accept configuration in the future,
        e.g. different column numbers if more than one file held the same data arranged in different ways.

        It does the work of finding the fields, and of turning the text into numeric data where appropriate.

        The parser is the piece tied to the file format, so it must change if the file format changes!
        """
        fields: ty.List[ty.Any] = row.split("\t")
        # Revise if data format changes!
        # fields[1] = fields[1].replace("chr", "")  # chrom
        if self.tissue and self.study:
            # Tissue-and-study-specific files have two fewer columns (study and tissue),
            # so those fields are prepended to match the column layout of the all-tissue file
            tissuevar = self.tissue
            fields = [self.study, tissuevar] + fields
        else:
            tissuevar = fields[1]

        # Field numbers. See also: https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/tabix/Columns.md
        # 0: study
        # 1: tissue
        # 2: molecular_trait_id
        #  for spliceQTLs, this looks like 'ENSG00000008128.grp_1.contained.ENST00000356200'
        # 3: chromosome
        # 4: position (int)
        # 5: ref
        # 6: alt
        # 7: variant (chr_pos_ref_alt)
        # 8: ma_samples (int)
        # 9: maf (float)
        # 10: pvalue (float)
        # 11: beta (float)
        # 12: se (float)
        # 13: type (SNP, INDEL, etc)
        # 14: ac (allele count) (int)
        # 15: an (total number of alleles = 2 * sample size) (int)
        # 16: r2 (float)
        # 17: molecular_trait_object_id
        #  for spliceQTLs, this looks like 'ENSG00000008128.contained'
        # 18: gene_id (ENSG#)
        # 19: median_tpm (float)
        # 20: rsid
        if self.datatype == "ge":
            fields[2] = None
        fields[4] = int(fields[4])  # pos
        fields[8] = int(fields[8])  # ma_samples
        fields[9] = float(fields[9])  # maf
        fields[10] = parser_utils.parse_pval_to_log(
            fields[10], is_neg_log=False
        )  # pvalue_nominal --> serialize as log
        fields[11] = float(fields[11])  # beta
        fields[12] = float(fields[12])  # stderr_beta
        fields[14] = int(fields[14])  # allele_count
        fields[15] = int(fields[15])  # total_number_of_alleles
        try:
            fields[16] = float(fields[16])  # r2
        except ValueError:
            # TODO: Make the "NA" -> None check more explicit
            fields[16] = None
        fields[19] = float(fields[19])  # median_tpm  # FIXME: Handle NA case

        # Append build
        build = "GRCh38"

        # Append tss_distance
        gene_tss = self.tss_dict.get(fields[18].split(".")[0], float("nan"))
        tss_distance = math.copysign(1, gene_tss) * (fields[4] - abs(gene_tss))
        tss_position = -abs(gene_tss)

        # Append gene symbol
        geneSymbol = self.gene_json.get(
            fields[18].split(".")[0], "Unknown_gene"
        )

        # Add tissue grouping and sample size from GTEx
        # tissue_data = TISSUE_DATA.get(tissuevar, ("Unknown_Tissue", None))
        # fields.extend(tissue_data)

        # Append system information
        tissueSystem = TISSUES_TO_SYSTEMS.get(tissuevar, "Unknown")
        if fields[2] is not None:
            (_, _, _, transcript) = fields[2].split(".")
        else:
            transcript = None

        fields.extend(
            [
                build,
                tss_distance,
                tss_position,
                geneSymbol,
                tissueSystem,
                transcript,
            ]
        )
        return VariantContainer(*fields)
Example #14
import mmap
import sys

import fastnumbers

file_path = sys.argv[1]
col_names_file_path = sys.argv[2]
out_file_path = sys.argv[3]
discrete_query_col_index = int(sys.argv[4])
num_query_col_index = int(sys.argv[5])
memory_map = True

col_index_range = getColIndicesToQuery(col_names_file_path, memory_map)

with open(file_path, 'rb') as my_file:
    if memory_map:
        my_file = mmap.mmap(my_file.fileno(), 0, prot=mmap.PROT_READ)

    with open(out_file_path, 'wb') as out_file:
        header_items = my_file.readline().rstrip(b"\n").split(b"\t")
        out_file.write(b"\t".join([header_items[i] for i in col_index_range]) + b"\n")

        match_count = 0

        for line in iter(my_file.readline, b""):
            line_items = line.rstrip(b"\n").split(b"\t")

            discrete_value = line_items[discrete_query_col_index]
            num_value = fastnumbers.float(line_items[num_query_col_index])

            if (discrete_value.startswith(b"A") or discrete_value.endswith(b"Z")) and num_value >= 0.1:
                out_file.write(b"\t".join([line_items[i] for i in col_index_range]) + b"\n")
                match_count += 1

    my_file.close()
Example #15
    def inner(line):
        # The stateful closure that does the actual work of parsing a single line
        try:
            fields = line.strip().split(delimiter)
            if len(fields) == 1:
                raise exceptions.LineParseException(
                    'Unable to split line into separate fields. This line may have a missing or incorrect delimiter.'
                )

            # Fetch values
            ref = None
            alt = None
            if _marker_col is not None:
                chrom, pos, ref, alt = utils.parse_marker(fields[_marker_col])
            else:
                chrom = fields[_chrom_col]
                pos = fields[_pos_col]

            if chrom.startswith('chr'):
                chrom = chrom[3:]

            chrom = chrom.upper()

            # Explicit columns will override a value from the marker, by design
            if _ref_col is not None:
                ref = fields[_ref_col]

            if _alt_col is not None:
                alt = fields[_alt_col]

            pval = fields[_pvalue_col]

            # Some optional fields
            rsid = None
            beta = None
            stderr_beta = None
            alt_allele_freq = None
            allele_count = None
            n_samples = None

            if _rsid_col is not None:
                rsid = fields[_rsid_col]
                if rsid in MISSING_VALUES:
                    rsid = None
                elif not rsid.startswith('rs'):
                    rsid = 'rs' + rsid

            if _beta_col is not None:
                beta = fields[_beta_col]

            if _stderr_col is not None:
                stderr_beta = fields[_stderr_col]

            if _allele_freq_col is not None:
                alt_allele_freq = fields[_allele_freq_col]

            if _allele_count_col is not None:
                allele_count = fields[_allele_count_col]
                n_samples = fields[_n_samples_col]

            # Perform type coercion
            log_pval = utils.parse_pval_to_log(pval,
                                               is_neg_log=_is_neg_log_pvalue)

            try:
                pos = int(pos)
            except ValueError:
                # Some programs seem to write long positions using scientific notation, which int cannot handle
                try:
                    pos = int(float(pos))
                except ValueError:
                    # If we still can't parse, it's probably bad data
                    raise exceptions.LineParseException(
                        'Positions should be specified as integers. Could not parse value: {}'
                        .format(pos))

            if beta is not None:
                beta = None if beta in MISSING_VALUES else float(beta)
            if stderr_beta is not None:
                stderr_beta = None if stderr_beta in MISSING_VALUES else float(
                    stderr_beta)

            if _allele_freq_col or _allele_count_col:
                alt_allele_freq = utils.parse_allele_frequency(
                    freq=alt_allele_freq,
                    allele_count=allele_count,
                    n_samples=n_samples,
                    is_alt_effect=_is_alt_effect)

            # Some old GWAS files simply won't provide ref or alt information, and the parser will need to do without
            if ref in MISSING_VALUES:
                ref = None

            if isinstance(ref, str):
                ref = ref.upper()

            if alt in MISSING_VALUES:
                alt = None

            if isinstance(alt, str):
                alt = alt.upper()

            result = container(chrom, pos, rsid, ref, alt, log_pval, beta,
                               stderr_beta, alt_allele_freq)
        except Exception as e:
            raise exceptions.LineParseException(str(e), line=line)
        return result
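The nested try/except around pos exists because int() rejects scientific notation, which some tools emit for large genomic positions; a short self-contained illustration:

pos = '1.23e8'
try:
    pos = int(pos)         # raises ValueError: int() cannot parse scientific notation
except ValueError:
    pos = int(float(pos))  # parse as float first, then truncate to an integer
print(pos)  # 123000000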
Example #16
def is_match(value, value_type):
    if value_type == b"n":
        return fastnumbers.float(value) >= 0.1
    else:
        return value.startswith(b"A") or value.endswith(b"Z")
Example #17
import msgpack
import os
import random
import string
import sys
import fastnumbers

dimensions = int(sys.argv[1])
out_file_path = sys.argv[2]

random.seed(0)

acgt = ["A", "C", "G", "T"]

num_homo_ref = int(fastnumbers.float(dimensions) * 0.7)
num_het = int(fastnumbers.float(dimensions) * 0.2)
num_homo_alt = int(fastnumbers.float(dimensions) * 0.1)

genotype_options = []
for ref_allele in acgt:
    for alt_allele in acgt:
        if ref_allele == alt_allele:
            continue

        homo_ref_gt = ref_allele * 2
        het_gt = ref_allele + alt_allele
        homo_alt_gt = alt_allele * 2

        genotype_options.append((homo_ref_gt, het_gt, homo_alt_gt))

num_genotype_options = len(genotype_options)
Example #18
    def __call__(self, row: str) -> CIContainer:
        # Columns in the raw credible_sets data:
        # phenotype_id: this corresponds to genes in gene expression data
        #               and both gene and credible interval in Txrevise data
        # variant_id: in chrom_pos_ref_alt format; we don't use this
        # chr
        # pos
        # ref
        # alt
        # cs_id: this is simply {phenotype_id}_{cs_index}
        # cs_index: credible set label, either L1 or L2
        # finemapped_region: a range for region tested, in chrom:start-end format
        # pip: generated using SuSie
        # z: z-score
        # cs_min_r2
        # cs_avg_r2
        # cs_size: credible set size, i.e. the number of variants contained in this credible set
        # posterior_mean: posterior effect size
        # posterior_sd: posterior standard deviation
        # cs_log10bf: log10 of the Bayes Factor for this credible set
        ### Extra columns added by joining main QTL data ###
        # ma_samples
        # maf
        # pvalue
        # beta
        # se
        # type
        # ac
        # an
        # r2
        # mol_trait_obj_id
        # gid
        # median_tpm
        # rsid
        fields: ty.List[ty.Any] = row.split("\t")
        if self.study and self.tissue:
            # Tissue-and-study-specific files have two fewer columns (study and tissue),
            # so those fields are prepended to match the column layout of the all-tissue file
            fields = [self.study, self.tissue] + fields
        fields[5] = int(fields[5])  # pos
        fields[11] = float(fields[11])  # pip
        fields[12] = float(fields[12])  # z
        fields[13] = float(fields[13])  # cs_min_r2
        fields[14] = float(fields[14])  # cs_avg_r2
        fields[15] = int(fields[15])  # cs_size
        fields[16] = float(fields[16])  # posterior_mean
        fields[17] = float(fields[17])  # posterior_sd
        fields[18] = float(fields[18])  # cs_log10bf
        # Extra fields from joined file
        if len(fields) > 19:
            fields[19] = int(fields[19])  # ma_samples
            fields[20] = float(fields[20])  # maf
            fields[21] = float(fields[21])  # pvalue
            fields[22] = float(fields[22])  # beta
            fields[23] = float(fields[23])  # se
            fields[25] = int(fields[25])  # ac
            fields[26] = int(fields[26])  # an
            fields[30] = float(fields[30])  # median_tpm

        return CIContainer(*fields)
Example #19
from random import shuffle
import matplotlib.pyplot as plt, numpy as np, os, random, re, sqlite3 as sql

# Track filename
file = 'track-5.sq3'
os.popen('cp central_new_jersey/P4/%s .' % file)

# Open and store file
conn = sql.connect(file)
cur = conn.cursor()
cur.execute('SELECT * FROM speeds')
original = cur.fetchall()
original_speeds = [i[0] for i in original]

# Distance travelled by car
dist = float(os.popen('ruby elastic_pathing.rb ./%s ./map.sq3' % file).read().split(' ')[1])


# Default threshold
DEF_THRESH = 20
# Default threshold exceeding number
ex_def_thresh = 0
for i in original_speeds:
	if i >= DEF_THRESH: ex_def_thresh += 1

# Run Elastic Pathing with shuffling (and rounding, and diff)
def shuffle_round_diff(skip_length=40, iterations=10, will_round=False, nearest=0, will_diff=False, diff_min=0, diff_max=10):
	skips = [1]
	for i in range(1, len(original)):
		if i % skip_length == 0: skips.append(i)
	skips.append(len(original))