def print_enrichment_chart(file_handle, vals, title):
    """Draw an enrichment curve and save the figure to ``file_handle``.

    The main axes plot ``vals`` against their positions.  A thin strip is
    added above them showing, for every position, whether the value rose
    relative to the previous one (the first position is compared with
    zero).  ``title`` is set on that strip.

    If matplotlib cannot be imported, an error message is written to
    ``sys.stderr`` and nothing is drawn.

    :Parameters:
        file_handle : path or file-like object
            Destination handed to matplotlib's ``savefig``.
        vals : sequence of numbers
            Values to plot; must contain at least one element.
        title : str
            Title shown above the strip.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("Error while printing. To use this functionality you need to have matplotlib installed.", file=sys.stderr)
        return

    fig, ax1 = plt.subplots()
    try:
        ys = vals
        ax1.plot(list(range(len(ys))), ys)

        # One-row image: 1 where the curve goes up, 0 elsewhere.
        rising = [int(ys[0] > 0)]
        rising.extend(int(ys[i] > ys[i - 1]) for i in range(1, len(ys)))

        # Place the strip directly on top of the main axes.
        pos = ax1.axes.get_position()
        ax0 = fig.add_axes([pos.x0, pos.y1, pos.width, 0.1])
        ax0.imshow([rising], cmap=plt.cm.Blues, interpolation='nearest')
        ax0.axes.get_yaxis().set_visible(False)
        ax0.axes.get_xaxis().set_visible(False)
        ax0.set_title(title)

        # Save *this* figure rather than whatever pyplot considers current.
        # NOTE(review): bbox_inches=0 is kept from the original call;
        # presumably it just disables tight cropping - confirm.
        fig.savefig(file_handle, bbox_inches=0)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
def test_limits(self):
    """Check line graphs."""
    # TODO - Fix GD so that the same min/max is used for all three lines?
    n_points = 1000
    step = math.pi * 2.0 / n_points
    positions = range(n_points)
    # (colour, y values) for each of the three curves, in drawing order.
    curves = (
        ("red", [math.sin(x * step) for x in positions]),
        ("green", [math.cos(x * step) for x in positions]),
        ("blue", [2 * math.sin(2 * x * step) for x in positions]),
    )

    diagram = Diagram('Test Diagram', circular=False,
                      y=0.01, yt=0.01, yb=0.01,
                      x=0.01, xl=0.01, xr=0.01)
    track = diagram.new_track(1, greytrack=False)
    graph_set = track.new_set("graph")
    for colour, values in curves:
        graph_set.new_graph(list(zip(positions, values)), "",
                            style="line", color=colour,
                            altcolor=colour, center=0)

    # Linear diagram
    diagram.draw(format='linear', tracklines=False,
                 pagesize=(15*cm, 15*cm), fragments=1,
                 start=0, end=n_points)
    diagram.write(os.path.join('Graphics', "line_graph.pdf"), "pdf")

    # Circular diagram
    diagram.draw(tracklines=False, pagesize=(15*cm, 15*cm),
                 circular=True,  # Data designed to be periodic
                 start=0, end=n_points, circle_core=0.5)
    diagram.write(os.path.join('Graphics', "line_graph_c.pdf"), "pdf")
def make_instances_from_counts(self):
    """Build "fake" instances for a motif that was created from counts.

    The number of instances equals the sum of the counts in the first
    column.  Any column holding fewer letters than that is padded with
    letters taken cyclically from the alphabet, and a warning is printed.

    Resets ``self.instances``, sets ``self.has_instances`` and returns
    ``self.instances`` after adding each instance via ``add_instance``.
    """
    letters = self.alphabet.letters
    padding_source = "".join(letters)
    self.has_instances = True
    self.instances = []
    depth = sum(self.counts[letter][0] for letter in letters)

    # columns[i] is column i of the (virtual) alignment of instances
    columns = []
    for pos in range(self.length):
        column = ""
        for letter in letters:
            column = column + letter * (self.counts[letter][pos])
        missing = depth - len(column)
        if missing > 0:
            print("WARNING, column too short %i %i" % (len(column), depth))
            column += (padding_source * depth)[:missing]
        columns.append(column)

    # read the columns row by row to obtain the instances
    for row in range(depth):
        residues = "".join(column[row] for column in columns)
        self.add_instance(Seq(residues, self.alphabet))
    return self.instances
def format_phylip(self, handle):
    """Write the distance matrix to ``handle`` in Phylip format.

    The output is the square distance-matrix layout read by Phylip
    programs such as 'neighbor'.  See:
    http://evolution.genetics.washington.edu/phylip/doc/neighbor.html

    :Parameters:
        handle : file or file-like object
            A writeable object supporting ``write`` (for example
            StringIO or sys.stdout).  On Python 3 it should be open in
            text mode.
    """
    handle.write(" {0}\n".format(len(self.names)))
    size = len(self.matrix)
    # Phylip needs space-separated, vertically aligned columns
    name_width = max(12, max(len(name) for name in self.names) + 1)
    placeholders = " ".join("{%d:.4f}" % k for k in range(1, size + 1))
    row_template = "{0:%ds}" % name_width + placeholders + "\n"
    for row_index, (name, stored) in enumerate(zip(self.names, self.matrix)):
        fields = [name]
        fields.extend(stored)
        # Values right of the diagonal are mirrored from the rows below.
        fields.extend(self.matrix[k][row_index]
                      for k in range(row_index + 1, size))
        handle.write(row_template.format(*fields))
def lowess(x, y, f=2. / 3., iter=3):
    """lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a
    scatterplot.  The arrays x and y contain an equal number of elements;
    each pair (x[i], y[i]) defines a data point in the scatterplot.  The
    function returns the estimated (smooth) values of y.

    The smoothing span is given by f.  A larger value for f will result
    in a smoother curve.  The number of robustifying iterations is given
    by iter.  The function will run faster with a smaller number of
    iterations.

    The neighbourhood rank derived from f is capped at the last valid
    index, so f >= 1 uses the whole data range.  If the median absolute
    residual becomes exactly zero, the robustifying iterations stop
    early, because the robustness weights would otherwise be 0/0.

    x and y should be numpy float arrays of equal length.  The return
    value is also a numpy float array of that length.

    e.g.

    >>> import numpy
    >>> x = numpy.array([4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12,
    ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
    ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
    ...                 20, 22, 23, 24, 24, 24, 24, 25], float)
    >>> y = numpy.array([2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
    ...                 64, 66, 54, 70, 92, 93, 120, 85], float)
    >>> result = lowess(x, y)
    >>> len(result)
    50
    >>> print("[%0.2f, ..., %0.2f]" % (result[0], result[-1]))
    [4.85, ..., 84.98]
    """
    n = len(x)
    # Index of the neighbour that defines the local bandwidth.  Capped at
    # n - 1 so that f >= 1 cannot index past the end of the sorted array.
    r = min(int(numpy.ceil(f * n)), n - 1)
    h = [numpy.sort(abs(x - x[i]))[r] for i in range(n)]
    # Tricube weights: w[j, i] is the weight of point j when fitting at x[i].
    w = numpy.clip(abs(([x] - numpy.transpose([x])) / h), 0.0, 1.0)
    w = 1 - w * w * w
    w = w * w * w
    yest = numpy.zeros(n)
    delta = numpy.ones(n)  # robustness weights, refreshed every iteration
    for iteration in range(iter):
        for i in range(n):
            # Weighted least-squares straight line fitted around x[i].
            weights = delta * w[:, i]
            weights_mul_x = weights * x
            b1 = numpy.dot(weights, y)
            b2 = numpy.dot(weights_mul_x, y)
            A11 = sum(weights)
            A12 = sum(weights_mul_x)
            A21 = A12
            A22 = numpy.dot(weights_mul_x, x)
            determinant = A11 * A22 - A12 * A21
            beta1 = (A22 * b1 - A12 * b2) / determinant
            beta2 = (A11 * b2 - A21 * b1) / determinant
            yest[i] = beta1 + beta2 * x[i]
        residuals = y - yest
        # `median` is a module-level name supplied by the enclosing file.
        s = median(abs(residuals))
        if s == 0:
            # At least half of the residuals are exactly zero; dividing by
            # 6 * s would fill delta (and then yest) with NaN.
            break
        # Bisquare robustness weights from the scaled residuals.
        delta[:] = numpy.clip(residuals / (6 * s), -1, 1)
        delta[:] = 1 - delta * delta
        delta[:] = delta * delta
    return yest
def forward_algorithm(self):
    """Calculate sequence probability using the forward algorithm.

    This implements the forward algorithm, as described on p57-58 of
    Durbin et al.

    Returns:

    o A dictionary containing the forward variables. This has keys of the
    form (state letter, position in the training sequence), and values
    containing the calculated forward variable.

    o The calculated probability of the sequence.
    """
    # every letter the state path can take
    states = self._seq.states.alphabet.letters
    emissions = self._seq.emissions

    # -- initialisation
    # Positions are zero-based here (Durbin et al. count from 1), so the
    # start position is -1: f_{0}(0) = 1 and f_{k}(0) = 0 for k > 0.
    forward_var = dict(((state, -1), int(rank == 0))
                       for rank, state in enumerate(states))

    # -- recursion over the training sequence (i = 1 .. L in Durbin)
    for position in range(len(emissions)):
        for state in states:
            # delegated so that subclasses can guard against underflow
            value = self._forward_recursion(state, position, forward_var)
            if value is None:
                continue
            forward_var[(state, position)] = value

    # -- termination: sum over k of f_{k}(L) * a_{k0}
    begin_state = states[0]
    last_position = len(emissions) - 1
    seq_prob = 0
    for state in states:
        seq_prob += (forward_var[(state, last_position)]
                     * self._mm.transition_prob[(state, begin_state)])

    return forward_var, seq_prob
def calculate(self, sequence):
    """Return the PWM score for a given sequence for all positions.

    Notes:

     - the sequence can only be a DNA sequence
     - the search is performed only on one strand
     - if the sequence and the motif have the same length, a single
       number is returned
     - otherwise, the result is a one-dimensional list or numpy array

    Raises ValueError if either the motif or the sequence does not use
    an unambiguous DNA alphabet.
    """
    # TODO - Code itself tolerates ambiguous bases (as NaN).
    if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA):
        raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs"
                         % self.alphabet)
    if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA):
        raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences"
                         % sequence.alphabet)

    # TODO - Force uppercase here and optimise switch statement in C
    # by assuming upper case?
    text = str(sequence)
    m = self.length
    n = len(text)

    # Prefer the fast C implementation when it can be imported.
    try:
        from . import _pwm
    except ImportError:
        # Pure Python fallback.  The C code handles mixed case, so the
        # Python version must too.
        text = text.upper()
        scores = []
        for start in range(n - m + 1):
            window_score = 0.0
            for offset in range(m):
                letter = text[start + offset]
                try:
                    window_score += self[letter][offset]
                except KeyError:
                    # letter outside the matrix: window score is undefined
                    window_score = float("nan")
                    break
            scores.append(window_score)
    else:
        # One row per motif position, log-odds ordered as A, C, G, T.
        logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)]
        scores = _pwm.calculate(text, logodds)

    if len(scores) == 1:
        return scores[0]
    return scores
def load_seqrecord(self, record):
    """Load a Biopython SeqRecord into the database.

    The bioentry row is created first; its id is then passed to every
    other loader (date, sequence, comment, dbxrefs, references,
    annotations and features).  References and features are ranked by
    their zero-based position in the record.
    """
    bioentry_id = self._load_bioentry_table(record)
    self._load_bioentry_date(record, bioentry_id)
    self._load_biosequence(record, bioentry_id)
    self._load_comment(record, bioentry_id)
    self._load_dbxrefs(record, bioentry_id)

    for rank, reference in enumerate(record.annotations.get("references", ())):
        self._load_reference(reference, rank, bioentry_id)

    self._load_annotations(record, bioentry_id)

    for feature_rank, feature in enumerate(record.features):
        self._load_seqfeature(feature, feature_rank, bioentry_id)
def test_illumina_to_sanger(self):
    """Mapping check for FASTQ Illumina (0 to 62) to Sanger (0 to 62)"""
    scores = list(range(63))
    bases = "N" * 63
    # Illumina FASTQ encodes a score as the character with code 64 + score.
    encoded = "".join(chr(64 + score) for score in range(0, 63))
    source = StringIO("@Test\n%s\n+\n%s" % (bases, encoded))
    target = StringIO()
    SeqIO.write(SeqIO.parse(source, "fastq-illumina"), target, "fastq-sanger")
    target.seek(0)
    converted = SeqIO.read(target, "fastq-sanger")
    self.assertEqual(str(converted.seq), bases)
    self.assertEqual(converted.letter_annotations["phred_quality"], scores)
def _calculate(self, sequence, m, n):
    """Score every length-``m`` window of ``sequence`` in pure Python.

    ``n`` is the sequence length supplied by the caller.  Each score is
    the sum of ``self[letter][position]`` over the window; a window
    containing a letter that is not a key of ``self`` scores NaN.
    Returns a list with ``n - m + 1`` scores (empty if that is <= 0).
    """
    # The C code handles mixed case so Python version must too:
    text = sequence.upper()

    def window_score(start):
        total = 0.0
        for offset in range(m):
            letter = text[start + offset]
            try:
                total += self[letter][offset]
            except KeyError:
                return float("nan")
        return total

    return [window_score(start) for start in range(n - m + 1)]
def dist_pearson_at(self, other, offset):
    """Return the similarity score based on pearson correlation at the given offset.

    Column ``pos + offset`` of ``self`` is paired with column ``pos`` of
    ``other``.  The sums are normalised by the number of cells in the
    combined span of both motifs, not only by the overlapping part.
    """
    letters = self.alphabet
    norm = max(self.length, offset + other.length) * len(letters)

    sum_x = 0.0   # \sum x
    sum_y = 0.0   # \sum y
    sum_xx = 0.0  # \sum x^2
    sum_xy = 0.0  # \sum x \cdot y
    sum_yy = 0.0  # \sum y^2
    overlap = min(self.length - offset, other.length)
    for pos in range(overlap):
        column_x = [self[letter, pos + offset] for letter in letters]
        column_y = [other[letter, pos] for letter in letters]
        sum_x += sum(column_x)
        sum_y += sum(column_y)
        sum_xx += sum(x * x for x in column_x)
        sum_xy += sum(x * y for x, y in zip(column_x, column_y))
        sum_yy += sum(y * y for y in column_y)

    mean_x = sum_x / norm
    mean_y = sum_y / norm
    mean_xx = sum_xx / norm
    mean_xy = sum_xy / norm
    mean_yy = sum_yy / norm

    covariance = mean_xy - mean_x * mean_y
    spread = math.sqrt((mean_xx - mean_x * mean_x) * (mean_yy - mean_y * mean_y))
    return covariance / spread
def normalize(self, pseudocounts=None):
    """Create and return a position-weight matrix by normalizing the counts matrix.

    If pseudocounts is None (default), no pseudocounts are added
    to the counts.

    If pseudocounts is a number, it is added to the counts before
    calculating the position-weight matrix.

    Alternatively, the pseudocounts can be a dictionary with a key
    for each letter in the alphabet associated with the motif.
    """
    per_letter = isinstance(pseudocounts, dict)
    counts = {}
    for letter in self.alphabet.letters:
        if pseudocounts is None:
            offset = 0.0
        elif per_letter:
            offset = float(pseudocounts[letter])
        else:
            offset = float(pseudocounts)
        counts[letter] = [offset + self[letter][i] for i in range(self.length)]
    # Actual normalization is done in the PositionWeightMatrix initializer
    return PositionWeightMatrix(self.alphabet, counts)
def std(self, background=None):
    """Standard deviation of the score of a motif.

    ``background`` maps letters to (unnormalised) frequencies; when it
    is None every letter gets the same weight.  Entries that are NaN or
    minus infinity are ignored.  The per-position variances are summed
    and the square root of the total is returned.
    """
    if background is None:
        background = dict.fromkeys(self._letters, 1.0)
    else:
        background = dict(background)  # copy: it is normalised in place
    total = sum(background.values())
    for letter in self._letters:
        background[letter] /= total

    variance = 0.0
    for position in range(self.length):
        first_moment = 0.0
        second_moment = 0.0
        for letter in self._letters:
            logodds = self[letter, position]
            if math.isnan(logodds) or (math.isinf(logodds) and logodds < 0):
                continue
            probability = background[letter] * math.pow(2, logodds)
            first_moment += probability * logodds
            second_moment += probability * logodds * logodds
        variance += second_moment - first_moment * first_moment
    variance = max(variance, 0)  # to avoid roundoff problems
    return math.sqrt(variance)
def _get_perms(self, gene_list, perms_no):
    """Return ``perms_no`` random permutations of ``gene_list``.

    A single working copy is shuffled repeatedly with ``random.shuffle``
    and a snapshot is stored after every shuffle; ``gene_list`` itself
    is left untouched.
    """
    working = list(gene_list)
    snapshots = []
    for _ in range(perms_no):
        random.shuffle(working)
        snapshots.append(working[:])
    return snapshots
def pwm(self, laplace=True):
    """Return the PWM computed for the set of instances.

    If laplace=True (default), pseudocounts equal to self.background
    multiplied by self.beta are added to all positions.

    The result is cached: while ``self._pwm_is_current`` is true the
    stored matrix is returned as is, whatever ``laplace`` says.
    """
    if self._pwm_is_current:
        return self._pwm

    # we need to compute new pwm
    self._pwm = []
    for position in range(self.length):
        letters = self.alphabet.letters
        # start every letter at its pseudocount (or at zero)
        if laplace:
            tally = dict((letter, self.beta * self.background[letter])
                         for letter in letters)
        else:
            tally = dict((letter, 0.0) for letter in letters)

        if self.has_counts:
            # taking the raw counts
            for letter in letters:
                tally[letter] += self.counts[letter][position]
        elif self.has_instances:
            # counting the occurrences of letters in instances
            for instance in self.instances:
                try:
                    tally[instance[position]] += 1
                except KeyError:
                    # we need to ignore non-alphabet letters
                    pass
        self._pwm.append(FreqTable.FreqTable(tally, FreqTable.COUNT, self.alphabet))
    self._pwm_is_current = 1
    return self._pwm
def __str__(self):
    """Get a lower triangular matrix string.

    Each line holds a name followed by its tab-separated row values; a
    final line lists all names, preceded by a tab.
    """
    lines = []
    for row_index in range(0, len(self)):
        cells = "\t".join(str(value) for value in self.matrix[row_index])
        lines.append(self.names[row_index] + "\t" + cells)
    footer = "\t".join(self.names)
    return "\n".join(lines) + "\n\t" + footer
def _crossover(self, x, no, locs):
    """Generalized Crossover Function.

    Arguments:
        - x (int) - genome number [0|1]
        - no (organism, organism) - new organisms
        - locs (int list, int list) - lists of locations,
          [0, +n points+, bound] for each genome (sync'd with x)

    Return type: sequence (to replace no[x])
    """
    child = no[x].genome[:locs[x][1]]
    for point in range(1, self._npoints):
        # flipflop between genome_0 and genome_1
        donor = (x + point) % 2
        # locs holds [0, +n points+, bound], so consecutive entries
        # delimit the segment taken from the donor genome
        begin = locs[donor][point]
        finish = locs[donor][point + 1]
        segment = no[donor].genome[begin:finish]
        child = child + segment if child else segment
    return child
def check_general_fails(self, filename, good_count):
    """Check the general FASTQ iterator fails after ``good_count`` records.

    The first ``good_count`` records of ``filename`` must parse into
    (title, sequence, quality) triples without error; fetching the next
    record must raise ValueError.
    """
    # The context manager closes the file even when a record fails to
    # parse or the assertion below fails.
    with open(filename, _universal_read_mode) as handle:
        tuples = QualityIO.FastqGeneralIterator(handle)
        for _ in range(good_count):
            title, seq, qual = next(tuples)  # Make sure no errors!
        self.assertRaises(ValueError, next, tuples)
def _gen_random_array(n):
    """Return an array of n random numbers summing to 1.0 (PRIVATE).

    The numbers are drawn with ``random.random`` and divided by their
    sum; the result is a plain list.
    """
    draws = []
    for _ in range(n):
        draws.append(random.random())
    scale = sum(draws)
    return [value / scale for value in draws]
def _load_bai(handle):
    """Read a BAM index from ``handle``.

    Returns ``(indexes, unmapped)``: one entry per reference, as
    produced by ``_load_ref_index``, and the trailing unmapped read
    count (None when the file ends before it).  Raises ValueError if the
    file does not start with the BAI magic bytes.  Any bytes left over
    at the end are reported on stdout.
    """
    magic = handle.read(4)
    if magic != _BAI_magic:
        raise ValueError("BAM index files should start %r, not %r"
                         % (_BAI_magic, magic))
    assert 4 == struct.calcsize("<i")
    assert 8 == struct.calcsize("<Q")

    n_ref = struct.unpack("<i", handle.read(4))[0]
    indexes = [_load_ref_index(handle) for _ in range(n_ref)]

    # This is missing on very old samtools index files,
    # and isn't in the SAM/BAM specification yet either.
    # This was reverse engineered vs "samtools idxstats"
    tail = handle.read(8)
    unmapped = struct.unpack("<Q", tail)[0] if tail else None

    leftover = handle.read()
    if leftover:
        print("%i extra bytes" % len(leftover))
        print(repr(leftover))
    return indexes, unmapped
def matches_schema(pattern, schema, ambiguity_character='*'):
    """Determine whether or not the given pattern matches the schema.

    Arguments:

    o pattern - A string representing the pattern we want to check for
    matching. This pattern can contain ambiguity characters (which are
    assumed to be the same as those in the schema).

    o schema - A string schema with ambiguity characters.

    o ambiguity_character - The character used for ambiguity in the schema.

    Returns 1 for a match and 0 otherwise (including different lengths).
    """
    if len(pattern) != len(schema):
        return 0

    for pattern_char, schema_char in zip(pattern, schema):
        wildcard = (schema_char == ambiguity_character
                    or pattern_char == ambiguity_character)
        # only two concrete, different characters make a mismatch
        if not wildcard and pattern_char != schema_char:
            return 0
    return 1
def insert(self, name, value, index=None):
    """Insert distances given the name and value.

    :Parameters:
        name : str
            name of a row/col to be inserted
        value : list
            a row/col of values to be inserted
        index : int, optional
            position of the new row/col; appended at the end when None

    Raises TypeError if ``name`` is not a str or ``index`` is not an int.
    """
    if not isinstance(name, str):
        raise TypeError("Invalid name type.")
    # insert at the given index or at the end
    if index is None:
        index = len(self)
    if not isinstance(index, int):
        raise TypeError("Invalid index type.")

    self.names.insert(index, name)
    # make room with zeros first; the real values are assigned below
    self.matrix.insert(index, [0] * index)
    for row_number in range(index, len(self)):
        self.matrix[row_number].insert(index, 0)
    self[index] = value
def representation(self, sequence):
    """Represent the given input sequence as a bunch of motif counts.

    Arguments:

    o sequence - A Bio.Seq object we are going to represent as schemas.

    This takes the sequence, searches for the motifs within it, and then
    returns counts specifying the relative number of times each motifs
    was found. The frequencies are in the order the original motifs were
    passed into the initializer.
    """
    raw_counts = [self._converter.num_matches(schema, str(sequence))
                  for schema in self._schemas]

    # normalize the counts to go between zero and one
    lowest = 0
    highest = max(raw_counts)

    # only normalize if we've actually found something, otherwise
    # we'll just return 0 for everything
    if not highest > 0:
        return raw_counts
    return [(float(count) - float(lowest)) / float(highest)
            for count in raw_counts]
def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
    """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).

    Qualifiers should be a dictionary of the form:
        {key : [value1, value2]}

    A value that is not a list is treated as a single-element list.
    """
    tag_ontology_id = self._get_ontology_id("Annotation Tags")
    insert_sql = (
        r"INSERT INTO seqfeature_qualifier_value "
        r" (seqfeature_id, term_id, rank, value) VALUES"
        r" (%s, %s, %s, %s)"
    )
    for key in qualifiers:
        if key == "db_xref":
            # The dbxref_id qualifier/value sets go into the dbxref table
            # as dbname, accession, version tuples, with dbxref.dbxref_id
            # being automatically assigned, and into the seqfeature_dbxref
            # table as seqfeature_id, dbxref_id, and rank tuples
            self._load_seqfeature_dbxref(qualifiers[key], seqfeature_id)
            continue

        # Every other qualifier goes into the seqfeature_qualifier_value
        # and (if new) term tables.
        term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
        entries = qualifiers[key]
        if not isinstance(entries, list):
            # Could be a plain string, or an int or a float.
            # However, we expect a list of strings here.
            entries = [entries]
        # ranks stored in the database start at 1
        for rank, value in enumerate(entries, 1):
            self.adaptor.execute(insert_sql,
                                 (seqfeature_id, term_id, rank, value))
def intermediate_points(start, end, graph_data):
    """Generate intermediate points describing provided graph data.

    Returns a list of (start, end, value) tuples describing the passed
    graph data as 'bins' between position midpoints.

    ``graph_data`` is a sequence of (position, value) pairs.  The first
    bin begins at ``start`` and the last bin stops at ``end``; all other
    boundaries lie halfway between neighbouring positions.  A single
    data point yields one bin covering ``start`` to ``end``, and empty
    data yields an empty list.
    """
    count = len(graph_data)
    if count == 0:
        return []
    if count == 1:
        # no neighbour to take a midpoint with: one bin spans everything
        return [(start, end, graph_data[0][1])]

    # boundary between every pair of neighbouring points
    midpoints = []
    for index in range(count - 1):
        left_x = graph_data[index][0]
        right_x = graph_data[index + 1][0]
        midpoints.append(left_x + (right_x - left_x) / 2.)

    lower_edges = [start] + midpoints
    upper_edges = midpoints + [end]
    # data in form (X0, X1, val)
    return [(lower, upper, graph_data[index][1])
            for index, (lower, upper) in enumerate(zip(lower_edges, upper_edges))]
def calculate_pseudocounts(motif):
    """Return per-letter pseudocounts for ``motif`` as a dictionary.

    Each pseudocount is the square root of the average column total of
    ``motif.counts`` multiplied by the normalised background frequency
    of the letter.  A uniform background is used when ``motif.background``
    is empty or None.
    """
    letters = motif.alphabet.letters

    # It is possible to have unequal column sums so use the average
    # number of instances.
    grand_total = 0
    for column in range(motif.length):
        grand_total += sum(float(motif.counts[letter][column]) for letter in letters)
    root_of_average = math.sqrt(grand_total / motif.length)

    if motif.background:
        frequencies = dict(motif.background)
    else:
        frequencies = dict.fromkeys(sorted(letters), 1.0)
    frequency_sum = sum(frequencies.values())

    pseudocounts = {}
    for letter in letters:
        frequencies[letter] /= frequency_sum
        pseudocounts[letter] = root_of_average * frequencies[letter]
    return pseudocounts
def kolmogorov_smirnov_rank_test(gene_set, gene_list, adj_corr, plot=False):
    """Rank test used in GSEA method.

    It measures dispersion of genes from gene_set over a gene_list.
    Every gene from gene_list has its weight specified by adj_corr,
    where adj_corr are gene weights (correlation with phenotype) already
    raised to the power of parameter p, changing weights importance.

    plot defines whether the method should also return the list of ES
    for each position in the ranking; if plot=False (default) the second
    returned object is None.

    Returns (Dn, stat_plot) where Dn is the running sum with the largest
    absolute value.

    Reference: http://www.pnas.org/content/102/43/15545.full
    """
    total = len(gene_list)

    # First pass: summed weight and number of the genes that are hits.
    hit_weight = 0
    hit_count = 0
    for rank, gene in enumerate(gene_list):
        if gene in gene_set:
            hit_weight += adj_corr[rank]
            hit_count += 1

    if total == hit_count:
        miss_penalty = 1.
    else:
        miss_penalty = float(1) / (total - hit_count)

    stat_plot = total * [None] if plot else None

    # Second pass: walk down the ranking keeping the extreme running sum.
    running = 0
    extreme = 0
    for rank, gene in enumerate(gene_list):
        if gene in gene_set:
            running += adj_corr[rank] / hit_weight
        else:
            running -= miss_penalty
        if plot:
            stat_plot[rank] = running
        if abs(running) > abs(extreme):
            extreme = running
    return (extreme, stat_plot)
def compare_sequence(old, new):
    """Compare two Seq or DBSeq objects.

    Checks length, string value, UnknownSeq-ness, element access and a
    range of slices (with and without a step) of both objects against
    the plain string form of ``old``.  Returns True; any difference
    trips an assertion.
    """
    assert len(old) == len(new), "%i vs %i" % (len(old), len(new))
    assert str(old) == str(new)
    # either both are UnknownSeq objects or neither is
    if isinstance(old, UnknownSeq):
        assert isinstance(new, UnknownSeq)
    else:
        assert not isinstance(new, UnknownSeq)

    ln = len(old)
    s = str(old)
    assert isinstance(s, str)

    # Don't check every single element; for long sequences
    # this takes far far far too long to run!
    # Test both positive and negative indices
    if ln < 50:
        indices = list(range(-ln, ln))
    else:
        # A selection of end cases, and the mid point
        indices = [-ln, -ln + 1, -(ln // 2), -1, 0, 1, ln // 2, ln - 2, ln - 1]

    both = (old, new)

    # Test element access,
    for i in indices:
        expected = s[i]
        for candidate in both:
            assert expected == candidate[i]

    # Test slices; the extra end points check overflows are coped with
    indices.extend([ln, ln + 1000])
    for i in indices:
        for j in indices:
            expected = s[i:j]
            for candidate in both:
                assert expected == str(candidate[i:j]), \
                    "Slice %s vs %s" % (repr(expected), repr(candidate[i:j]))
            # Slicing with step of 1 should make no difference.
            # Slicing with step 3 might be useful for codons.
            for step in [1, 3]:
                expected = s[i:j:step]
                for candidate in both:
                    assert expected == str(candidate[i:j:step])

        # Check automatic end points
        expected = s[i:]
        for candidate in both:
            assert expected == str(candidate[i:])
        expected = s[:i]
        for candidate in both:
            assert expected == str(candidate[:i])

    # Check "copy" splice
    for candidate in both:
        assert s == str(candidate[:])
    return True
def _gen_random_array(n):
    """Return a list of n random numbers whose elements sum to 1.0.

    The values come from ``random.random`` and are scaled by their sum.
    """
    samples = [random.random() for _ in range(n)]
    denominator = sum(samples)
    return [sample / denominator for sample in samples]
def __init__(self, alphabet, counts):
    """Initialise from counts and normalise every column to sum to one.

    After the generic matrix is set up, each position is divided by its
    column total and every row is frozen into a tuple.
    """
    GenericPositionMatrix.__init__(self, alphabet, counts)
    letters = alphabet.letters
    for position in range(self.length):
        column_total = sum(float(self[letter][position]) for letter in letters)
        for letter in letters:
            self[letter][position] /= column_total
    # rows are stored as tuples once they hold their final values
    for letter in letters:
        self[letter] = tuple(self[letter])
def dist_product(self, other):
    """Return a product-probability based distance and the best offset.

    A similarity measure taking into account a product probability of
    generating overlapping instances of two motifs.  Every relative
    offset of the two motifs is scored with ``dist_product_at``; the
    best score is divided by the score of ``self`` against itself and
    subtracted from one.

    Returns a tuple ``(distance, offset)``.  If no offset scores above
    zero, the distance is 1 and the offset is reported as 0.
    """
    max_p = 0.0
    # Default offset, used when no alignment scores above zero; without
    # it the return statement would hit an unbound local variable.
    max_o = 0
    for offset in range(-self.length + 1, other.length):
        if offset < 0:
            p = self.dist_product_at(other, -offset)
        else:  # offset >= 0
            p = other.dist_product_at(self, offset)
        if max_p < p:
            max_p = p
            max_o = -offset
    return 1 - max_p / self.dist_product_at(self, 0), max_o
def test_sanger_to_solexa(self):
    """Mapping check for FASTQ Sanger (0 to 93) to Solexa (-5 to 62)"""
    # The point of this test is the writing code doesn't actually use the
    # solexa_quality_from_phred function directly. For speed it uses a
    # cached dictionary of the mappings.
    bases = "N" * 94
    # Sanger FASTQ encodes a score as the character with code 33 + score.
    encoded = "".join(chr(33 + phred) for phred in range(0, 94))
    # Solexa scores written to the file are capped at 62.
    expected_sol = []
    for phred in range(0, 94):
        solexa = int(round(QualityIO.solexa_quality_from_phred(phred)))
        expected_sol.append(min(62, solexa))

    source = StringIO("@Test\n%s\n+\n%s" % (bases, encoded))
    target = StringIO()
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", BiopythonWarning)
        SeqIO.write(SeqIO.parse(source, "fastq-sanger"), target, "fastq-solexa")
        self.assertTrue(len(caught) <= 1, caught)
    target.seek(0)
    converted = SeqIO.read(target, "fastq-solexa")
    self.assertEqual(str(converted.seq), bases)
    self.assertEqual(converted.letter_annotations["solexa_quality"], expected_sol)
def __str__(self, masked=False):
    """Return string representation of a motif.

    The text is ``str(self.instances)`` when instances are present.
    With ``masked`` true, a line is appended that shows "*" for every
    masked position and a space for the others.
    """
    pieces = []
    if self.instances is not None:
        pieces.append(str(self.instances))
    if masked:
        pieces.append("".join("*" if self.__mask[i] else " "
                              for i in range(self.length)))
        pieces.append("\n")
    return "".join(pieces)
def fmt_cdt(sample_ids, rows):
    """Format as CDT.

    ``rows`` holds sequences whose first item is a probe (an object with
    a ``label`` attribute) followed by its values.  Returns
    ``(outheader, outrows)``: the column header and a list of rows that
    starts with the array-id row.
    """
    outheader = ['GID', 'CLID', 'NAME', 'GWEIGHT'] + sample_ids

    array_ids = ['ARRY%sX' % str(number).zfill(3)
                 for number in range(len(sample_ids))]
    outrows = [['AID', '', '', ''] + array_ids]

    for number, row in enumerate(rows):
        probe = row[0]
        # or probe.gene?
        outrow = ['GENE%dX' % number, 'IMAGE:%d' % number, probe.label, 1]
        outrow += row[1:]
        outrows.append(outrow)
    return outheader, outrows
def test_solexa_quality_from_phred(self):
    """Mapping check for function solexa_quality_from_phred"""
    # Expected rounded Solexa scores for PHRED 0 to 9, in order.
    low_end = (-5, -5, -2, 0, 2, 3, 5, 6, 7, 8)
    for phred, solexa in enumerate(low_end):
        self.assertEqual(solexa, round(QualityIO.solexa_quality_from_phred(phred)))
    # From PHRED 10 upwards the two scales agree after rounding.
    for phred in range(10, 100):
        self.assertEqual(phred, round(QualityIO.solexa_quality_from_phred(phred)))
def __init__(self, names, matrix=None):
    """Initialize matrix by a list of names and a list of
    lower triangular matrix data.

    When ``matrix`` is None a zero-filled lower triangular matrix
    matching ``names`` is created.

    Raises TypeError if ``names`` is not a list of strings or ``matrix``
    is not a list of numerical lists, and ValueError for duplicate
    names, a size mismatch or a matrix that is not lower triangular.
    """
    # check names
    if not (isinstance(names, list)
            and all(isinstance(s, str) for s in names)):
        raise TypeError("'names' should be a list of strings")
    if len(set(names)) != len(names):
        raise ValueError("Duplicate names found")
    self.names = names

    # check matrix
    if matrix is None:
        # create a new one with 0 if matrix is not assigned
        self.matrix = [[0] * width for width in range(1, len(self) + 1)]
        return

    # check if all elements are numbers
    well_formed = (isinstance(matrix, list)
                   and all(isinstance(row, list) for row in matrix)
                   and all(_is_numeric(item) for row in matrix for item in row))
    if not well_formed:
        raise TypeError("'matrix' should be a list of numerical lists")
    # check if the same length with names
    if len(matrix) != len(names):
        raise ValueError("'names' and 'matrix' should be the same size")
    # check if is lower triangle format
    if [len(row) for row in matrix] != list(range(1, len(self) + 1)):
        raise ValueError("'matrix' should be in lower triangle format")
    self.matrix = matrix
def backward_algorithm(self):
    """Calculate sequence probability using the backward algorithm.

    This implements the backward algorithm, as described on p58-59 of
    Durbin et al.

    Returns:

    o A dictionary containing the backwards variables. This has keys
    of the form (state letter, position in the training sequence),
    and values containing the calculated backward variable.
    """
    # every letter the state path can take
    states = self._seq.states.alphabet.letters
    emissions = self._seq.emissions

    # -- initialisation
    # Positions are zero-based here (Durbin et al. count from 1), so the
    # final position is L - 1: b_{k}(L) = a_{k0} for all k.
    begin_state = states[0]
    final_position = len(emissions) - 1
    backward_var = {}
    for state in states:
        backward_var[(state, final_position)] = \
            self._mm.transition_prob[(state, begin_state)]

    # -- recursion, walking the training sequence backwards
    # (i = L - 1 ... 1 in Durbin et al.)
    for position in reversed(range(final_position)):
        for state in states:
            # delegated so that subclasses can guard against underflow
            value = self._backward_recursion(state, position, backward_var)
            if value is None:
                continue
            backward_var[(state, position)] = value

    # skip the termination step to avoid recalculations -- you should
    # get sequence probabilities using the forward algorithm
    return backward_var
def degenerate_consensus(self):
    """Return the degenerate consensus sequence of the matrix as a Seq.

    Raises Exception("Unknown alphabet") if the alphabet is not a DNA,
    RNA or protein alphabet.
    """
    # Following the rules adapted from
    # D. R. Cavener: "Comparison of the consensus sequence flanking
    # translational start sites in Drosophila and vertebrates."
    # Nucleic Acids Research 15(4): 1353-1361. (1987).
    # The same rules are used by TRANSFAC.
    degenerate_nucleotide = {
        'A': 'A',
        'C': 'C',
        'G': 'G',
        'T': 'T',
        'AC': 'M',
        'AG': 'R',
        'AT': 'W',
        'CG': 'S',
        'CT': 'Y',
        'GT': 'K',
        'ACG': 'V',
        'ACT': 'H',
        'AGT': 'D',
        'CGT': 'B',
        'ACGT': 'N',
    }
    symbols = []
    for position in range(self.length):
        # letters of this column, most frequent first
        ranked = sorted(self, key=lambda letter: self[letter][position],
                        reverse=True)
        counts = [self[letter][position] for letter in ranked]
        # Follow the Cavener rules:
        if counts[0] > sum(counts[1:]) and counts[0] > 2 * counts[1]:
            key = ranked[0]
        elif 4 * sum(counts[:2]) > 3 * sum(counts):
            key = "".join(sorted(ranked[:2]))
        elif counts[3] == 0:
            key = "".join(sorted(ranked[:3]))
        else:
            key = "ACGT"
        symbols.append(degenerate_nucleotide.get(key, key))
    sequence = "".join(symbols)

    if isinstance(self.alphabet, Alphabet.DNAAlphabet):
        alpha = IUPAC.ambiguous_dna
    elif isinstance(self.alphabet, Alphabet.RNAAlphabet):
        alpha = IUPAC.ambiguous_rna
    elif isinstance(self.alphabet, Alphabet.ProteinAlphabet):
        alpha = IUPAC.protein
    else:
        raise Exception("Unknown alphabet")
    return Seq(sequence, alphabet=alpha)
def random_motif(self):
    """Create a random motif within the given parameters.

    Returns a single MutableSeq whose letters are drawn from the given
    alphabet.  The size is picked with
    random.randrange(min_size, max_size), so it is at least min_size and
    strictly less than max_size.
    """
    size = random.randrange(self._min_size, self._max_size)
    letters = [random.choice(self._alphabet.letters) for _ in range(size)]
    return MutableSeq("".join(letters), self._alphabet)
def __str__(self, masked=False):
    """Return the string representation of a motif.

    Every instance is written on its own line (via its tostring()
    method).  If masked is true, one extra line is appended holding a
    "*" for each position whose mask entry is true and a space for
    every other position.
    """
    lines = [inst.tostring() for inst in self.instances]
    if masked:
        lines.append("".join("*" if self.mask[i] else " "
                             for i in range(self.length)))
    # every line, including the mask line, is newline-terminated
    return "".join(line + "\n" for line in lines)
def __delitem__(self, item):
    """Delete related distances by the index or name.

    item may be an int (used directly as the index) or a str (looked up
    in self.names).  Any other type raises TypeError.
    """
    if isinstance(item, int):
        position = item
    elif isinstance(item, str):
        position = self.names.index(item)
    else:
        raise TypeError("Invalid index type.")
    # every later row holds one entry for the removed element: drop it
    for row_number in range(position + 1, len(self)):
        del self.matrix[row_number][position]
    # then drop the element's own row and its name
    del self.matrix[position]
    del self.names[position]
def test_solexa_to_sanger(self):
    """Mapping check for FASTQ Solexa (-5 to 62) to Sanger (0 to 62)"""
    # The point of this test is the writing code doesn't actually use the
    # solexa_quality_from_phred function directly. For speed it uses a
    # cached dictionary of the mappings.
    seq = "N" * 68
    qual = "".join(chr(64 + q) for q in range(-5, 63))
    expected_phred = [round(QualityIO.phred_quality_from_solexa(q))
                      for q in range(-5, 63)]
    in_handle = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
    out_handle = StringIO()
    # Want to ignore the data loss warning.  catch_warnings() restores the
    # filter list on exit; the previous simplefilter()/filters.pop() pair
    # inserted the filter at the front of the list but removed the last
    # entry, leaking the "ignore" filter and dropping an unrelated one.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', BiopythonWarning)
        SeqIO.write(SeqIO.parse(in_handle, "fastq-solexa"),
                    out_handle, "fastq-sanger")
    out_handle.seek(0)
    record = SeqIO.read(out_handle, "fastq-sanger")
    self.assertEqual(str(record.seq), seq)
    self.assertEqual(record.letter_annotations["phred_quality"],
                     expected_phred)
def mutate(self, organism):
    """Mutate the organism's genome.

    Works on a copy of the organism.  Each gene position gets its own
    random roll; when the roll is at or below the mutation rate, the
    gene is replaced by a letter chosen from the genome alphabet.
    Returns the (possibly) mutated copy.
    """
    mutant = organism.copy()
    genome = mutant.genome
    letters = genome.alphabet.letters
    # potentially mutate any gene in the genome
    for position in range(len(genome)):
        if self._mutation_rand.random() <= self._mutation_rate:
            genome[position] = self._switch_rand.choice(letters)
    return mutant
def anticonsensus(self):
    """Return the sequence built from the least frequent letters.

    For every position the letter with the smallest value of
    self[letter][position] is chosen; on ties the letter that comes
    first in self.alphabet.letters wins.
    """
    try:
        infinity = float("inf")
    except ValueError:
        # On Python 2.5 or older that was handled in C code,
        # and failed on Windows XP 32bit
        infinity = 1E400
    chosen = []
    for position in range(self.length):
        lowest = infinity
        for letter in self.alphabet.letters:
            value = self[letter][position]
            if value < lowest:
                lowest, sequence_letter = value, letter
        chosen.append(sequence_letter)
    return Seq("".join(chosen), self.alphabet)
def search_pwm(self, sequence, normalized=0, masked=0, threshold=0.0, both=True):
    """Generate hits in a sequence whose pwm score exceeds the threshold.

    The sequence is converted to an upper-case string and every start
    position is scored with score_hit().  Forward hits are yielded as
    (position, score).  If both is true the reverse complement motif is
    scored at the same position too and its hits are yielded as
    (-position, score).
    """
    strands = [(self, 1)]
    if both:
        strands.append((self.reverse_complement(), -1))
    text = str(sequence).upper()
    for start in range(len(text) - self.length + 1):
        for motif, sign in strands:
            score = motif.score_hit(text, start, normalized, masked)
            if score > threshold:
                yield (sign * start, score)
def mutate(self, organism):
    """Mutate the organism's genome.

    Works on a copy of the organism.  A single random roll decides
    whether a mutation happens at all; if it does, exactly one randomly
    chosen position receives a letter chosen from the genome alphabet.
    Returns the (possibly) mutated copy.
    """
    mutant = organism.copy()
    letters = mutant.genome.alphabet.letters
    roll = self._mutation_rand.random()
    if roll <= self._mutation_rate:
        # pick a gene position to mutate at
        candidates = list(range(len(mutant.genome)))
        target = self._pos_rand.choice(candidates)
        # get a new letter to replace the position at
        mutant.genome[target] = self._switch_rand.choice(letters)
    return mutant
def add_instance(self, instance):
    """Add a new instance to the motif.

    The instance alphabet and length are validated first.  When the
    motif has counts they are incremented for the instance letters; the
    instance itself is stored unless the motif has counts but no
    instances.  Cached pwm and log-odds matrices are marked stale.
    """
    self._check_alphabet(instance.alphabet)
    self._check_length(len(instance))
    if self.has_counts:
        counts = self.counts
        for position in range(self.length):
            counts[instance[position]][position] += 1
    counts_only = self.has_counts and not self.has_instances
    if not counts_only:
        self.instances.append(instance)
        self.has_instances = True
    # anything derived from the old data is out of date now
    self._pwm_is_current = False
    self._log_odds_is_current = False
def log_odds(self, laplace=True):
    """Return the log odds matrix computed for the set of instances.

    The result is a list with one dictionary per position, mapping each
    alphabet letter to log2(pwm / background).  The matrix is cached on
    the motif and reused while it is flagged as current.

    NOTE(review): a cached matrix is returned whatever value of laplace
    is passed; the flag does not record which setting produced it.
    """
    if self._log_odds_is_current:
        return self._log_odds
    # we need to compute new pwm
    rows = self._log_odds = []
    pwm = self.pwm(laplace)
    for position in range(self.length):
        rows.append(dict(
            (letter,
             math.log(pwm[position][letter] / self.background[letter], 2))
            for letter in self.alphabet.letters))
    self._log_odds_is_current = 1
    return rows
def update(self, inputs):
    """Update the values of the nodes using given inputs.

    Arguments:
     - inputs -- A list of inputs into the network -- this must be
       equal to the number of nodes in the layer.

    Raises ValueError if the number of inputs is not one less than the
    number of stored values (value 0 is not set from the inputs).
    """
    expected = len(self.values) - 1
    if len(inputs) != expected:
        raise ValueError("Inputs do not match input layer nodes.")
    # set the node values from the inputs; node numbering starts at 1
    for node, value in enumerate(inputs, 1):
        self.values[node] = value
    # propagate the update to the next layer
    self._next_layer.update(self)
def consensus(self):
    """Return the consensus sequence.

    For every position the letter with the largest value of
    self[letter][position] is chosen; on ties the letter that comes
    first when iterating over self.alphabet wins.
    """
    try:
        minus_infinity = float("-inf")
    except ValueError:
        # On Python 2.5 or older that was handled in C code,
        # and failed on Windows XP 32bit
        minus_infinity = - 1E400
    chosen = []
    for position in range(self.length):
        highest = minus_infinity
        for letter in self.alphabet:
            value = self[letter][position]
            if value > highest:
                highest, sequence_letter = value, letter
        chosen.append(sequence_letter)
    return Seq("".join(chosen))
def get_background(target_bed, access_bed, avg_bin_size, min_bin_size):
    """Generate background intervals from target intervals.

    Procedure:

    - Group the target coordinates by chromosome
    - Take the accessible regions from access_bed or, when none is
      given, guess them from the targets (skipping a fixed number of
      leading bases that are probably telomeric)
    - Ask find_background_regions for the background regions, passing
      2 * INSERT_SIZE (presumably the margin kept around targets --
      confirm against that helper)
    - For each of the resulting regions, in chromosome order:

      - If it's smaller than min_bin_size, skip
      - Divide into round(region_size / avg_bin_size) portions of
        equal size (at least one)
      - Emit the (chrom, start, end) coords of each portion
    """
    target_chroms = group_coords(RA.read(target_bed).coords())
    if access_bed:
        # Chromosome accessible sequence regions are given -- use them
        access_chroms = group_coords(RA.read(access_bed).coords())
    else:
        # Chromosome accessible sequence regions not known -- use heuristics
        # (chromosome length is endpoint of last probe; skip initial
        # <magic number> of bases that are probably telomeric)
        TELOMERE_SIZE = 150000
        access_chroms = guess_chromosome_regions(target_chroms,
                                                 TELOMERE_SIZE)

    backgrounds = find_background_regions(access_chroms, target_chroms,
                                          2 * INSERT_SIZE)

    # Emit regions as antitarget bins according to avg_bin_size and
    # min_bin_size.
    # NOTE(review): an earlier comment promised a set operation to drop
    # duplicate regions; none is performed here, the regions are only
    # sorted.
    for chrom, start, end in sorted(backgrounds,
                                    key=core.sorter_chrom_at(0)):
        span = end - start
        if span < min_bin_size:
            continue
        nbins = int(round(span / avg_bin_size)) or 1
        bin_size = span / nbins
        # interior boundaries; the region's own start and end close the
        # first and last bin so no bases are lost to rounding
        edges = [start]
        edges.extend(start + int(i * bin_size) for i in range(1, nbins))
        edges.append(end)
        for bin_start, bin_end in zip(edges[:-1], edges[1:]):
            yield (chrom, bin_start, bin_end)
def _generate_locs(self, bound):
    """Generalized Location Generator.

    Arguments:
     - bound (int)   - upper bound

    Returns:
    [0]+x_0...x_n+[bound]
    where n=self._npoints-1
    and 0 < x_0 < x_1 ... < bound

    Raises ValueError if self._npoints distinct integers cannot be
    found strictly between 0 and bound.
    """
    # Only bound - 1 distinct values exist in 1 .. bound - 1; asking for
    # more would make the rejection loop below spin forever.
    if self._npoints > max(bound - 1, 0):
        raise ValueError("Cannot pick %i distinct points between 0 and %i"
                         % (self._npoints, bound))
    results = []
    for _ in range(self._npoints):
        x = random.randint(1, bound - 1)
        while x in results:  # uniqueness
            x = random.randint(1, bound - 1)
        results.append(x)
    results.sort()  # sorted
    return [0] + results + [bound]  # [0, +n points+, bound]
def dist_pearson(self, other):
    """Return the similarity score based on pearson correlation for the given motif against self.

    We use the Pearson's correlation of the respective probabilities.
    Every relative offset of the two motifs is tried and the best
    correlation is kept (the first one found on ties).

    Returns a tuple (1 - best correlation, offset), where the offset is
    the negated loop offset at which the best correlation was found.
    Raises ValueError if the motifs use different alphabets.
    """
    if self.alphabet != other.alphabet:
        raise ValueError("Cannot compare motifs with different alphabets")

    best_score = -2
    for shift in range(1 - self.length, other.length):
        if shift < 0:
            score = self.dist_pearson_at(other, -shift)
        else:  # shift >= 0
            score = other.dist_pearson_at(self, shift)
        if score > best_score:
            best_score, best_offset = score, -shift
    return 1 - best_score, best_offset
def search(self, sequence, threshold=0.0, both=True):
    """Generate hits in a sequence whose score exceeds the threshold.

    The sequence is upper-cased and every window of the motif length is
    scored with calculate().  Forward hits are yielded as
    (position, score).  If both is true the reverse complement is
    scored on the same window and its hits are yielded as
    (position - len(sequence), score), i.e. with a negative position.
    """
    text = sequence.upper()
    text_length = len(text)
    width = self.length
    # (scorer, amount subtracted from the reported position)
    strands = [(self, 0)]
    if both:
        strands.append((self.reverse_complement(), text_length))
    for start in range(text_length - width + 1):
        window = text[start:start + width]
        for motif, shift in strands:
            score = motif.calculate(window)
            if score > threshold:
                yield (start - shift, score)
def __init__(self, num_nodes, has_bias_node):
    """Initialize the layer.

    Arguments:
     - num_nodes -- The number of nodes that are contained in this
       layer.
     - has_bias_node -- Specify whether or not this node has a bias
       node. This node is not included in the number of nodes in the
       network, but is used in constructing and dealing with the
       network.

    Nodes are numbered 1 .. num_nodes; when a bias node is requested
    the numbering starts at 0 instead.
    """
    first_node = 0 if has_bias_node else 1
    # specify all of the nodes in the network
    self.nodes = list(range(first_node, num_nodes + 1))
    self.weights = {}
def make_counts_from_instances(self):
    """Create the count matrix for a motif with instances.

    Builds a dictionary mapping every alphabet letter to a list holding,
    for each position, how many instances carry that letter there.  The
    dictionary is stored in self.counts and also returned.

    Raises KeyError if an instance contains a letter outside the
    alphabet; in that case self.counts and self.has_counts are left
    untouched.
    """
    counts = dict((letter, [0] * self.length)
                  for letter in self.alphabet.letters)
    for instance in self.instances:
        for position in range(self.length):
            counts[instance[position]][position] += 1
    # only flag the motif as having counts once they have been built
    self.counts = counts
    self.has_counts = True
    return counts
def dist_dpq_at(self, other, offset):
    """Calculate the dist_dpq measure with a given offset.

    offset should satisfy 0<=offset<=len(self)

    Column i of self is compared with column i - offset of other, for
    i up to max(self.length, offset + other.length), and the per-column
    distances are added up.
    """
    def column_distance(first, second, alphabet):
        # square root of the summed log2 ratios of each distribution
        # against the average of the two
        total = 0
        for letter in alphabet.letters:
            mean = (first[letter] + second[letter]) / 2
            total += first[letter] * math.log(first[letter] / mean, 2) + second[letter] * math.log(second[letter] / mean, 2)
        return math.sqrt(total)

    columns = max(self.length, offset + other.length)
    return sum(column_distance(self[i], other[i - offset], self.alphabet)
               for i in range(columns))
def test_phred_quality_from_solexa(self):
    """Mapping check for function phred_quality_from_solexa"""
    # (solexa score, expected rounded PHRED score) for the low range,
    # where the two scales differ
    low_scores = [
        (-5, 1), (-4, 1), (-3, 2), (-2, 2), (-1, 3),
        (0, 3), (1, 4), (2, 4), (3, 5), (4, 5),
        (5, 6), (6, 7), (7, 8), (8, 9), (9, 10),
    ]
    for solexa, phred in low_scores:
        self.assertEqual(
            phred, round(QualityIO.phred_quality_from_solexa(solexa)))
    # from 10 upwards both scales round to the same integer
    for score in range(10, 100):
        self.assertEqual(
            score, round(QualityIO.phred_quality_from_solexa(score)))
def score_hit(self, sequence, position, normalized=0, masked=0):
    """Give the pwm score for a given position.

    Sums the log-odds of the letters of sequence starting at position,
    one per motif column.  Letters that have no log-odds entry are
    skipped.  If masked is true, only columns whose mask entry is true
    contribute.  If normalized is true, the sum is divided by the
    number of contributing columns (the motif length, or the number of
    true mask entries when masked).

    Raises IndexError if the motif runs past the end of the sequence.
    """
    log_odds = self.log_odds()
    score = 0.0
    for offset in range(self.length):
        letter = sequence[position + offset]
        if masked and not self.mask[offset]:
            continue
        try:
            score += log_odds[offset][letter]
        except KeyError:
            # A letter without a log-odds entry contributes nothing.
            # Only this case is ignored: the former bare "except" also
            # hid unrelated errors and interrupts.
            pass
    if normalized:
        if masked:
            score /= len([x for x in self.mask if x])
        else:
            score /= self.length
    return score
def log_odds(self, background=None):
    """Return the Position-Specific Scoring Matrix.

    The Position-Specific Scoring Matrix (PSSM) contains the log-odds
    scores computed from the probability matrix and the background
    probabilities. If the background is None, a uniform background
    distribution is assumed.

    The background is copied and divided by the sum of its values before
    use.  For each letter and position the score is log2(p / b); when p
    is not positive the score is minus infinity, when b is not positive
    it is plus infinity for a positive p and NaN otherwise.
    """
    alphabet = self.alphabet
    if background is None:
        background = dict.fromkeys(self._letters, 1.0)
    else:
        background = dict(background)
    total = sum(background.values())
    values = {}
    for letter in alphabet.letters:
        background[letter] /= total
        values[letter] = []
    # TODO - Ensure the infinite and NaN branches have unittest coverage!
    try:
        minus_infinity = float("-inf")
    except ValueError:
        # On Python 2.5 or older that was handled in C code,
        # and failed on Windows XP 32bit
        minus_infinity = -1E400
    for i in range(self.length):
        for letter in alphabet.letters:
            b = background[letter]
            p = self[letter][i]
            has_background = b > 0
            has_probability = p > 0
            if has_background and has_probability:
                logodds = math.log(p / b, 2)
            elif has_background:
                logodds = minus_infinity
            elif has_probability:
                logodds = float("inf")
            else:
                logodds = _nan
            values[letter].append(logodds)
    return PositionSpecificScoringMatrix(alphabet, values)