def test_transform_multiple():
    """Check the default transform of two three-base sequences.

    Both sequences are expected to share the same x-coordinates (0 to 3.0 in
    steps of 0.5); only the expected y-coordinates differ.
    """
    shared_x = [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
    cases = (
        ("ATG", [0, 0.5, 0, -0.5, -1.0, -0.5, 0.0]),
        ("TTC", [0, -0.5, -1.0, -1.5, -2.0, -2.5, -2.0]),
    )
    for sequence, expected_y in cases:
        assert transform(sequence) == (shared_x, expected_y)
def test_end_y_value(s):
    """The last ``yau`` y-coordinate must match the weighted base counts of ``s``.

    Each A contributes ``-sqrt(3)/2``, each T ``+sqrt(3)/2``, each C ``+0.5``
    and each G ``-0.5``.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    root3_half = (3 ** 0.5) / 2
    expected = (
        (-root3_half * s.count("A"))
        + (root3_half * s.count("T"))
        + (0.5 * s.count("C"))
        + (-0.5 * s.count("G"))
    )
    y_coords = transform(s, method="yau")[1]
    assert y_coords[-1] == approx(expected)
def test_endpoint(s):
    """The ``gates`` walk must end at ``(n(G) - n(C), n(T) - n(A))``.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    # "If the function n(Z) denotes the number of occurrences of nucleotide Z
    # in a given sequence, the end-point of the sequence lies at coordinate
    # position [(n(G) - n(C)), (n(T) - n(A))]" (Gates, J. theor. Biol. 1986)
    result = transform(s, method="gates")
    final_x = result[0][-1]
    assert final_x == s.count("G") - s.count("C")
    final_y = result[1][-1]
    assert final_y == s.count("T") - s.count("A")
def test_qi(s):
    """Each ``qi`` y-value must equal the ``qi`` table entry for the pair starting there.

    Positions at which an IndexError is raised are skipped rather than failed.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    result = transform(s, method="qi")
    for start, _ in enumerate(s):
        try:
            # Read the transformed value first so that running past the end of
            # the output is what triggers the skip.
            observed = result[1][start]
            expected = qi[s[start:start + 2]]
            assert observed == expected
        except IndexError:
            continue
def transform_route():
    """Return the downsampled overview data for a submitted sequence.

    Reads ``seq``, ``seq_name`` and ``method`` from the request form, hashes
    the sequence with xxHash64 and looks for a cached file named
    ``<hash>.<method>.parquet.sz`` (under ``data/`` when ``LOCAL`` is set, via
    ``exists_on_s3`` otherwise).  On a cache hit the data frame is loaded from
    that cache; on a miss the sequence is transformed, written to ``data/``
    and, when not running locally, uploaded.

    Returns:
        The ``jsonify`` response for the pair ``(seq_hash, downsample(rows))``,
        where ``rows`` is the data frame converted to nested lists.

    Raises:
        KeyError: If one of the expected form fields is missing (raised by the
            form lookup itself).
    """
    sequence = request.form["seq"]
    seq_name = request.form["seq_name"]
    # NOTE(review): ``method`` is interpolated into a file name below; confirm
    # it is validated upstream before it reaches this handler.
    method = request.form["method"]

    logging.debug("Hashing seq")
    seq_hash = str(xxhash.xxh64(sequence).intdigest())

    # Build the cache file name once instead of re-formatting it at every use.
    filename = f"{seq_hash}.{method}.parquet.sz"
    local_path = f"data/{filename}"

    if LOCAL:
        exists = os.path.exists(local_path)
        location = "locally"
    else:
        exists = exists_on_s3(filename)
        location = "on S3"

    if exists:
        # Only report a hit when there actually was one.
        logging.debug(f"Found {seq_hash} {location}")
        df = pd.read_parquet(local_path) if LOCAL else query_x_range(filename)
    else:
        logging.debug(
            f"No previous transformation for {seq_name} found. Transforming..."
        )
        transformed = transform(sequence, method=method)

        logging.debug("Saving transformed data for " + seq_name)
        df = pd.DataFrame(dict(x=transformed[0], y=transformed[1]))
        df.to_parquet(local_path)

        if not LOCAL:
            logging.debug(f"Uploading {seq_hash} to S3")
            upload(filename)

    logging.debug(f"Got the overview data for {seq_hash}")
    zone = df.values.tolist()
    return jsonify((seq_hash, downsample(zone)))
def test_C():
    """Under ``gates``, upper- and lowercase C both give x = [0, -1], y = [0, 0]."""
    expected = ([0, -1], [0, 0])
    for base in ("C", "c"):
        assert transform(base, method="gates") == expected
def test_G():
    """Under ``gates``, upper- and lowercase G both give x = [0, 1], y = [0, 0]."""
    expected = ([0, 1], [0, 0])
    for base in ("G", "g"):
        assert transform(base, method="gates") == expected
def test_bad_seq():
    """The ``yau`` method must reject the sequence ``INVALID`` with a ValueError."""
    bad_sequence = "INVALID"
    with pytest.raises(ValueError):
        transform(bad_sequence, method="yau")
def test_end_x_value(s):
    """The last ``yau`` x-coordinate must match the weighted base counts of ``s``.

    Each C or G contributes ``sqrt(3)/2`` and each A or T contributes ``0.5``.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    strong_bases = s.count("C") + s.count("G")
    weak_bases = s.count("A") + s.count("T")
    expected = ((3 ** 0.5) / 2 * strong_bases) + (0.5 * weak_bases)
    x_coords = transform(s, method="yau")[0]
    assert x_coords[-1] == approx(expected)
def test_G():
    """Under ``yau``, upper- and lowercase G both give x = [0, sqrt(3)/2], y = [0, -0.5]."""
    expected = ([0, (3 ** 0.5) / 2], [0, -0.5])
    for base in ("G", "g"):
        assert transform(base, method="yau") == expected
def test_basic():
    """Check the exact ``yau-bp`` coordinates produced for the sequence ATGC."""
    expected_x = [0, 1, 2, 3, 4]
    expected_y = [0, -1, 0, -0.5, 0]
    assert transform("ATGC", method="yau-bp") == (expected_x, expected_y)
def test_end_y_value(s):
    """The last ``yau-bp`` y-coordinate must equal ``n(T) - n(A) + 0.5 * (n(C) - n(G))``.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    n_t, n_a = s.count("T"), s.count("A")
    n_c, n_g = s.count("C"), s.count("G")
    expected = n_t - n_a + 0.5 * (n_c - n_g)
    y_coords = transform(s, method="yau-bp")[1]
    assert y_coords[-1] == approx(expected)
def test_invalid_seq():
    """The default transform of the base N must keep y at 0 for all three points."""
    expected_x = [0, 0.5, 1.0]
    expected_y = [0, 0, 0]
    assert transform("N") == (expected_x, expected_y)
def test_transform_C():
    """By default, upper- and lowercase C both give x = [0, 0.5, 1.0], y = [0, -0.5, 0]."""
    expected = ([0, 0.5, 1.0], [0, -0.5, 0])
    for base in ("C", "c"):
        assert transform(base) == expected
def test_transform_G():
    """By default, upper- and lowercase G both give x = [0, 0.5, 1.0], y = [0, 0.5, 1.0]."""
    expected = ([0, 0.5, 1.0], [0, 0.5, 1.0])
    for base in ("G", "g"):
        assert transform(base) == expected
def test_transform_T():
    """By default, upper- and lowercase T both give x = [0, 0.5, 1.0], y = [0, -0.5, -1.0]."""
    expected = ([0, 0.5, 1.0], [0, -0.5, -1.0])
    for base in ("T", "t"):
        assert transform(base) == expected
def test_end_x_value(s):
    """The last ``yau-bp`` x-coordinate must equal the number of A, T, G and C in ``s``.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    base_total = sum(s.count(base) for base in "ATGC")
    x_coords = transform(s, method="yau-bp")[0]
    assert x_coords[-1] == approx(base_total)
def test_A():
    """Under ``gates``, upper- and lowercase A both give x = [0, 0], y = [0, -1]."""
    expected = ([0, 0], [0, -1])
    for base in ("A", "a"):
        assert transform(base, method="gates") == expected
def test_length(s):
    """``yau-bp`` must return ``len(s) + 1`` x-values and as many y-values.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    result = transform(s, method="yau-bp")
    expected_points = len(s) + 1  # the extra 1 is for the starting (0, 0) coord
    assert len(result[0]) == expected_points
    assert len(result[1]) == expected_points
def test_invalid():
    """The ``yau-bp`` method must reject the sequence ``invalid`` with a ValueError."""
    bad_sequence = "invalid"
    with pytest.raises(ValueError):
        transform(bad_sequence, method="yau-bp")
def test_length(s):
    """The default transform must return ``2 * len(s) + 1`` x-values and as many y-values.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    result = transform(s)
    expected_points = 2 * len(s) + 1  # the extra 1 is for the starting (0, 0) coord
    assert len(result[0]) == expected_points
    assert len(result[1]) == expected_points
def test_T():
    """Under ``yau``, upper- and lowercase T both give x = [0, 0.5], y = [0, sqrt(3)/2]."""
    expected = ([0, 0.5], [0, (3 ** 0.5) / 2])
    for base in ("T", "t"):
        assert transform(base, method="yau") == expected
def test_invalid_method():
    """An unrecognised method name must raise ValueError, even for an empty sequence."""
    unknown_method = "invalid"
    with pytest.raises(ValueError):
        transform("", method=unknown_method)
def test_C():
    """Under ``yau``, upper- and lowercase C both give x = [0, sqrt(3)/2], y = [0, 0.5]."""
    expected = ([0, (3 ** 0.5) / 2], [0, 0.5])
    for base in ("C", "c"):
        assert transform(base, method="yau") == expected
def test_transform_A():
    """By default, upper- and lowercase A both give x = [0, 0.5, 1.0], y = [0, 0.5, 0]."""
    expected = ([0, 0.5, 1.0], [0, 0.5, 0])
    for base in ("A", "a"):
        assert transform(base) == expected
def visualize(fasta, width, palette, color, hide, bar, title, separate, cols, link_x, link_y, output, offline, method,
              dimensions, skip, mode):
    """Plot the transformed coordinates of every sequence in the given FASTA files.

    Args:
        fasta: Collection of FASTA file paths to read.
        width: Line width used for every plotted line.
        palette: Key into ``small_palettes`` selecting the color palette.
        color: If falsy, every sequence is drawn in black.
        hide: If truthy, every line gets a legend entry and clicking an entry
            hides its line.
        bar: If truthy, show a ``tqdm`` progress bar when there is more than
            one sequence.
        title: Optional title for the single-figure layout; also used as the
            title of the saved HTML page.
        separate: If truthy, draw one figure per sequence (mode ``"seq"``) or
            per file (mode ``"file"``) and arrange them in a grid.
        cols: Number of grid columns when ``separate``; ``0`` selects
            ``ceil(sqrt(number of figures))``.
        link_x: If truthy, each figure reuses the previous figure's x range.
        link_y: If truthy, each figure reuses the previous figure's y range.
        output: Output path. The plot is saved only when this ends with
            ``.html``; otherwise it is shown.
        offline: If truthy, save with ``INLINE`` resources.
        method: Transformation method name passed to ``transform``.
        dimensions: ``(width, height)`` of each figure; falls back to
            ``(750, 500)`` when falsy.
        skip: If truthy, suppress the confirmation prompts.
        mode: ``"seq"``, ``"file"`` or ``"auto"``; ``"auto"`` is resolved to
            one of the other two before plotting.

    Raises:
        ValueError: If no FASTA file is provided.
    """
    # check filetype; an empty collection is as unusable as a missing one and
    # would otherwise only fail later with an unrelated max() error
    if not fasta:
        raise ValueError("Must provide FASTA file.")

    # handle selecting the palette
    palette = small_palettes[palette]

    # handle setting the dimensions automatically if not specified
    if not dimensions:
        dimensions = (750, 500)

    # with more records than palette entries, fall back to one color per file or to black
    record_count = sum(1 for _f in fasta for _record in Fasta(_f))
    if record_count > len(palette) and mode != "file":
        if len(fasta) > 1 and mode == "auto":
            if not skip:
                print("Visualizing each file in separate color. To override, provide mode selection.")
            mode = "file"
        else:
            print("Visualizing each sequence in black.")
            color = False

    # resolve a still-automatic mode, including after the black fallback above,
    # so that the `separate` layout below always knows how many figures to draw
    if mode == "auto":
        mode = "seq"

    # get all the sequences
    seqs = []
    color_counter = 0
    warned = False
    for _f in fasta:
        for seq in Fasta(_f, sequence_always_upper=True):
            if color:
                # index the palette by the number of colors needed so far (never fewer than 3)
                palette_size = color_counter + 1 if color_counter > 2 else 3
                seq_color = palette[palette_size][color_counter]
            else:
                seq_color = "black"
            seqs.append(Box(color=seq_color,
                            name=_f if mode == "file" else seq.name,
                            raw_seq=seq))

            # check the length of the seq (only prompt once)
            if len(seq) > 10000 and not skip and not warned:
                click.confirm("You are plotting long sequence ({} bp). This may be very slow. "
                              "Do you want to continue?".format(len(seq)), abort=True)
                warned = True

            if mode == "seq":
                color_counter += 1
        if mode == "file":
            color_counter += 1

    # warn if plotting a large number of seqs
    if len(seqs) > 500 and not skip:
        click.confirm("You are plotting a large number of sequences ({}). This may be very slow. "
                      "Do you want to continue?".format(len(seqs)), abort=True)

    # warn if using a bad method
    if max(len(seq.raw_seq) for seq in seqs) > 25 and method in ["qi", "randic"] and not skip:
        click.confirm("This method is not well suited to a sequence of this length. "
                      "Do you want to continue?", abort=True)

    axis_labels = {
        "squiggle": {"x": "position (BP)", "y": None},
        "gates": {"x": "C-G axis", "y": "A-T axis"},
        "yau": {"x": None, "y": None},
        "yau-bp": {"x": "position (BP)", "y": None},
        "randic": {"x": "position (BP)", "y": "nucleotide"},
        "qi": {"x": "position (BP)", "y": "dinucleotide"}
    }

    # the number of figures to draw is either the number of sequences or files (or 1)
    if separate:
        if mode == "seq":
            fig_count = len(seqs)
        elif mode == "file":
            fig_count = len(fasta)
    else:
        fig_count = 1

    fig = []
    for i in range(fig_count):

        # link the axes, if requested
        x_range = fig[i - 1].x_range if i > 0 and link_x else None
        y_range = fig[i - 1].y_range if i > 0 and link_y else None

        # the y axes for randic and qi are bases
        if method == "randic":
            y_range = ["A", "T", "G", "C"]
        elif method == "qi":
            y_range = ['AA', 'AC', 'AG', 'AT',
                       'CA', 'CC', 'CG', 'CT',
                       'GA', 'GC', 'GG', 'GT',
                       'TA', 'TC', 'TG', 'TT']

        fig.append(figure(x_axis_label=axis_labels[method]["x"],
                          y_axis_label=axis_labels[method]["y"],
                          title=title,
                          x_range=x_range,
                          y_range=y_range,
                          plot_width=dimensions[0],
                          plot_height=dimensions[1]))

    # show a progress bar if processing multiple files
    if len(seqs) > 1 and bar:
        _seqs = tqdm(seqs, unit=" seqs", leave=False)
    else:
        _seqs = seqs

    for i, seq in enumerate(_seqs):
        # perform the actual transformation
        transformed = transform(str(seq.raw_seq), method=method)

        # figure (no pun intended) which figure to plot the data on
        if separate:
            if mode == "seq":
                _fig = fig[i]
            elif mode == "file":
                # in file mode the sequence name is the path of its file
                _fig = fig[fasta.index(seq.name)]

            # add a title to the plot
            _fig.title = annotations.Title()
            if mode == "seq":
                _fig.title.text = seq.name
            elif mode == "file":
                _fig.title.text = click.format_filename(seq.name, shorten=True)
        else:
            _fig = fig[0]
            _fig.title = annotations.Title()

            # if only plotting on one figure, set up the title
            if title:
                _fig.title.text = title
            elif len(seqs) > 1 and not title and len(fasta) == 1:
                _fig.title.text = click.format_filename(fasta[0], shorten=True)
            elif len(seqs) == 1:
                # if just plotting one sequence, title it with the name of the sequence
                _fig.title.text = seq.name

        # randic and qi methods have categorical y axes
        if method == "randic":
            y = list(seq.raw_seq)
        elif method == "qi":
            pairs = [seq.raw_seq[start:start + 2] for start in range(len(seq.raw_seq))]
            # the final slice holds a single base and is dropped
            y = [str(pair) for pair in pairs if len(pair) == 2]
        else:
            y = transformed[1]

        # figure out whether to add a legend
        if (separate or not color or mode == "file" or len(seqs) == 1) and not hide:
            legend = None
        else:
            legend = click.format_filename(seq.name, shorten=True)

        # optimization for comparing large FASTA files without hiding:
        # only the last sequence of each color (i.e. of each file) gets a legend entry
        try:
            if mode == "file" and seqs[i + 1].color != seq.color and not separate:
                legend = click.format_filename(seq.name, shorten=True)
        except IndexError:
            # no next sequence: this is the very last one
            if mode == "file" and not separate:
                legend = click.format_filename(seq.name, shorten=True)

        # do the actual plotting
        _fig.line(x=transformed[0],
                  y=y,
                  line_width=width,
                  legend=legend,
                  color=seq.color)

        # set up the legend
        _fig.legend.location = "top_left"
        if hide:
            _fig.legend.click_policy = "hide"

    # clean up the tqdm bar (a plain list has no close())
    try:
        _seqs.close()
    except AttributeError:
        pass

    # lay out the figure
    if separate:
        # note that 0 denotes the automatic default
        plot = gridplot(fig,
                        ncols=math.ceil(len(fig) ** 0.5) if cols == 0 else cols,
                        toolbar_options=dict(logo=None))
    else:
        plot = fig[0]

    if output is not None and output.endswith(".html"):
        # use the caller's title for the page and fall back to the default only when none was given
        output_file(output, title=title if title is not None else "Squiggle Visualization")
        save(plot, resources=INLINE if offline else None)
    else:
        show(plot)
def test_T():
    """Under ``gates``, upper- and lowercase T both give x = [0, 0], y = [0, 1]."""
    expected = ([0, 0], [0, 1])
    for base in ("T", "t"):
        assert transform(base, method="gates") == expected
def test_A():
    """Under ``yau``, upper- and lowercase A both give x = [0, 0.5], y = [0, -sqrt(3)/2]."""
    expected = ([0, 0.5], [0, -(3 ** 0.5) / 2])
    for base in ("A", "a"):
        assert transform(base, method="yau") == expected
def test_randic(s):
    """Each ``randic`` y-value must equal the ``randic`` table entry for the base at that index.

    Args:
        s: Sequence under test (presumably supplied by a decorator or fixture
            that is not visible here).
    """
    result = transform(s, method="randic")
    for position, base in enumerate(s):
        observed = result[1][position]
        assert observed == randic[base]