def test_weight(self):
    """Check that the 'automatic' DNA prior is scaled by the prior weight.

    Without an explicit weight the test expects a total weight of 2.0; with
    an explicit positional weight the uniform distribution is expected to be
    scaled by exactly that value.
    """
    # Default weight: "automatic" on the DNA alphabet.
    expected_default = 2.0 * equiprobable_distribution(4)
    observed_default = parse_prior("automatic", unambiguous_dna_alphabet)
    self.assertTrue(all(expected_default == observed_default))

    # Explicit weight passed as the third positional argument.
    expected_custom = 123.123 * equiprobable_distribution(4)
    observed_custom = parse_prior("auto", unambiguous_dna_alphabet, 123.123)
    self.assertTrue(all(expected_custom == observed_custom))
def test_parse_prior_equiprobable(self):
    """Check the 'equiprobable' composition keyword.

    The keyword must give a uniform distribution over the alphabet, scaled
    by the weight, and must tolerate surrounding whitespace and mixed case.
    """
    # Protein alphabet (20 symbols), weight given by keyword.
    protein_prior = parse_prior(
        "equiprobable", unambiguous_protein_alphabet, weight=20.0)
    self.assertTrue(
        all(20.0 * equiprobable_distribution(20) == protein_prior))

    # Custom three-letter alphabet, padded and oddly-cased keyword,
    # weight given positionally.
    custom_prior = parse_prior(" equiprobablE ", Alphabet("123"), 1.2)
    self.assertTrue(
        all(1.2 * equiprobable_distribution(3) == custom_prior))
def test_parse_prior_equiprobable(self):
    """Verify that 'equiprobable' yields a weight-scaled uniform prior.

    Covers the protein alphabet with a keyword weight, and a custom
    three-symbol alphabet with a whitespace-padded, mixed-case keyword.
    """
    expected = 20. * equiprobable_distribution(20)
    observed = parse_prior(
        'equiprobable', unambiguous_protein_alphabet, weight=20.)
    self.assertTrue(all(expected == observed))

    expected = 1.2 * equiprobable_distribution(3)
    observed = parse_prior(' equiprobablE ', Alphabet('123'), 1.2)
    self.assertTrue(all(expected == observed))
def test_parse_prior_float(self):
    """Check parsing of a composition given as a bare float string.

    On the DNA alphabet with weight 1.0, "0.5" (with or without padding and
    trailing zeros) must give the uniform distribution, and " 0.40 " must
    give (0.3, 0.2, 0.2, 0.3).
    """
    # Both spellings of one half are expected to produce a uniform prior.
    for half in ("0.5", " 0.500 "):
        self.assertTrue(
            all(
                equiprobable_distribution(4) == parse_prior(
                    half, unambiguous_dna_alphabet, 1.0)))

    # 0.40 is expected to put 0.2 on each of the two middle symbols.
    skewed_expected = array((0.3, 0.2, 0.2, 0.3), float64)
    skewed_observed = parse_prior(" 0.40 ", unambiguous_dna_alphabet, 1.0)
    self.assertTrue(all(skewed_expected == skewed_observed))
def test_parse_prior_percentage(self):
    """Check parsing of a composition given as a percentage string.

    On the DNA alphabet with weight 1.0, "50%" (with or without padding and
    a decimal part) must give the uniform distribution, and " 40.0 % " must
    give (0.3, 0.2, 0.2, 0.3).
    """
    # Both spellings of fifty percent are expected to produce a uniform prior.
    for fifty_percent in ("50%", " 50.0 % "):
        self.assertTrue(
            all(
                equiprobable_distribution(4) == parse_prior(
                    fifty_percent, unambiguous_dna_alphabet, 1.0)))

    # Forty percent is expected to put 0.2 on each of the two middle symbols.
    skewed_expected = array((0.3, 0.2, 0.2, 0.3), float64)
    skewed_observed = parse_prior(" 40.0 % ", unambiguous_dna_alphabet, 1.0)
    self.assertTrue(all(skewed_expected == skewed_observed))
def test_parse_prior_error():
    """Check that parse_prior raises ValueError for each invalid input."""
    # A negative weight.
    with pytest.raises(ValueError):
        parse_prior("0.5", unambiguous_protein_alphabet, weight=-10000.0)

    # An explicit four-key (A, C, G, T) composition used with the protein
    # alphabet.
    dna_composition = "{'A':10, 'C':40, 'G':40, 'T':10}"
    with pytest.raises(ValueError):
        parse_prior(dna_composition, unambiguous_protein_alphabet)

    # An explicit composition in which one value is not a number.
    non_numeric_composition = "{'A':'ljkasd', 'C':40, 'G':40, 'T':10}"
    with pytest.raises(ValueError):
        parse_prior(non_numeric_composition, unambiguous_dna_alphabet)

    # A string that is not any recognised composition.
    gibberish = "asjnd"
    with pytest.raises(ValueError):
        parse_prior(gibberish, unambiguous_dna_alphabet)
def test_auto(self):
    """Check the 'auto' / 'automatic' composition keywords.

    Both spellings must give a uniform DNA prior with total weight 2.0.
    """
    for keyword in ("auto", "automatic"):
        self.assertTrue(
            all(2.0 * equiprobable_distribution(4) == parse_prior(
                keyword, unambiguous_dna_alphabet)))

    # Smoke checks: these calls only need to complete without raising.
    parse_prior("automatic", unambiguous_protein_alphabet)
    parse_prior("E. coli", unambiguous_dna_alphabet)
def main(htdocs_directory=None):
    """Serve one WebLogo CGI request.

    Reads the submitted form, validates every field, and then either
    re-sends the HTML form (on first visit, reset, edit, validation-only
    requests, or when any error was collected) or writes the rendered logo
    to standard output preceded by its HTTP headers.

    Errors are accumulated as tuples (normally ``(field_name, message)``)
    and handed to ``send_form`` rather than raised, so a bad field leads to
    an annotated form instead of a failed request.

    Args:
        htdocs_directory: Passed through unchanged to ``send_form``.
    """
    logooptions = weblogo.LogoOptions()

    # A list of form fields.
    # The default for checkbox values must be False (irrespective of
    # the default in logooptions) since a checked checkbox returns 'true'
    # but an unchecked checkbox returns nothing.
    controls = [
        Field('sequences', ''),
        Field('sequences_url', ''),
        Field(
            'format',
            'png',
            weblogo.formatters.get,
            options=[
                'png_print', 'png', 'jpeg', 'eps', 'pdf', 'svg', 'logodata'
            ],
            # TODO: Should copy list from __init__.formatters
            errmsg="Unknown format option."),
        Field('stacks_per_line',
              logooptions.stacks_per_line,
              int,
              errmsg='Invalid number of stacks per line.'),
        Field('stack_width',
              'medium',
              weblogo.std_sizes.get,
              options=['small', 'medium', 'large'],
              errmsg='Invalid logo size.'),
        Field('alphabet',
              'alphabet_auto',
              alphabets.get,
              options=[
                  'alphabet_auto', 'alphabet_protein', 'alphabet_dna',
                  'alphabet_rna'
              ],
              errmsg="Unknown sequence type."),
        Field('unit_name',
              'bits',
              options=[
                  'probability', 'bits', 'nats', 'kT', 'kJ/mol', 'kcal/mol'
              ]),
        Field('first_index', 1, int_or_none),
        Field('logo_start', '', int_or_none),
        Field('logo_end', '', int_or_none),
        Field('composition',
              'comp_auto',
              composition.get,
              options=[
                  'comp_none', 'comp_auto', 'comp_equiprobable', 'comp_CG',
                  'comp_Celegans', 'comp_Dmelanogaster', 'comp_Ecoli',
                  'comp_Hsapiens', 'comp_Mmusculus', 'comp_Scerevisiae'
              ],
              errmsg="Illegal sequence composition."),
        Field('percentCG', '', float_or_none,
              errmsg="Invalid CG percentage."),
        Field('show_errorbars', False, truth),
        Field('logo_title', logooptions.logo_title),
        Field('logo_label', logooptions.logo_label),
        Field('show_xaxis', False, truth),
        Field('xaxis_label', logooptions.xaxis_label),
        Field('show_yaxis', False, truth),
        Field('yaxis_label', logooptions.yaxis_label, string_or_none),
        Field('yaxis_scale',
              logooptions.yaxis_scale,
              float_or_none,
              errmsg="The yaxis scale must be a positive number."),
        Field('yaxis_tic_interval', logooptions.yaxis_tic_interval,
              float_or_none),
        Field('show_ends', False, truth),
        Field('show_fineprint', False, truth),
        Field('color_scheme',
              'color_auto',
              color_schemes.get,
              options=color_schemes.keys(),
              errmsg='Unknown color scheme'),
        Field('color0', ''),
        Field('symbols0', ''),
        Field('desc0', ''),
        Field('color1', ''),
        Field('symbols1', ''),
        Field('desc1', ''),
        Field('color2', ''),
        Field('symbols2', ''),
        Field('desc2', ''),
        Field('color3', ''),
        Field('symbols3', ''),
        Field('desc3', ''),
        Field('color4', ''),
        Field('symbols4', ''),
        Field('desc4', ''),
        Field('ignore_lower_case', False, truth),
        Field('scale_width', False, truth),
    ]

    # Name -> Field lookup; `controls` keeps the display order.
    form = {control.name: control for control in controls}

    form_values = cgilib.FieldStorage()

    # Send default form?
    if len(form_values) == 0 or "cmd_reset" in form_values:
        # Load default truth values now.
        form['show_errorbars'].value = logooptions.show_errorbars
        form['show_xaxis'].value = logooptions.show_xaxis
        form['show_yaxis'].value = logooptions.show_yaxis
        form['show_ends'].value = logooptions.show_ends
        form['show_fineprint'].value = logooptions.show_fineprint
        form['scale_width'].value = logooptions.scale_width

        send_form(controls, htdocs_directory=htdocs_directory)
        return

    # Get form content
    for control in controls:
        control.value = form_values.getfirst(control.name, control.default)

    options_from_form = [
        'format', 'stacks_per_line', 'stack_width', 'alphabet', 'unit_name',
        'first_index', 'logo_start', 'logo_end', 'composition',
        'show_errorbars', 'logo_title', 'logo_label', 'show_xaxis',
        'xaxis_label', 'show_yaxis', 'yaxis_label', 'yaxis_scale',
        'yaxis_tic_interval', 'show_ends', 'show_fineprint', 'scale_width'
    ]

    errors = []
    for optname in options_from_form:
        try:
            value = form[optname].get_value()
            # A None value leaves the LogoOptions default untouched.
            if value is not None:
                setattr(logooptions, optname, value)
        except ValueError as err:
            errors.append(err.args)

    # Construct custom color scheme
    custom = ColorScheme()
    for i in range(5):
        color = form["color%d" % i].get_value()
        symbols = form["symbols%d" % i].get_value()
        desc = form["desc%d" % i].get_value()

        if color:
            try:
                custom.rules.append(SymbolColor(symbols, color, desc))
            except ValueError:
                errors.append(('color%d' % i, "Invalid color: %s" % color))

    if form["color_scheme"].value == 'color_custom':
        logooptions.color_scheme = custom
    else:
        try:
            logooptions.color_scheme = form["color_scheme"].get_value()
        except ValueError as err:
            errors.append(err.args)

    # FIXME: Ugly fix: Must check that sequence_file key exists
    # FIXME: Sending malformed or missing form keys should not cause a crash
    # sequences_file = form["sequences_file"]
    sequences_from_file = None
    if "sequences_file" in form_values:
        sequences_from_file = form_values.getvalue("sequences_file")

    sequences_from_textfield = form["sequences"].get_value()
    sequences_url = form["sequences_url"].get_value()

    # Exactly one sequence source (upload, text area, URL) may be used.
    sequences = None
    seq_file = None

    if sequences_from_file:
        if sequences_from_textfield or sequences_url:
            errors.append(
                ("sequences_file", "Cannot upload, sequence source conflict"))
        else:
            sequences = sequences_from_file
            seq_file = TextIOWrapper(BytesIO(sequences), encoding='utf-8')
    elif sequences_from_textfield:
        if sequences_url:
            errors.append(
                ("sequences", "Cannot upload, sequence source conflict"))
        else:
            # Check SEQUENCES_MAXLENGTH.
            # If a user tries to paste a very large file into the sequence
            # textarea, then WebLogo runs very slowly for no apparent good
            # reason. (Might be a client-side bug?) So we limit the maximum
            # sequence size. The form field also limits the size, but that
            # limit is not necessarily respected and can truncate data
            # without warning, so the textarea maximum is set larger than
            # this limit.
            SEQUENCES_MAXLENGTH = 100000
            if len(sequences_from_textfield) > SEQUENCES_MAXLENGTH:
                errors.append((
                    "sequences",
                    "Sequence data too large for text input. Use file upload instead."
                ))
                # Swap in a fresh, empty 'sequences' field for the form
                # that is sent back.
                controls[0] = Field('sequences', '')
            else:
                sequences = sequences_from_textfield
                seq_file = StringIO(sequences)
    elif sequences_url:
        from . import _from_URL_fileopen
        try:
            seq_file = _from_URL_fileopen(sequences_url)
        except ValueError:
            errors.append(("sequences_url", "Cannot parse URL"))
        except IOError:
            errors.append(("sequences_url", "Cannot load sequences from URL"))
    else:
        errors.append((
            "sequences",
            "Please enter a multiple-sequence alignment in the box above, or select a "
            "file to upload."))

    # If we have uncovered errors or we want the chance to edit the logo
    # ("cmd_edit" command from examples page) then we return the form now.
    # We do not proceed to the time consuming logo creation step unless
    # required by a 'create' or 'validate' command, and no errors have been
    # found yet.
    if errors or "cmd_edit" in form_values:
        send_form(controls, errors, htdocs_directory)
        return

    try:
        comp = form["composition"].get_value()
        percentCG = form["percentCG"].get_value()
        ignore_lower_case = ("ignore_lower_case" in form_values)
        if comp == 'percentCG':
            if percentCG is None:
                # A blank percentage presumably converts to None
                # (float_or_none); dividing it would raise an uncaught
                # TypeError. Report it as an ordinary field error instead.
                raise ValueError("percentCG", "Invalid CG percentage.")
            comp = str(percentCG / 100)

        from .matrix import Motif

        try:
            # Try reading data in transfac format first.
            # TODO Refactor this code
            motif = Motif.read_transfac(seq_file,
                                        alphabet=logooptions.alphabet)
            prior = weblogo.parse_prior(comp, motif.alphabet)
            data = weblogo.LogoData.from_counts(motif.alphabet, motif, prior)
        except ValueError:
            # Not transfac: fall back to a sequence alignment.
            seqs = weblogo.read_seq_data(seq_file,
                                         alphabet=logooptions.alphabet,
                                         ignore_lower_case=ignore_lower_case)
            prior = weblogo.parse_prior(comp, seqs.alphabet)
            data = weblogo.LogoData.from_seqs(seqs, prior)

        logoformat = weblogo.LogoFormat(data, logooptions)
        # Raw form value (e.g. 'png'); used as the key into the formatter,
        # MIME type and extension tables.
        format_name = form["format"].value
        logo = weblogo.formatters[format_name](data, logoformat)
    except (ValueError, IOError, RuntimeError) as err:
        errors.append(err.args)

    if errors or "cmd_validate" in form_values:
        send_form(controls, errors, htdocs_directory)
        return

    #
    #  RETURN LOGO OVER HTTP
    #

    print("Content-Type:", mime_type[format_name])
    # Content-Disposition: inline       Open logo in browser window
    # Content-Disposition: attachment   Download logo
    if "download" in form_values:
        print('Content-Disposition: attachment; '
              'filename="logo.%s"' % extension[format_name])
    else:
        print('Content-Disposition: inline; '
              'filename="logo.%s"' % extension[format_name])

    # Separate header from data
    print()
    sys.stdout.flush()

    # Finally, and at last, send the logo.
    sys.stdout.buffer.write(logo)
def test_explicit(self):
    """Check parsing of an explicit dict-style composition string.

    The four counts sum to 100; the test expects them to be rescaled so
    the resulting prior has a total weight of 2.0.
    """
    composition_text = "{'A':10, 'C':40, 'G':40, 'T':10}"
    expected = array((10, 40, 40, 10), float64) * 2.0 / 100.0
    observed = parse_prior(composition_text, unambiguous_dna_alphabet)
    self.assertTrue(all(expected == observed))
def test_parse_prior_none(self):
    """Check that every 'no prior' input makes parse_prior return None.

    Covers a literal None, the keyword "none", and a mixed-case keyword
    combined with a missing alphabet.
    """
    no_prior_cases = (
        (None, unambiguous_protein_alphabet),
        ("none", unambiguous_protein_alphabet),
        ("noNe", None),
    )
    for spec, alphabet in no_prior_cases:
        self.assertEqual(None, parse_prior(spec, alphabet))
def get_weblogos_ext(args):
    """Build extended weblogos per convolutional filter with nucleotide coloring.

    For every non-empty ``*_motifs_filter_<N>*.fasta`` file in
    ``args.fasta_dir`` a sequence logo is rendered in which each nucleotide
    at each position is colored by its mean contribution score. The logo is
    written to ``args.out_dir`` as both PNG and EPS.

    Attributes read from ``args``:
        gain: Scores are mapped to colors over the range [-1/gain, 1/gain].
        train_data: Path of a ``.npy`` array used to estimate GC content.
            NOTE(review): the indexing assumes an array shaped like
            (samples, positions, 4) with channels ordered A, C, G, T -
            confirm against the data producer.
        out_dir: Output directory; created if missing.
        fasta_dir: Directory with the per-filter motif FASTA files.
        scores_dir: Directory with ``rel_filter_<N>_nucleotides.csv`` files;
            every second row (odd index) holds the scores of one sequence.
        logo_dir: Optional directory with
            ``filter_<N>_seq_weighting.transfac`` count matrices. When set,
            the logo is built from those counts and filters without a
            transfac file are skipped; otherwise it is built from the FASTA
            sequences.

    Raises:
        AssertionError: If several score or transfac files match one
            filter, or if the number of score rows differs from the number
            of uniquely named sequences.
        IndexError: If no score file matches a filter.
    """
    s_max = 1 / args.gain
    s_min = -s_max

    # Background composition: channels 1 and 2 summed give the GC content.
    samples = np.load(args.train_data, mmap_mode='r')
    gc_content = np.sum(np.mean(np.mean(samples, axis=1), axis=0)[1:3])
    at_content = 1 - gc_content
    base_pseudocounts = np.array(
        [at_content, gc_content, gc_content, at_content]) / 2.0

    # create output directory (no check-then-create race)
    os.makedirs(args.out_dir, exist_ok=True)

    letter_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # nucleotide color scheme: blue - grey - red
    colormap = plt.cm.coolwarm

    # for each convolutional filter
    for file_fasta in os.listdir(args.fasta_dir):
        fasta_path = os.path.join(args.fasta_dir, file_fasta)
        # Only non-empty per-filter motif files are processed; the size is
        # checked only when the name matches.
        if not (re.search("_motifs_filter_[0-9]+.*" + ".fasta", file_fasta)
                and os.stat(fasta_path).st_size > 0):
            continue

        c_filter = re.search("filter_[0-9]+", file_fasta).group()
        filter_index = c_filter.replace("filter_", "")
        print("Processing filter: " + filter_index)

        file_transfac = []
        # Raw string: "\." in a normal string literal is an invalid escape.
        scores_pattern = "rel_filter_" + filter_index + r"_nucleotides\.csv"
        file_scores = [
            filename for filename in os.listdir(args.scores_dir)
            if re.search(scores_pattern, filename)
        ]
        assert len(
            file_scores) < 2, "Multiple score files for filter {}".format(
                filter_index)

        # load transfac files
        if args.logo_dir:
            transfac_pattern = ("filter_" + filter_index +
                                "_seq_weighting.transfac")
            file_transfac = [
                filename for filename in os.listdir(args.logo_dir)
                if re.search(transfac_pattern, filename)
            ]
            if len(file_transfac) == 0:
                continue
            assert len(file_transfac
                       ) < 2, "Multiple transfac files for filter {}".format(
                           filter_index)

        # load nucleotide contribution scores (odd-indexed rows only)
        with open(os.path.join(args.scores_dir, file_scores[0]),
                  'r') as csvfile:
            contribution_scores = [
                np.array(row, dtype=np.float32)
                for ind, row in enumerate(csv.reader(csvfile))
                if ind % 2 == 1
            ]

        # load motifs from fasta file; the handle is closed right after
        # parsing
        try:
            with open(fasta_path) as fin:
                seqs = read_seq_data(fin)
        except (IOError, ValueError):
            print("No data, skipping.")
            continue

        # load weighted count matrix from transfac file
        if args.logo_dir:
            with open(os.path.join(args.logo_dir, file_transfac[0])) as fin:
                motif = Motif.read_transfac(fin)
            prior = parse_prior(str(gc_content), motif.alphabet)
            data = LogoData.from_counts(motif.alphabet, motif, prior)
            out_stem = "weblogo_extended_" + file_transfac[0]
            source_ext = ".transfac"
        else:
            prior = parse_prior(str(gc_content), seqs.alphabet)
            data = LogoData.from_seqs(seqs, prior)
            out_stem = "weblogo_extended_" + file_fasta
            source_ext = ".fasta"
        out_png_name = os.path.join(args.out_dir,
                                    out_stem.replace(source_ext, ".png"))
        out_eps_name = os.path.join(args.out_dir,
                                    out_stem.replace(source_ext, ".eps"))

        # Keep the first sequence seen for every distinct sequence name.
        seen = set()
        seqs_unique = []
        for seq in seqs:
            if seq.name not in seen:
                seen.add(seq.name)
                seqs_unique.append(seq)

        assert len(contribution_scores) == len(
            seqs_unique
        ), "Numbers of contribution scores and sequences differ."

        # compute mean contribution score per nucleotide and logo position
        motif_len = len(seqs_unique[0])
        mean_scores = np.zeros((motif_len, len(seqs.alphabet)))
        counts = np.zeros_like(data.counts.array)
        for r_id, read in enumerate(seqs_unique):
            for pos, base in enumerate(read):
                base = str(base)
                if base in letter_dict:
                    column = letter_dict[base]
                    mean_scores[pos, column] += contribution_scores[r_id][pos]
                    counts[pos, column] += 1

        # add pseudocount to avoid division by 0
        pseudocounts = np.reshape(
            np.concatenate([base_pseudocounts] * motif_len, axis=0),
            [motif_len, 4])
        mean_scores /= (counts + pseudocounts)

        # normalize scores to [0, 255] and assign color according to the
        # selected color scheme
        norm_scores = ((mean_scores - s_min) / (s_max - s_min)) * 255
        color_rules = []
        for base, column in letter_dict.items():
            for pos in range(len(seqs[0])):
                custom_color = matplotlib.colors.rgb2hex(
                    colormap(int(norm_scores[pos, column])))
                color_rules.append(SymbolIndexColor(base, [pos], custom_color))

        # set logo options
        options = LogoOptions()
        options.logo_title = "filter " + filter_index
        options.color_scheme = ColorScheme(color_rules)
        options.stack_width = std_sizes["large"]
        options.resolution = 300

        # save filter logo
        l_format = LogoFormat(data, options)
        png = png_formatter(data, l_format)
        with open(out_png_name, 'wb') as out_file:
            out_file.write(png)
        eps = eps_formatter(data, l_format)
        with open(out_eps_name, 'wb') as out_file:
            out_file.write(eps)