Ejemplo n.º 1
0
 def test_weight(self):
     """Check the scaling applied to the automatic DNA prior.

     Without an explicit weight, the parsed prior must equal 2.0 times
     the 4-symbol equiprobable distribution. With a weight of 123.123
     it must equal that weight times the same distribution.
     """
     # Default weight.
     expected_default = 2.0 * equiprobable_distribution(4)
     parsed_default = parse_prior("automatic", unambiguous_dna_alphabet)
     self.assertTrue(all(expected_default == parsed_default))

     # Explicit weight passed positionally.
     expected_weighted = 123.123 * equiprobable_distribution(4)
     parsed_weighted = parse_prior("auto", unambiguous_dna_alphabet,
                                   123.123)
     self.assertTrue(all(expected_weighted == parsed_weighted))
Ejemplo n.º 2
0
    def test_parse_prior_equiprobable(self):
        """Check that "equiprobable" yields a weighted uniform prior.

        Covers the protein alphabet with a keyword weight, and a custom
        3-symbol alphabet where the composition string carries stray
        whitespace and mixed case.
        """
        expected_protein = 20.0 * equiprobable_distribution(20)
        parsed_protein = parse_prior(
            "equiprobable", unambiguous_protein_alphabet, weight=20.0)
        self.assertTrue(all(expected_protein == parsed_protein))

        expected_custom = 1.2 * equiprobable_distribution(3)
        parsed_custom = parse_prior(" equiprobablE  ", Alphabet("123"), 1.2)
        self.assertTrue(all(expected_custom == parsed_custom))
Ejemplo n.º 3
0
    def test_parse_prior_equiprobable(self):
        """Verify the 'equiprobable' composition for two alphabets.

        The protein case passes the weight by keyword; the 3-symbol case
        passes it positionally and uses a padded, mixed-case
        composition string.
        """
        uniform_20 = 20. * equiprobable_distribution(20)
        prior_20 = parse_prior(
            'equiprobable', unambiguous_protein_alphabet, weight=20.)
        self.assertTrue(all(uniform_20 == prior_20))

        uniform_3 = 1.2 * equiprobable_distribution(3)
        prior_3 = parse_prior(' equiprobablE  ', Alphabet('123'), 1.2)
        self.assertTrue(all(uniform_3 == prior_3))
Ejemplo n.º 4
0
    def test_parse_prior_float(self):
        """Check priors given as a bare float string over the DNA alphabet.

        "0.5" (with or without padding and trailing zeros) must give the
        equiprobable distribution, and " 0.40 " must give
        (0.3, 0.2, 0.2, 0.3), all at weight 1.0.
        """
        uniform = equiprobable_distribution(4)
        parsed = parse_prior("0.5", unambiguous_dna_alphabet, 1.0)
        self.assertTrue(all(uniform == parsed))

        # Same value written with surrounding blanks and trailing zeros.
        uniform = equiprobable_distribution(4)
        parsed = parse_prior(" 0.500 ", unambiguous_dna_alphabet, 1.0)
        self.assertTrue(all(uniform == parsed))

        skewed = array((0.3, 0.2, 0.2, 0.3), float64)
        parsed = parse_prior(" 0.40 ", unambiguous_dna_alphabet, 1.0)
        self.assertTrue(all(skewed == parsed))
Ejemplo n.º 5
0
    def test_parse_prior_percentage(self):
        """Check priors given as a percentage string over the DNA alphabet.

        "50%" (with or without padding) must give the equiprobable
        distribution, and " 40.0 % " must give (0.3, 0.2, 0.2, 0.3),
        all at weight 1.0.
        """
        uniform = equiprobable_distribution(4)
        parsed = parse_prior("50%", unambiguous_dna_alphabet, 1.0)
        self.assertTrue(all(uniform == parsed))

        # Blanks around the number and the percent sign.
        uniform = equiprobable_distribution(4)
        parsed = parse_prior(" 50.0 % ", unambiguous_dna_alphabet, 1.0)
        self.assertTrue(all(uniform == parsed))

        skewed = array((0.3, 0.2, 0.2, 0.3), float64)
        parsed = parse_prior(" 40.0 % ", unambiguous_dna_alphabet, 1.0)
        self.assertTrue(all(skewed == parsed))
Ejemplo n.º 6
0
def test_parse_prior_error():
    """Check that parse_prior raises ValueError for each invalid input.

    Cases: a negative weight, an explicit DNA composition parsed against
    the protein alphabet, an explicit composition with a non-numeric
    value, and an unrecognised composition name.
    """
    # Negative weight.
    with pytest.raises(ValueError):
        parse_prior("0.5", unambiguous_protein_alphabet, weight=-10000.0)

    # (composition string, alphabet) pairs parsed with the default weight.
    rejected = (
        ("{'A':10, 'C':40, 'G':40, 'T':10}", unambiguous_protein_alphabet),
        ("{'A':'ljkasd', 'C':40, 'G':40, 'T':10}", unambiguous_dna_alphabet),
        ("asjnd", unambiguous_dna_alphabet),
    )
    for composition_text, alphabet in rejected:
        with pytest.raises(ValueError):
            parse_prior(composition_text, alphabet)
Ejemplo n.º 7
0
    def test_auto(self):
        """Check the "auto" / "automatic" composition keywords.

        Both spellings must give 2.0 times the equiprobable DNA
        distribution. The last two calls only check that parsing
        completes without raising.
        """
        expected_auto = 2.0 * equiprobable_distribution(4)
        parsed_auto = parse_prior("auto", unambiguous_dna_alphabet)
        self.assertTrue(all(expected_auto == parsed_auto))

        expected_automatic = 2.0 * equiprobable_distribution(4)
        parsed_automatic = parse_prior("automatic", unambiguous_dna_alphabet)
        self.assertTrue(all(expected_automatic == parsed_automatic))

        # Smoke checks: the results are not inspected.
        parse_prior("automatic", unambiguous_protein_alphabet)
        parse_prior("E. coli", unambiguous_dna_alphabet)
Ejemplo n.º 8
0
def main(htdocs_directory=None):
    """Handle one WebLogo CGI request.

    Declares the form fields, reads the submitted values, validates them
    into a ``LogoOptions`` instance, selects the sequence source (file
    upload, text field or URL), builds the logo and writes it to standard
    output preceded by HTTP headers. Whenever validation errors are
    collected, or the request only asks to reset/edit/validate, the form
    is re-sent through ``send_form`` instead and the function returns.

    Args:
        htdocs_directory: Passed through unchanged to ``send_form``.

    Returns:
        None. Output is written to ``sys.stdout``.
    """
    logooptions = weblogo.LogoOptions()

    # A list of form fields.
    # The default for checkbox values must be False (irrespective of
    # the default in logooptions) since a checked checkbox returns 'true'
    # but an unchecked checkbox returns nothing.
    controls = [
        Field('sequences', ''),
        Field('sequences_url', ''),
        Field(
            'format',
            'png',
            weblogo.formatters.get,
            options=[
                'png_print', 'png', 'jpeg', 'eps', 'pdf', 'svg', 'logodata'
            ],
            # TODO: Should copy list from __init__.formatters
            errmsg="Unknown format option."),
        Field('stacks_per_line',
              logooptions.stacks_per_line,
              int,
              errmsg='Invalid number of stacks per line.'),
        Field('stack_width',
              'medium',
              weblogo.std_sizes.get,
              options=['small', 'medium', 'large'],
              errmsg='Invalid logo size.'),
        Field('alphabet',
              'alphabet_auto',
              alphabets.get,
              options=[
                  'alphabet_auto', 'alphabet_protein', 'alphabet_dna',
                  'alphabet_rna'
              ],
              errmsg="Unknown sequence type."),
        Field('unit_name',
              'bits',
              options=[
                  'probability', 'bits', 'nats', 'kT', 'kJ/mol', 'kcal/mol'
              ]),
        Field('first_index', 1, int_or_none),
        Field('logo_start', '', int_or_none),
        Field('logo_end', '', int_or_none),
        Field('composition',
              'comp_auto',
              composition.get,
              options=[
                  'comp_none', 'comp_auto', 'comp_equiprobable', 'comp_CG',
                  'comp_Celegans', 'comp_Dmelanogaster', 'comp_Ecoli',
                  'comp_Hsapiens', 'comp_Mmusculus', 'comp_Scerevisiae'
              ],
              errmsg="Illegal sequence composition."),
        Field('percentCG', '', float_or_none, errmsg="Invalid CG percentage."),
        Field('show_errorbars', False, truth),
        Field('logo_title', logooptions.logo_title),
        Field('logo_label', logooptions.logo_label),
        Field('show_xaxis', False, truth),
        Field('xaxis_label', logooptions.xaxis_label),
        Field('show_yaxis', False, truth),
        Field('yaxis_label', logooptions.yaxis_label, string_or_none),
        Field('yaxis_scale',
              logooptions.yaxis_scale,
              float_or_none,
              errmsg="The yaxis scale must be a positive number."),
        Field('yaxis_tic_interval', logooptions.yaxis_tic_interval,
              float_or_none),
        Field('show_ends', False, truth),
        Field('show_fineprint', False, truth),
        Field('color_scheme',
              'color_auto',
              color_schemes.get,
              options=color_schemes.keys(),
              errmsg='Unknown color scheme'),
        Field('color0', ''),
        Field('symbols0', ''),
        Field('desc0', ''),
        Field('color1', ''),
        Field('symbols1', ''),
        Field('desc1', ''),
        Field('color2', ''),
        Field('symbols2', ''),
        Field('desc2', ''),
        Field('color3', ''),
        Field('symbols3', ''),
        Field('desc3', ''),
        Field('color4', ''),
        Field('symbols4', ''),
        Field('desc4', ''),
        Field('ignore_lower_case', False, truth),
        Field('scale_width', False, truth),
    ]

    # Index the fields by name for direct lookup below.
    form = {c.name: c for c in controls}

    form_values = cgilib.FieldStorage()

    # Send default form?
    if len(form_values) == 0 or "cmd_reset" in form_values:
        # Load default truth values now.
        form['show_errorbars'].value = logooptions.show_errorbars
        form['show_xaxis'].value = logooptions.show_xaxis
        form['show_yaxis'].value = logooptions.show_yaxis
        form['show_ends'].value = logooptions.show_ends
        form['show_fineprint'].value = logooptions.show_fineprint
        form['scale_width'].value = logooptions.scale_width

        send_form(controls, htdocs_directory=htdocs_directory)
        return

    # Get form content
    for c in controls:
        c.value = form_values.getfirst(c.name, c.default)

    # Fields whose converted value is copied onto the LogoOptions
    # attribute of the same name.
    options_from_form = [
        'format', 'stacks_per_line', 'stack_width', 'alphabet', 'unit_name',
        'first_index', 'logo_start', 'logo_end', 'composition',
        'show_errorbars', 'logo_title', 'logo_label', 'show_xaxis',
        'xaxis_label', 'show_yaxis', 'yaxis_label', 'yaxis_scale',
        'yaxis_tic_interval', 'show_ends', 'show_fineprint', 'scale_width'
    ]

    # Collected problems; each entry is the args tuple of a ValueError or
    # an explicit (field name, message) pair.
    errors = []
    for optname in options_from_form:
        try:
            value = form[optname].get_value()
            if value is not None:
                setattr(logooptions, optname, value)
        except ValueError as err:
            errors.append(err.args)

    # Construct custom color scheme
    custom = ColorScheme()
    for i in range(0, 5):
        color = form["color%d" % i].get_value()
        symbols = form["symbols%d" % i].get_value()
        desc = form["desc%d" % i].get_value()

        if color:
            try:
                custom.rules.append(SymbolColor(symbols, color, desc))
            except ValueError:
                errors.append(('color%d' % i, "Invalid color: %s" % color))

    if form["color_scheme"].value == 'color_custom':
        logooptions.color_scheme = custom
    else:
        try:
            logooptions.color_scheme = form["color_scheme"].get_value()
        except ValueError as err:
            errors.append(err.args)

    # FIXME: Ugly fix: Must check that sequence_file key exists
    # FIXME: Sending malformed or missing form keys should not cause a crash
    # sequences_file = form["sequences_file"]
    sequences_from_file = None
    if "sequences_file" in form_values:
        sequences_from_file = form_values.getvalue("sequences_file")

    sequences_from_textfield = form["sequences"].get_value()
    sequences_url = form["sequences_url"].get_value()

    sequences = None
    seq_file = None

    # Exactly one sequence source may be supplied: upload, text or URL.
    if sequences_from_file:
        if sequences_from_textfield or sequences_url:
            errors.append(
                ("sequences_file", "Cannot upload, sequence source conflict"))
        else:
            sequences = sequences_from_file
            seq_file = TextIOWrapper(BytesIO(sequences), encoding='utf-8')
    elif sequences_from_textfield:
        if sequences_url:
            errors.append(
                ("sequences", "Cannot upload, sequence source conflict"))
        else:
            # Check SEQUENCES_MAXLENGTH.
            # If a user tries to paste a very large file into sequence textarea,
            # then WebLogo runs very slow for no apparently good reason. (Might be client side bug?)
            # So we limit the maximum sequence size.
            # Form field also limits size, but not necessarily respected. Also can truncate data
            # without warning, so we'll set textarea maximum to be larger than MAX_SEQUENCE_SIZE
            SEQUENCES_MAXLENGTH = 100000
            if len(sequences_from_textfield) > SEQUENCES_MAXLENGTH:
                errors.append((
                    "sequences",
                    "Sequence data too large for text input. Use file upload instead."
                ))
                # Replace the field so the oversized text is not sent back
                # with the form.
                controls[0] = Field('sequences', '')
            else:
                sequences = sequences_from_textfield
                seq_file = StringIO(sequences)

    elif sequences_url:
        from . import _from_URL_fileopen
        try:
            seq_file = _from_URL_fileopen(sequences_url)
        except ValueError:
            errors.append(("sequences_url", "Cannot parse URL"))
        except IOError:
            errors.append(("sequences_url", "Cannot load sequences from URL"))

    else:
        errors.append((
            "sequences",
            "Please enter a multiple-sequence alignment in the box above, or select a "
            "file to upload."))

    # If we have uncovered errors or we want the chance to edit the logo
    # ("cmd_edit" command from examples page) then we return the form now.
    # We do not proceed to the time consuming logo creation step unless
    # required by a 'create' or 'validate' command, and no errors have been
    # found yet.
    if errors or "cmd_edit" in form_values:
        send_form(controls, errors, htdocs_directory)
        return

    try:
        comp = form["composition"].get_value()
        percentCG = form["percentCG"].get_value()
        ignore_lower_case = ("ignore_lower_case" in form_values)
        if comp == 'percentCG':
            # The percentCG field defaults to '' and may therefore convert
            # to None; dividing None would raise an uncaught TypeError.
            # Report it as a form error on that field instead.
            if percentCG is None:
                raise ValueError("percentCG", "Invalid CG percentage.")
            comp = str(percentCG / 100)

        from .matrix import Motif

        try:
            # Try reading data in transfac format first.
            # TODO Refactor this code
            motif = Motif.read_transfac(seq_file,
                                        alphabet=logooptions.alphabet)
            prior = weblogo.parse_prior(comp, motif.alphabet)
            data = weblogo.LogoData.from_counts(motif.alphabet, motif, prior)
        except ValueError:
            # Not transfac: fall back to reading sequence data.
            seqs = weblogo.read_seq_data(seq_file,
                                         alphabet=logooptions.alphabet,
                                         ignore_lower_case=ignore_lower_case)
            prior = weblogo.parse_prior(comp, seqs.alphabet)
            data = weblogo.LogoData.from_seqs(seqs, prior)

        logoformat = weblogo.LogoFormat(data, logooptions)
        format = form["format"].value
        logo = weblogo.formatters[format](data, logoformat)
    except (ValueError, IOError, RuntimeError) as err:
        errors.append(err.args)

    if errors or "cmd_validate" in form_values:
        send_form(controls, errors, htdocs_directory)
        return

    #
    #  RETURN LOGO OVER HTTP
    #

    print("Content-Type:", mime_type[format])
    # Content-Disposition: inline       Open logo in browser window
    # Content-Disposition: attachment   Download logo
    if "download" in form_values:
        print('Content-Disposition: attachment; '
              'filename="logo.%s"' % extension[format])
    else:
        print('Content-Disposition: inline; '
              'filename="logo.%s"' % extension[format])
    # Separate header from data
    print()
    # Flush the text-layer headers before writing raw bytes to the
    # underlying buffer, so the headers precede the payload.
    sys.stdout.flush()

    # Finally, and at last, send the logo.
    sys.stdout.buffer.write(logo)
Ejemplo n.º 9
0
 def test_explicit(self):
     """Check an explicit per-symbol composition over the DNA alphabet.

     Counts of 10/40/40/10 must parse to those values scaled by
     2.0 / 100.0.
     """
     composition_text = "{'A':10, 'C':40, 'G':40, 'T':10}"
     expected = array((10, 40, 40, 10), float64) * 2.0 / 100.0
     parsed = parse_prior(composition_text, unambiguous_dna_alphabet)
     self.assertTrue(all(expected == parsed))
Ejemplo n.º 10
0
 def test_parse_prior_none(self):
     """Check that an absent composition parses to no prior at all.

     Covers a ``None`` composition, the string "none", and a mixed-case
     "noNe" combined with a ``None`` alphabet.
     """
     # assertIsNone tests identity. assertEqual(None, x) compares with
     # ==, which raises instead of failing cleanly if a multi-element
     # array is ever returned.
     self.assertIsNone(parse_prior(None, unambiguous_protein_alphabet))
     self.assertIsNone(parse_prior("none", unambiguous_protein_alphabet))
     self.assertIsNone(parse_prior("noNe", None))
Ejemplo n.º 11
0
def get_weblogos_ext(args):
    """Build extended weblogos per convolutional filter with nucleotide coloring.

    For every non-empty ``*_motifs_filter_<N>*.fasta`` file in
    ``args.fasta_dir`` this loads the matching per-nucleotide contribution
    scores from ``args.scores_dir``, averages them per position and base,
    maps the averages onto the coolwarm colormap and writes a PNG and an
    EPS logo into ``args.out_dir``.

    Args:
        args: Object providing the attributes ``gain``, ``train_data``,
            ``out_dir``, ``fasta_dir``, ``scores_dir`` and ``logo_dir``.
            When ``logo_dir`` is truthy the logo counts come from the
            filter's TRANSFAC file in that directory; otherwise they come
            from the FASTA sequences.

    Raises:
        AssertionError: If more than one score or TRANSFAC file matches a
            filter, or the number of score rows differs from the number
            of uniquely named sequences.
    """
    # Scores are normalised against the symmetric range [-1/gain, 1/gain].
    s_max = 1 / args.gain
    s_min = -s_max

    samples = np.load(args.train_data, mmap_mode='r')
    # NOTE(review): presumably the last axis of `samples` is one-hot in
    # A, C, G, T order, so entries 1:3 are C and G - confirm with the data.
    gc_content = np.sum(np.mean(np.mean(samples, axis=1), axis=0)[1:3])
    at_content = 1 - gc_content
    base_pseudocounts = np.array(
        [at_content, gc_content, gc_content, at_content]) / 2.0

    # create output directory (no exists-then-create race)
    os.makedirs(args.out_dir, exist_ok=True)

    letter_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # nucleotide color scheme: blue - grey - red
    colormap = plt.cm.coolwarm

    # for each convolutional filter
    for file_fasta in os.listdir(args.fasta_dir):
        fasta_path = args.fasta_dir + "/" + file_fasta
        if not re.search("_motifs_filter_[0-9]+.*" + ".fasta", file_fasta):
            continue
        if os.stat(fasta_path).st_size <= 0:
            continue

        c_filter = re.search("filter_[0-9]+", file_fasta).group()
        filter_index = c_filter.replace("filter_", "")
        print("Processing filter: " + filter_index)

        file_transfac = []
        # Raw string: "\." is an invalid escape in a normal string literal.
        file_scores = [
            filename for filename in os.listdir(args.scores_dir) if bool(
                re.search(
                    r"rel_filter_" + str(filter_index) +
                    r"_nucleotides\.csv", filename))
        ]
        assert len(
            file_scores) < 2, "Multiple score files for filter {}".format(
                filter_index)

        # load transfac files
        if args.logo_dir:
            file_transfac = [
                filename for filename in os.listdir(args.logo_dir) if bool(
                    re.search(
                        "filter_" + str(filter_index) +
                        "_seq_weighting.transfac", filename))
            ]
            if len(file_transfac) == 0:
                continue
            assert len(file_transfac
                       ) < 2, "Multiple transfac files for filter {}".format(
                           filter_index)

        # load nucleotide contribution scores (only odd-indexed rows
        # are parsed as scores)
        contribution_scores = []
        with open(args.scores_dir + "/" + file_scores[0], 'r') as csvfile:
            reader = csv.reader(csvfile)
            for ind, row in enumerate(reader):
                if ind % 2 == 1:
                    scores = np.array(row, dtype=np.float32)
                    contribution_scores.append(scores)

        # load motifs from fasta file; the handle is closed once parsed
        try:
            with open(fasta_path) as fin:
                seqs = read_seq_data(fin)
        except (IOError, ValueError):
            print("No data, skipping.")
            continue

        # load weighted count matrix from transfac file
        if args.logo_dir:
            with open(args.logo_dir + "/" + file_transfac[0]) as fin:
                motif = Motif.read_transfac(fin)
            prior = parse_prior(str(gc_content), motif.alphabet)
            data = LogoData.from_counts(motif.alphabet, motif, prior)
            out_png_name = args.out_dir + "/weblogo_extended_" + file_transfac[
                0].replace(".transfac", ".png")
            out_eps_name = args.out_dir + "/weblogo_extended_" + file_transfac[
                0].replace(".transfac", ".eps")
        else:
            prior = parse_prior(str(gc_content), seqs.alphabet)
            data = LogoData.from_seqs(seqs, prior)
            out_png_name = args.out_dir + "/weblogo_extended_" + file_fasta.replace(
                ".fasta", ".png")
            out_eps_name = args.out_dir + "/weblogo_extended_" + file_fasta.replace(
                ".fasta", ".eps")

        # keep only the first sequence for every distinct sequence name
        seq_names = [seq.name for seq in seqs]
        seen = set()
        seqs_unique = [
            seqs[idx] for idx, seq_name in enumerate(seq_names)
            if seq_name not in seen and not seen.add(seq_name)
        ]

        assert len(contribution_scores) == len(
            seqs_unique
        ), "Numbers of contribution scores and sequences differ."

        # compute mean contribution score per nucleotide and logo position
        mean_scores = np.zeros((len(seqs_unique[0]), len(seqs.alphabet)))
        counts = np.zeros_like(data.counts.array)
        for r_id, read in enumerate(seqs_unique):
            for pos, base in enumerate(read):
                base = str(base)
                if base in letter_dict:
                    mean_scores[
                        pos,
                        letter_dict[base]] += contribution_scores[r_id][pos]
                    counts[pos, letter_dict[base]] += 1

        # add pseudocount to avoid division by 0
        motif_len = len(seqs_unique[0])
        pseudocounts = np.reshape(
            np.concatenate([base_pseudocounts] * motif_len, axis=0),
            [motif_len, 4])
        mean_scores /= (counts + pseudocounts)

        # normalize scores to [0, 255] and assign color according the selected color scheme
        norm_scores = ((mean_scores - s_min) / (s_max - s_min)) * 255
        color_rules = []
        for base in letter_dict:
            for pos in range(len(seqs[0])):
                custom_color = matplotlib.colors.rgb2hex(
                    colormap(int(norm_scores[pos, letter_dict[base]])))
                color_rules.append(SymbolIndexColor(base, [pos], custom_color))

        # set logo options
        options = LogoOptions()
        options.logo_title = "filter " + str(filter_index)
        options.color_scheme = ColorScheme(color_rules)
        options.stack_width = std_sizes["large"]
        options.resolution = 300

        # save filter logo
        l_format = LogoFormat(data, options)
        png = png_formatter(data, l_format)
        with open(out_png_name, 'wb') as out_file:
            out_file.write(png)
        eps = eps_formatter(data, l_format)
        with open(out_eps_name, 'wb') as out_file:
            out_file.write(eps)