Beispiel #1
0
def exclude_non_alpha(path, include=False):
    """Flag every dimension whose term contains non-alphabetic characters.

    Matching dimensions are excluded by default; pass include=True to
    mark them as included instead.
    """
    conn = sqlite3.connect(path)
    cursor = conn.cursor()
    # Materialize the matches first, then issue the updates, so the
    # reads and writes on the shared cursor do not interleave.
    targets = [row[0] for row in util.get_dimensions(cursor)
               if not is_alpha_str(row[1])]
    for dim_id in targets:
        process_dimension(cursor, dim_id, not include)
    cursor.close()
    conn.commit()
    conn.close()
Beispiel #2
0
def exclude_ngrams_shorter_than(path, N, include=False):
    """Flag every dimension whose n-gram term is not longer than N.

    Matching dimensions are excluded by default; pass include=True to
    mark them as included instead.
    """
    conn = sqlite3.connect(path)
    cursor = conn.cursor()
    # Collect the target ids before updating, so reads and writes on the
    # shared cursor do not interleave.
    targets = []
    for row in util.get_dimensions(cursor):
        if not is_ngram_longer_than(row[1], N):
            targets.append(row[0])
    for dim_id in targets:
        process_dimension(cursor, dim_id, not include)
    cursor.close()
    conn.commit()
    conn.close()
Beispiel #3
0
def exclude_shorter_than(path, N, include=False):
    """Exclude all dimensions with term shorter than N characters."""
    conn = sqlite3.connect(path)
    cursor = conn.cursor()
    # 1 marks the dimension as excluded, 0 as included.
    flag = 0 if include else 1
    targets = [row[0] for row in util.get_dimensions(cursor)
               if len(row[1]) < N]
    for dim_id in targets:
        cursor.execute(
            """UPDATE Dimensions SET Exclude = ?
                WHERE DimensionId = ?""", (flag, dim_id))
    cursor.close()
    conn.commit()
    conn.close()
Beispiel #4
0
def exclude_regex(path):
    """Exclude all dimensions that match an exclusion regex."""
    conn = sqlite3.connect(path)
    cursor = conn.cursor()
    regexes = util.get_all_exclude_regex(cursor)
    rows = util.get_dimensions(cursor)
    update_sql = 'UPDATE Dimensions SET Exclude = ? WHERE DimensionId = ?'
    for dim_id, term, pos, current_flag in rows:
        desired_flag = int(match_exclude_regex(pos, term, regexes))
        # Only touch rows whose Exclude flag actually changes.
        if desired_flag != current_flag:
            cursor.execute(update_sql, (desired_flag, dim_id))
    cursor.close()
    conn.commit()
    conn.close()
Beispiel #5
0
def index(filename, options):
    """
    Perform indexing.  Each document is stemmed, and then the non-excluded
    dimensions are counted for that document.  The result is put into the
    DocumentsToDimensions table.
    """
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    params = util.get_params(c, filename)
    stemmer = params['stemmer']
    print 'index(): stemmer: %s' % stemmer

    stemmers = { 'porter' : nltk.PorterStemmer(),
            'lancaster' : nltk.LancasterStemmer() }
    try:
        stemmer = stemmers[stemmer]
    except KeyError:
        print 'unsupported stemmer:', stemmer
        return 1

    all_dim = util.get_dimensions(c, 0)
    assert all_dim, "You must calculate dimensions prior to indexing."

    all_include = util.get_all_include_regex(c)

    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])

    c.execute('DELETE FROM DocumentsToDimensions')

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'bigram'")
    nBigrams = int(c.fetchone()[0])
    print 'Number of bigrams: ', nBigrams
    do_bigrams = nBigrams > 0

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'trigram'")
    nTrigrams = int(c.fetchone()[0])
    print 'Number of trigrams: ', nTrigrams
    do_trigrams = nTrigrams > 0

    #
    # If the POS column contains "unigram", then it means we didn't perform POS tagging when calculating dimensions.
    #
    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'unigram'")
    pos_tag = int(c.fetchone()[0]) == 0

    cmd = 'SELECT ED_ENC_NUM FROM Documents'
    if options.limit:
        cmd += ' LIMIT %d' % options.limit
        num_total_docs = min(options.limit, num_total_docs)
    #
    # TODO: why is fetchmany not working?
    #
    #document_ids = c.execute(cmd).fetchmany()
    document_ids = []
    for row in c.execute(cmd):
        document_ids.append(row[0])
    print "fetched %d document ids" % len(document_ids)
    num_batches = int(math.ceil(len(document_ids)/options.batch_size))

    #
    # Set up multiprocessing.
    #
    # MAIN_PROCESS -> document_id_queue -> WORKER_PROCESSES
    #
    # Each worker subprocess reads a document from the SQL database, processes it, and writes back to the database..
    #
    document_id_queue = multiprocessing.Queue()
    proc_queue = multiprocessing.Queue()
    for i in xrange(num_batches):
        start = i*options.batch_size
        end = start+options.batch_size
        document_id_queue.put(Batch(start, document_ids[start:end]))
    for i in range(options.subprocesses):
        document_id_queue.put(None)

    #
    # Terminate the SQL connection so that the subprocesses can use it.
    #
    conn.commit()
    conn.close()

    #
    # https://docs.python.org/2/library/array.html#module-array
    #
    counter = multiprocessing.Value("I")

    pr_list = []
    for i in range(options.subprocesses):
        args = (document_id_queue, filename, stemmer, all_include, pos_tag, do_bigrams, do_trigrams, all_dim, counter)
        p = multiprocessing.Process(target=worker_subprocess, args=args)
        p.start()
        pr_list.append(p)

    #
    # Wait for all worker subprocesses to complete.
    #
    for i, p in enumerate(pr_list):
        p.join()

    #
    # Calculate IDF weighting.
    #
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    for dim_id, _, _ in all_dim:
        c.execute("""SELECT COUNT(DimensionId)
                FROM DocumentsToDimensions
                WHERE DimensionId = ?""", (dim_id,))
        freq = int(c.fetchone()[0])
        idf = log10(num_total_docs/(1+freq))
        c.execute(
                'UPDATE Dimensions SET IDF = ? WHERE DimensionId = ?',
                (idf, dim_id))

    #
    # Save and exit.
    #
    conn.commit()
    c.close()
Beispiel #6
0
def mrmr(c, path):
    """
    Perform automatic mRMR feature selection using the specified cursor.
    Changes are persisted to the database using the cursor.

    :param c: an open sqlite3 cursor on the project database.
    :param path: path to the database, used to look up tuning parameters.
    """
    params = util.get_params(c, path)
    #
    # mRMR feature selection
    #
    include_dim = set()
    exclude_dim = set()
    all_dim = util.get_dimensions(c, 0)

    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents WHERE Score > 0')
    num_positive_docs = int(c.fetchone()[0])
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents WHERE Score < 0')
    num_negative_docs = int(c.fetchone()[0])

    #
    # The part below is ported from filterFeatures() of reference.py
    #
    cu = params['C_UPPERCUTOFF'] * num_total_docs
    ccp = params['C_CLASSCUTOFF'] * num_positive_docs
    ccm = params['C_CLASSCUTOFF'] * num_negative_docs
    lcp = params['C_LOWERCUTOFF'] * num_positive_docs
    lcm = params['C_LOWERCUTOFF'] * num_negative_docs

    #
    # The original script didn't have any comments, so here's my guess of what
    # individual variables represent.
    #
    # cu        Upper cut-off.  If a feature occurs in more than cu documents,
    #           then it should be excluded.
    # ccp       Upper class cut-off for positive documents.
    # lcp       Lower class cut-off for positive documents.
    #           If the frequency of a feature within positive documents
    #           falls within this interval, then it should be excluded.
    # ccm       Upper class cut-off for negative documents.
    # lcm       Lower class cut-off for negative documents.
    #           If the frequency of a feature within negative documents
    #           falls within this interval, then it should be excluded.
    #

    for (dim_id, _, _) in all_dim:
        text_count, plus_count, minus_count = 0, 0, 0
        c.execute(
            """SELECT Score
                FROM Documents INNER JOIN DocumentsToDimensions
                ON Documents.ED_ENC_NUM = DocumentsToDimensions.ED_ENC_NUM
                WHERE DimensionId = ?""", (dim_id, ))
        #
        # BUG FIX: iterating the cursor yields one-element row *tuples*,
        # so the original "for score in c" compared a tuple against 0 —
        # always True in Python 2 and a TypeError in Python 3.  Unpack
        # the row to get the scalar score.
        #
        for (score,) in c:
            text_count += 1
            if score > 0:
                plus_count += 1
            elif score < 0:
                minus_count += 1

        if params['USE_UPPERCUTS'] and text_count > cu:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        elif params['USE_CLASSCUTS'] and minus_count > ccm and plus_count > ccp:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        elif params['USE_LOWERCUTS'] and minus_count < lcm and plus_count < lcp:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        else:
            if dim_id in exclude_dim:
                exclude_dim.remove(dim_id)
            include_dim.add(dim_id)
    #
    # end of ported code.
    #
    print('mRMR enabled:', len(include_dim), 'disabled:', len(exclude_dim))

    # A dimension must never be both included and excluded.
    assert not include_dim.intersection(exclude_dim)
    for dim in include_dim:
        c.execute('UPDATE Dimensions SET Exclude = 0 WHERE DimensionId = ?',
                  (dim, ))
    for dim in exclude_dim:
        c.execute('UPDATE Dimensions SET Exclude = 1 WHERE DimensionId = ?',
                  (dim, ))
Beispiel #7
0
    def __setitem__(self, sym_name, input_expr):
        """Assigns a function or expression to a new symbol,
        performs unit conversion where appropriate.

        ``sym_name`` is parsed into a symbol, its call arguments, the
        left-hand-side units and the left-hand-side expression.
        ``input_expr`` may be a callable (registered directly) or an
        expression/string, which is parsed, unified against the unit
        registry, vectorized and stored under both the symbol and its
        function type.
        """
        # Keys may arrive as Symbols; normalize to str for parsing.
        if not isinstance(sym_name, str):
            sym_name = str(sym_name)

        symbol, args, lhs_units, lhs_expr = self.parse_key(sym_name)

        if hasattr(input_expr, '__call__'):
            # Plain callables skip expression parsing entirely.
            self.register_function(input_expr, symbol, lhs_expr, lhs_units)

        else:
            if self.verbose:
                print(
                    "\n\nPARSING WITH UNIFY",
                    lhs_expr,
                    symbol,
                    lhs_units,
                    len(lhs_units),
                    type(lhs_units))
                print('symbol registry:', self.symbol_registry)

            # Parse the right-hand side against known symbols.
            rhs_expr = self.parse_value(input_expr, self.symbol_registry)

            if self.verbose:
                print('parsed rhs_expr', rhs_expr)

            # If parse_key did not give a plain Symbol, build a Function
            # over the free symbols of the rhs and rewrite sym_name so the
            # textual key matches the new functional form.
            if not isinstance(symbol, Symbol):
                if isinstance(lhs_expr, Symbol):
                    symbol = Function(lhs_expr)(*tuple(rhs_expr.free_symbols))
                else: #lhs is already a function
                    symbol = lhs_expr
                lhs_str = str(symbol)
                sym_name = sym_name.replace(str(lhs_expr), lhs_str)
            if self.verbose:
                print('unit registry contents:')
                for k, v in self.unit_registry.items():
                    print('\t', k, type(k), v)
            # A '[' in the key means the caller spelled out units,
            # e.g. "f[kg]" — record them in the unit registry.
            if '[' in sym_name:
                if self.verbose:
                    print('updating unit registry with {} -> {}'.format(sym_name, rhs_expr))
                rhs = rhs_expr
                arg_units = get_arg_units(rhs_expr, self.unit_registry)
                if self.verbose:
                    print(arg_units)
                sym_name = self.update_unit_registry(sym_name, arg_units)
                if self.verbose:
                    print('unit registry update returned', sym_name, self.unit_registry.get(symbol))
            else:
                # No explicit units on the lhs: infer them from the rhs.

                if self.verbose:
                    print(sym_name,
                          symbol,
                          'had no units. Getting units from {}'.format(rhs_expr))

                expr_unit = get_expr_unit(rhs_expr, self.unit_registry, self.verbose)
                arg_units = get_arg_units(rhs_expr, self.unit_registry)


                if self.verbose:
                    print('registering {} with {} {}'.format(symbol, expr_unit, arg_units))

                # Register both the arg-substituted symbol and its unit so
                # later lookups can resolve either form.
                if (symbol not in self.unit_registry) and (expr_unit is not None):
                    self.unit_registry[symbol] = symbol.subs(arg_units)
                    self.unit_registry[symbol.subs(arg_units)] = expr_unit


                # Dimensionless results get an empty units string.
                if expr_unit is not None:
                    expr_dimensions = Dimension(get_dimensions(expr_unit))
                    if expr_dimensions != Dimension(1):
                        lhs_units = str(get_abbrev(get_expr_unit(
                            expr_unit,
                            self.unit_registry,
                            self.verbose)))
                    else:
                        lhs_units = ''

                if self.verbose:
                    print('registered lhs_units', lhs_units)

                rhs = rhs_expr
                sym_name = str(sym_name)



            # If the lhs carries units, unify the rhs to convert it into
            # the lhs unit system.
            if len(lhs_units) > 0:
                if self.verbose:
                    print('about to unify lhs_units {} {} with {}'.format(
                        lhs_units, type(lhs_units), rhs))

                expr = unify(
                    Eq(parse_expr(sym_name), rhs),
                    self.unit_registry,
                    # to_symbol = symbol,
                    verbose=self.verbose)
                rhs_expr = expr.rhs

            if self.verbose:
                print('symbol after unify', symbol, type(symbol), rhs_expr)
                print('unit registry to resolve units:')
                for k,v in self.unit_registry.items():
                    print('\t{}:{}'.format(k,v))

            # Resolve the final display units; dimensionless or
            # unabbreviatable units collapse to ''.
            units = get_expr_unit(symbol, self.unit_registry)
            if Dimension(get_dimensions(units)) != Dimension(1):
                units = get_abbrev(units)
                if units is not None:
                    units = str(units)
                else:
                    units = ''
            else:
                units = ''

            if self.verbose:
                print('units after resolve', symbol, units)
                for k, v in self.unit_registry.items():
                    print('\t{}: {}'.format(k, v))

            rhs_args = rhs_expr.free_symbols

            symbol = self.check_or_replace_symbol(symbol, rhs_args, rhs_expr)
            self.validate_function(symbol, rhs_expr)

            # Close over every existing registration so the new function
            # can be composed from previously defined ones.
            composition = {str(k_): self[k_] for k_ in self}
            arg_units = {}
            # Map each call argument to the abbreviation of its unit,
            # but only when the registry entry matches the arity.
            if symbol in self.unit_registry:
                unit_args = self.unit_registry[symbol]
                if unit_args is not None:
                    if len(unit_args.args) == len(symbol.args):
                        for arg, unit in zip(symbol.args, unit_args.args):
                            arg_units[str(arg)] = str(get_abbrev(unit))
            func = self.vectorize_function(symbol, rhs_expr, composition)
            meta = dict(units=units, arg_units=arg_units)
            func.meta = meta
            func.data = None
            self.register_signature(symbol, units, lhs_expr, rhs_expr)
            func._repr_latex_ = lambda: self.func_latex(str(type(symbol)), mode='inline')
            # Store under both the applied symbol and its Function type so
            # either lookup form works.
            super(Kamodo, self).__setitem__(symbol, func)
            super(Kamodo, self).__setitem__(type(symbol), self[symbol])
            self.register_symbol(symbol)
Beispiel #8
0
def index(filename, nlp):
    """
    Perform indexing.  Each document is stemmed, and then the non-excluded
    dimensions are counted for that document.  The result is put into the
    DocumentsToDimensions table.

    :param filename: path to the SQLite database to index.
    :param nlp: NLP pipeline handed through to main_process.
    """
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    params = util.get_params(c, filename)
    stemmer = params['stemmer']
    print('index(): stemmer: %s' % stemmer)

    all_dim = util.get_dimensions(c, 0)
    assert all_dim, "You must calculate dimensions prior to indexing."

    all_include = util.get_all_include_regex(c)

    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])

    # Any previous indexing results are discarded and rebuilt from scratch.
    c.execute('DELETE FROM DocumentsToDimensions')

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'bigram'")
    nBigrams = int(c.fetchone()[0])
    print('Number of bigrams: ', nBigrams)
    do_bigrams = nBigrams > 0

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'trigram'")
    nTrigrams = int(c.fetchone()[0])
    print('Number of trigrams: ', nTrigrams)
    do_trigrams = nTrigrams > 0

    #
    # If the POS column contains "unigram", then it means we didn't perform
    # POS tagging when calculating dimensions.
    #
    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'unigram'")
    pos_tag = int(c.fetchone()[0]) == 0

    cmd = 'SELECT ED_ENC_NUM FROM Documents'

    #
    # TODO: why is fetchmany not working?
    #
    document_ids = []
    for row in c.execute(cmd):
        document_ids.append(row[0])
    print("fetched %d document ids" % len(document_ids))

    #
    # Terminate the SQL connection so that main_process can use it.
    #
    conn.commit()
    conn.close()

    main_process(nlp, document_ids, filename, stemmer, all_include, pos_tag,
                 do_bigrams, do_trigrams, all_dim)

    #
    # Calculate IDF weighting.
    #
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    for dim_id, _, _ in all_dim:
        c.execute(
            """SELECT COUNT(DimensionId)
                FROM DocumentsToDimensions
                WHERE DimensionId = ?""", (dim_id, ))
        freq = int(c.fetchone()[0])
        # +1 in the denominator avoids division by zero for unseen terms.
        idf = log10(num_total_docs / (1 + freq))
        c.execute('UPDATE Dimensions SET IDF = ? WHERE DimensionId = ?',
                  (idf, dim_id))

    #
    # Save and exit.  BUG FIX: also close the connection, not just the
    # cursor, so the database handle is released.
    #
    conn.commit()
    c.close()
    conn.close()
Beispiel #9
0
def create_alignment(layout, reference_name, upload_object, upload_timestamp,
                     sample_object, sample_timestamp, palette_name,
                     reference_layout, consensus_sequence):
    '''Create an MSA alignment figure (heatmap + conservation bar chart).

    Chooses whichever of the uploaded/sample sequence objects is newer,
    parses it, orders the sequences around the reference, and builds a
    plotly figure dict with a slider over 31-column windows.  Returns ''
    when the chosen sequence object is missing.
    '''
    # Prefer whichever sequence source carries the newer timestamp.
    if int(upload_timestamp) > int(sample_timestamp):
        seq_object = upload_object
        if seq_object is None:
            return ''
        seq_lines = parse_seq_object(seq_object)
    else:
        if sample_object is None:
            return ''
        seq_lines = sample_object

    # The checkbox widgets deliver [True] / [] — normalise to real booleans.
    consensus_sequence = bool(consensus_sequence)
    names, seqs, conservation = parse_sequences(seq_lines, consensus_sequence)

    x, y, n_seqs, sequence_length = get_dimensions(seqs)

    try:
        ordered_names, ordered_seqs = get_msa_order(reference_name, names,
                                                    seqs)
    except KeyError:
        # Reference name not present; keep the parsed order.
        ordered_names, ordered_seqs = names, seqs

    palette = COLOR_DIC[palette_name]

    # I am sort of misusing the checkbox for the alignment layout. Really this
    # should be returning True/False rather than [True] and []
    #
    # BUG FIX: the original else-branch assigned to a misspelled
    # "refrence_layout", so falsy non-boolean values (e.g. []) were passed
    # through to alignment_layout unchanged instead of False.
    reference_layout = bool(reference_layout)

    text_values, text_colors, block_values, block_colors = \
    alignment_layout(ordered_seqs, layout, palette, reference_layout, BASE_DIC)

    trace = go.Heatmap(
        z=block_values,
        colorscale=block_colors,
        showscale=False,
    )

    # One slider step per 31-column window across the alignment.
    steps = [{
        'args': ['xaxis', {
            'range': [-0.5 + e, 30.5 + e]
        }],
        'method': 'relayout',
        'label': ''
    } for e in range(sequence_length - 30)]

    webgl_text = {
        'type': 'scattergl',
        'mode': 'text',
        'x': x,
        'y': y,
        'text': text_values,
        'yaxis': 'y2',
        'textfont': {
            'size': 18,
            'color': text_colors
        }
    }

    bar_trace = {
        'type': 'bar',
        'x': list(range(sequence_length)),
        'y': conservation,
        'marker': {
            'color': '#1e90ff'
        }
    }

    fig = tools.make_subplots(rows=2,
                              cols=1,
                              shared_xaxes=True,
                              vertical_spacing=0.001)
    fig.append_trace(trace, 2, 1)
    fig.append_trace(bar_trace, 1, 1)

    fig = fig.to_plotly_json()

    # The WebGL text layer is appended raw because append_trace does not
    # accept scattergl dicts with a secondary y-axis.
    fig['data'].append(webgl_text)

    sliders = [dict(minorticklen=0, tickwidth=0, active=0, steps=steps)]
    fig['layout'] = dict(sliders=sliders,
                         yaxis2=dict(autorange='reversed',
                                     ticks='',
                                     ticksuffix='  ',
                                     ticktext=ordered_names,
                                     tickvals=list(
                                         np.arange(0, len(block_values))),
                                     showticklabels=True),
                         yaxis=dict(ticks='',
                                    ticksuffix='  ',
                                    showticklabels=False,
                                    domain=[0.7, 1]),
                         margin=go.layout.Margin(l=200, r=50, b=0, t=50,
                                                 pad=0),
                         height=((n_seqs * 50) + 100),
                         xaxis={'range': [-0.5, 30.5]},
                         showlegend=False)

    # Re-split the vertical domains so the conservation bar keeps a fixed
    # pixel height regardless of how many sequences are shown.
    height = (fig['layout']['height'] - fig['layout']['margin']['t'] -
              fig['layout']['margin']['b'])
    y1_height = 65  #px
    fig['layout']['yaxis']['domain'] = [1 - y1_height / height, 1]
    fig['layout']['yaxis2']['domain'] = [0, 1.01 - (y1_height / height)]
    return fig