def exclude_non_alpha(path, include=False):
    """Toggle exclusion for every dimension whose term is not purely alphabetic.

    :param path: Path to the SQLite database.
    :param include: When True, matching dimensions are included instead of
        excluded (the flag passed to process_dimension is inverted).
    """
    connection = sqlite3.connect(path)
    cursor = connection.cursor()
    # Materialize the matching ids first, then apply the changes.
    non_alpha_ids = [
        dim[0] for dim in util.get_dimensions(cursor)
        if not is_alpha_str(dim[1])
    ]
    for dim_id in non_alpha_ids:
        process_dimension(cursor, dim_id, not include)
    cursor.close()
    connection.commit()
    connection.close()
def exclude_ngrams_shorter_than(path, N, include=False):
    """Toggle exclusion for dimensions whose term fails is_ngram_longer_than(term, N).

    :param path: Path to the SQLite database.
    :param N: Length threshold forwarded to is_ngram_longer_than.
    :param include: When True, matching dimensions are included instead of
        excluded (the flag passed to process_dimension is inverted).
    """
    connection = sqlite3.connect(path)
    cursor = connection.cursor()
    # Collect matching ids up front, then apply the changes.
    short_ngram_ids = [
        dim[0] for dim in util.get_dimensions(cursor)
        if not is_ngram_longer_than(dim[1], N)
    ]
    for dim_id in short_ngram_ids:
        process_dimension(cursor, dim_id, not include)
    cursor.close()
    connection.commit()
    connection.close()
def exclude_shorter_than(path, N, include=False):
    """Exclude all dimensions with term shorter than N characters.

    :param path: Path to the SQLite database.
    :param N: Minimum term length in characters.
    :param include: When True, matching dimensions are marked included
        (Exclude = 0) instead of excluded (Exclude = 1).
    """
    connection = sqlite3.connect(path)
    cursor = connection.cursor()
    exclude_flag = 0 if include else 1
    # Materialize the matches first so the UPDATEs below do not interleave
    # with reads on the same cursor.
    too_short = [d for d in util.get_dimensions(cursor) if len(d[1]) < N]
    for dim_id, _term, _, _ in too_short:
        cursor.execute(
            'UPDATE Dimensions SET Exclude = ? WHERE DimensionId = ?',
            (exclude_flag, dim_id))
    cursor.close()
    connection.commit()
    connection.close()
def exclude_regex(path):
    """Exclude all dimensions that match an exclusion regex.

    Re-evaluates every dimension against the configured exclusion regexes
    and flips the Exclude flag only for rows whose stored value disagrees
    with the regex verdict.

    :param path: Path to the SQLite database.
    """
    connection = sqlite3.connect(path)
    cursor = connection.cursor()
    patterns = util.get_all_exclude_regex(cursor)
    all_dim = util.get_dimensions(cursor)
    update_sql = 'UPDATE Dimensions SET Exclude = ? WHERE DimensionId = ?'
    for dim_id, term, pos, currently_excluded in all_dim:
        target = int(match_exclude_regex(pos, term, patterns))
        # Only touch rows whose flag actually changes.
        if currently_excluded != target:
            cursor.execute(update_sql, (target, dim_id))
    cursor.close()
    connection.commit()
    connection.close()
def index(filename, options): """ Perform indexing. Each document is stemmed, and then the non-excluded dimensions are counted for that document. The result is put into the DocumentsToDimensions table. """ conn = sqlite3.connect(filename) c = conn.cursor() params = util.get_params(c, filename) stemmer = params['stemmer'] print 'index(): stemmer: %s' % stemmer stemmers = { 'porter' : nltk.PorterStemmer(), 'lancaster' : nltk.LancasterStemmer() } try: stemmer = stemmers[stemmer] except KeyError: print 'unsupported stemmer:', stemmer return 1 all_dim = util.get_dimensions(c, 0) assert all_dim, "You must calculate dimensions prior to indexing." all_include = util.get_all_include_regex(c) c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents') num_total_docs = int(c.fetchone()[0]) c.execute('DELETE FROM DocumentsToDimensions') c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'bigram'") nBigrams = int(c.fetchone()[0]) print 'Number of bigrams: ', nBigrams do_bigrams = nBigrams > 0 c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'trigram'") nTrigrams = int(c.fetchone()[0]) print 'Number of trigrams: ', nTrigrams do_trigrams = nTrigrams > 0 # # If the POS column contains "unigram", then it means we didn't perform POS tagging when calculating dimensions. # c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'unigram'") pos_tag = int(c.fetchone()[0]) == 0 cmd = 'SELECT ED_ENC_NUM FROM Documents' if options.limit: cmd += ' LIMIT %d' % options.limit num_total_docs = min(options.limit, num_total_docs) # # TODO: why is fetchmany not working? # #document_ids = c.execute(cmd).fetchmany() document_ids = [] for row in c.execute(cmd): document_ids.append(row[0]) print "fetched %d document ids" % len(document_ids) num_batches = int(math.ceil(len(document_ids)/options.batch_size)) # # Set up multiprocessing. 
# # MAIN_PROCESS -> document_id_queue -> WORKER_PROCESSES # # Each worker subprocess reads a document from the SQL database, processes it, and writes back to the database.. # document_id_queue = multiprocessing.Queue() proc_queue = multiprocessing.Queue() for i in xrange(num_batches): start = i*options.batch_size end = start+options.batch_size document_id_queue.put(Batch(start, document_ids[start:end])) for i in range(options.subprocesses): document_id_queue.put(None) # # Terminate the SQL connection so that the subprocesses can use it. # conn.commit() conn.close() # # https://docs.python.org/2/library/array.html#module-array # counter = multiprocessing.Value("I") pr_list = [] for i in range(options.subprocesses): args = (document_id_queue, filename, stemmer, all_include, pos_tag, do_bigrams, do_trigrams, all_dim, counter) p = multiprocessing.Process(target=worker_subprocess, args=args) p.start() pr_list.append(p) # # Wait for all worker subprocesses to complete. # for i, p in enumerate(pr_list): p.join() # # Calculate IDF weighting. # conn = sqlite3.connect(filename) c = conn.cursor() for dim_id, _, _ in all_dim: c.execute("""SELECT COUNT(DimensionId) FROM DocumentsToDimensions WHERE DimensionId = ?""", (dim_id,)) freq = int(c.fetchone()[0]) idf = log10(num_total_docs/(1+freq)) c.execute( 'UPDATE Dimensions SET IDF = ? WHERE DimensionId = ?', (idf, dim_id)) # # Save and exit. # conn.commit() c.close()
def mrmr(c, path):
    """
    Perform automatic mRMR feature selection using the specified cursor.

    Changes are persisted to the database using the cursor.

    :param c: Open SQLite cursor; UPDATEs are issued through it, the
        caller is responsible for committing.
    :param path: Database path, forwarded to util.get_params.
    """
    params = util.get_params(c, path)
    #
    # mRMR feature selection
    #
    include_dim = set()
    exclude_dim = set()
    all_dim = util.get_dimensions(c, 0)
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents WHERE Score > 0')
    num_positive_docs = int(c.fetchone()[0])
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents WHERE Score < 0')
    num_negative_docs = int(c.fetchone()[0])
    #
    # The part below is ported from filterFeatures() of reference.py
    #
    cu = params['C_UPPERCUTOFF'] * num_total_docs
    ccp = params['C_CLASSCUTOFF'] * num_positive_docs
    ccm = params['C_CLASSCUTOFF'] * num_negative_docs
    lcp = params['C_LOWERCUTOFF'] * num_positive_docs
    lcm = params['C_LOWERCUTOFF'] * num_negative_docs
    #
    # The original script didn't have any comments, so here's my guess of what
    # individual variables represent.
    #
    # cu    Upper cut-off.  If a feature occurs in more than cu documents,
    #       then it should be excluded.
    # ccp   Upper class cut-off for positive documents.
    # lcp   Lower class cut-off for positive documents.
    #       If the frequency of a feature within positive documents
    #       falls within this interval, then it should be excluded.
    # ccm   Upper class cut-off for negative documents.
    # lcm   Lower class cut-off for negative documents.
    #       If the frequency of a feature within negative documents
    #       falls within this interval, then it should be excluded.
    #
    for (dim_id, _, _) in all_dim:
        text_count, plus_count, minus_count = 0, 0, 0
        c.execute(
            """SELECT Score FROM Documents
            INNER JOIN DocumentsToDimensions
            ON Documents.ED_ENC_NUM = DocumentsToDimensions.ED_ENC_NUM
            WHERE DimensionId = ?""", (dim_id, ))
        #
        # BUG FIX: the cursor yields one-element row tuples, so the original
        # `for score in c: ... if score > 0` compared a *tuple* against an
        # int (always True on Python 2, TypeError on Python 3).  Unpack the
        # single Score column before comparing.
        #
        for (score,) in c.fetchall():
            text_count += 1
            if score > 0:
                plus_count += 1
            elif score < 0:
                minus_count += 1
        if params['USE_UPPERCUTS'] and text_count > cu:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        elif params['USE_CLASSCUTS'] and minus_count > ccm and plus_count > ccp:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        elif params['USE_LOWERCUTS'] and minus_count < lcm and plus_count < lcp:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        else:
            if dim_id in exclude_dim:
                exclude_dim.remove(dim_id)
            include_dim.add(dim_id)
    #
    # end of ported code.
    #
    print('mRMR enabled:', len(include_dim), 'disabled:', len(exclude_dim))
    # A dimension must never be both included and excluded.
    assert not include_dim.intersection(exclude_dim)
    for dim in include_dim:
        c.execute('UPDATE Dimensions SET Exclude = 0 WHERE DimensionId = ?',
                  (dim, ))
    for dim in exclude_dim:
        c.execute('UPDATE Dimensions SET Exclude = 1 WHERE DimensionId = ?',
                  (dim, ))
def __setitem__(self, sym_name, input_expr):
    """Assigns a function or expression to a new symbol, performs unit
    conversion where appropriate.

    sym_name may be a string such as "f(x)[kg]" or a sympy object; it is
    parsed by self.parse_key into (symbol, args, lhs_units, lhs_expr).
    input_expr is either a callable (registered directly) or an
    expression/string that is parsed, unified with the unit registry, and
    wrapped into a vectorized function stored under both the symbol and
    its function type.
    """
    if not isinstance(sym_name, str):
        sym_name = str(sym_name)
    symbol, args, lhs_units, lhs_expr = self.parse_key(sym_name)
    if hasattr(input_expr, '__call__'):
        # Callables bypass expression parsing/unit unification entirely.
        self.register_function(input_expr, symbol, lhs_expr, lhs_units)
    else:
        if self.verbose:
            print(
                "\n\nPARSING WITH UNIFY", lhs_expr, symbol, lhs_units,
                len(lhs_units), type(lhs_units))
            print('symbol registry:', self.symbol_registry)
        # Parse the right-hand side against the currently known symbols.
        rhs_expr = self.parse_value(input_expr, self.symbol_registry)
        if self.verbose:
            print('parsed rhs_expr', rhs_expr)
        if not isinstance(symbol, Symbol):
            if isinstance(lhs_expr, Symbol):
                # Promote a bare symbol lhs to a function of the rhs's
                # free symbols (argument order set by free_symbols —
                # NOTE(review): that ordering is not deterministic in
                # sympy; presumably handled downstream).
                symbol = Function(lhs_expr)(*tuple(rhs_expr.free_symbols))
            else:
                #lhs is already a function
                symbol = lhs_expr
            lhs_str = str(symbol)
            # Keep sym_name textually in sync with the promoted symbol.
            sym_name = sym_name.replace(str(lhs_expr), lhs_str)
        if self.verbose:
            print('unit registry contents:')
            for k, v in self.unit_registry.items():
                print('\t', k, type(k), v)
        if '[' in sym_name:
            # The key carries explicit units (e.g. "f(x)[kg]"): record
            # them in the unit registry.
            if self.verbose:
                print('updating unit registry with {} -> {}'.format(
                    sym_name, rhs_expr))
            rhs = rhs_expr
            arg_units = get_arg_units(rhs_expr, self.unit_registry)
            if self.verbose:
                print(arg_units)
            sym_name = self.update_unit_registry(sym_name, arg_units)
            if self.verbose:
                print('unit registry update returned', sym_name,
                      self.unit_registry.get(symbol))
        else:
            # No explicit units on the lhs: infer them from the rhs
            # expression instead.
            if self.verbose:
                print(sym_name, symbol,
                      'had no units. Getting units from {}'.format(rhs_expr))
            expr_unit = get_expr_unit(rhs_expr, self.unit_registry,
                                      self.verbose)
            arg_units = get_arg_units(rhs_expr, self.unit_registry)
            if self.verbose:
                print('registering {} with {} {}'.format(
                    symbol, expr_unit, arg_units))
            if (symbol not in self.unit_registry) and (expr_unit is not None):
                # Two-step mapping: symbol -> symbol-with-arg-units ->
                # expression unit.
                self.unit_registry[symbol] = symbol.subs(arg_units)
                self.unit_registry[symbol.subs(arg_units)] = expr_unit
            if expr_unit is not None:
                expr_dimensions = Dimension(get_dimensions(expr_unit))
                if expr_dimensions != Dimension(1):
                    lhs_units = str(get_abbrev(get_expr_unit(
                        expr_unit, self.unit_registry, self.verbose)))
                else:
                    # Dimensionless result: no lhs units.
                    lhs_units = ''
            if self.verbose:
                print('registered lhs_units', lhs_units)
        rhs = rhs_expr
        sym_name = str(sym_name)
        if len(lhs_units) > 0:
            # Unit-aware rewrite: unify the equation so the rhs is
            # expressed in the lhs's units.
            if self.verbose:
                print('about to unify lhs_units {} {} with {}'.format(
                    lhs_units, type(lhs_units), rhs))
            expr = unify(
                Eq(parse_expr(sym_name), rhs),
                self.unit_registry,
                # to_symbol = symbol,
                verbose=self.verbose)
            rhs_expr = expr.rhs
        if self.verbose:
            print('symbol after unify', symbol, type(symbol), rhs_expr)
            print('unit registry to resolve units:')
            for k,v in self.unit_registry.items():
                print('\t{}:{}'.format(k,v))
        # Resolve the display units for the (possibly unified) symbol.
        units = get_expr_unit(symbol, self.unit_registry)
        if Dimension(get_dimensions(units)) != Dimension(1):
            units = get_abbrev(units)
            if units is not None:
                units = str(units)
            else:
                units = ''
        else:
            units = ''
        if self.verbose:
            print('units after resolve', symbol, units)
            for k, v in self.unit_registry.items():
                print('\t{}: {}'.format(k, v))
        rhs_args = rhs_expr.free_symbols
        # Ensure the stored symbol's arguments match the rhs free symbols.
        symbol = self.check_or_replace_symbol(symbol, rhs_args, rhs_expr)
        self.validate_function(symbol, rhs_expr)
        # Snapshot of all currently registered functions, keyed by name,
        # so the new function can call previously defined ones.
        composition = {str(k_): self[k_] for k_ in self}
        arg_units = {}
        if symbol in self.unit_registry:
            unit_args = self.unit_registry[symbol]
            if unit_args is not None:
                # Only map per-argument units when the arity matches.
                if len(unit_args.args) == len(symbol.args):
                    for arg, unit in zip(symbol.args, unit_args.args):
                        arg_units[str(arg)] = str(get_abbrev(unit))
        func = self.vectorize_function(symbol, rhs_expr, composition)
        meta = dict(units=units, arg_units=arg_units)
        func.meta = meta
        func.data = None
        self.register_signature(symbol, units, lhs_expr, rhs_expr)
        func._repr_latex_ = lambda: self.func_latex(str(type(symbol)),
                                                    mode='inline')
        # Register under both the applied symbol and its function type so
        # lookups by either succeed.
        super(Kamodo, self).__setitem__(symbol, func)
        super(Kamodo, self).__setitem__(type(symbol), self[symbol])
        self.register_symbol(symbol)
def index(filename, nlp):
    """
    Perform indexing (single-process variant).

    Each document is stemmed, and then the non-excluded dimensions are
    counted for that document.  The result is put into the
    DocumentsToDimensions table.

    :param filename: Path to the SQLite database.
    :param nlp: NLP pipeline object forwarded to main_process.
    """
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    params = util.get_params(c, filename)
    stemmer = params['stemmer']
    print('index(): stemmer: %s' % stemmer)
    all_dim = util.get_dimensions(c, 0)
    assert all_dim, "You must calculate dimensions prior to indexing."
    all_include = util.get_all_include_regex(c)
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])
    # Re-index from scratch.
    c.execute('DELETE FROM DocumentsToDimensions')
    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'bigram'")
    nBigrams = int(c.fetchone()[0])
    print('Number of bigrams: ', nBigrams)
    do_bigrams = nBigrams > 0
    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'trigram'")
    nTrigrams = int(c.fetchone()[0])
    print('Number of trigrams: ', nTrigrams)
    do_trigrams = nTrigrams > 0
    #
    # If the POS column contains "unigram", then it means we didn't perform
    # POS tagging when calculating dimensions.
    #
    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'unigram'")
    pos_tag = int(c.fetchone()[0]) == 0
    cmd = 'SELECT ED_ENC_NUM FROM Documents'
    document_ids = [row[0] for row in c.execute(cmd)]
    print("fetched %d document ids" % len(document_ids))
    #
    # Terminate the SQL connection so that main_process can use it.
    #
    conn.commit()
    conn.close()
    main_process(nlp, document_ids, filename, stemmer, all_include, pos_tag,
                 do_bigrams, do_trigrams, all_dim)
    #
    # Calculate IDF weighting.
    #
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    for dim_id, _, _ in all_dim:
        c.execute(
            """SELECT COUNT(DimensionId) FROM DocumentsToDimensions
            WHERE DimensionId = ?""", (dim_id, ))
        freq = int(c.fetchone()[0])
        idf = log10(num_total_docs / (1 + freq))
        c.execute('UPDATE Dimensions SET IDF = ? WHERE DimensionId = ?',
                  (idf, dim_id))
    #
    # Save and exit.  BUG FIX: the original committed and closed the cursor
    # but leaked the second connection; close it as well.
    #
    conn.commit()
    c.close()
    conn.close()
def create_alignment(layout, reference_name, upload_object, upload_timestamp,
                     sample_object, sample_timestamp, palette_name,
                     reference_layout, consensus_sequence):
    '''Create alignment.

    Builds a plotly figure (heatmap of the alignment, conservation bar
    chart, WebGL text overlay, position slider) from whichever sequence
    source — upload or sample — carries the newer timestamp.  Returns ''
    when the chosen source is None.
    '''
    # Prefer the most recently updated of the two sequence sources.
    if int(upload_timestamp) > int(sample_timestamp):
        seq_object = upload_object
        if seq_object is None:
            return ''
        seq_lines = parse_seq_object(seq_object)
    else:
        if sample_object is None:
            return ''
        seq_lines = sample_object
    # Checkbox inputs arrive as [True] / [] — normalize to plain booleans.
    consensus_sequence = bool(consensus_sequence)
    names, seqs, conservation = parse_sequences(seq_lines, consensus_sequence)
    x, y, n_seqs, sequence_length = get_dimensions(seqs)
    try:
        ordered_names, ordered_seqs = get_msa_order(reference_name, names, seqs)
    except KeyError:
        # Reference sequence not in the alignment; keep the input order.
        ordered_names, ordered_seqs = names, seqs
    palette = COLOR_DIC[palette_name]
    # I am sort of misusing the checkbox for the alignment layout. Really this
    # should be returning True/False rather than [True] and []
    # BUG FIX: the falsy branch assigned a misspelled `refrence_layout`,
    # leaving `reference_layout` holding its raw list value.
    reference_layout = bool(reference_layout)
    text_values, text_colors, block_values, block_colors = \
        alignment_layout(ordered_seqs, layout, palette, reference_layout,
                         BASE_DIC)
    trace = go.Heatmap(
        z=block_values,
        colorscale=block_colors,
        showscale=False,
    )
    # One slider step per 31-column window across the alignment.
    steps = [{
        'args': ['xaxis', {
            'range': [-0.5 + e, 30.5 + e]
        }],
        'method': 'relayout',
        'label': ''
    } for e in range(sequence_length - 30)]
    webgl_text = {
        'type': 'scattergl',
        'mode': 'text',
        'x': x,
        'y': y,
        'text': text_values,
        'yaxis': 'y2',
        'textfont': {
            'size': 18,
            'color': text_colors
        }
    }
    bar_trace = {
        'type': 'bar',
        'x': list(range(sequence_length)),
        'y': conservation,
        'marker': {
            'color': '#1e90ff'
        }
    }
    fig = tools.make_subplots(rows=2, cols=1, shared_xaxes=True,
                              vertical_spacing=0.001)
    fig.append_trace(trace, 2, 1)
    fig.append_trace(bar_trace, 1, 1)
    fig = fig.to_plotly_json()
    fig['data'].append(webgl_text)
    sliders = [dict(minorticklen=0, tickwidth=0, active=0, steps=steps)]
    fig['layout'] = dict(sliders=sliders,
                         yaxis2=dict(autorange='reversed',
                                     ticks='',
                                     ticksuffix=' ',
                                     ticktext=ordered_names,
                                     tickvals=list(
                                         np.arange(0, len(block_values))),
                                     showticklabels=True),
                         yaxis=dict(ticks='',
                                    ticksuffix=' ',
                                    showticklabels=False,
                                    domain=[0.7, 1]),
                         margin=go.layout.Margin(l=200, r=50, b=0, t=50,
                                                 pad=0),
                         height=((n_seqs * 50) + 100),
                         xaxis={'range': [-0.5, 30.5]},
                         showlegend=False)
    # Recompute subplot domains so the conservation bar keeps a fixed pixel
    # height regardless of the number of sequences.
    height = (fig['layout']['height'] - fig['layout']['margin']['t'] -
              fig['layout']['margin']['b'])
    y1_height = 65  #px
    fig['layout']['yaxis']['domain'] = [1 - y1_height / height, 1]
    fig['layout']['yaxis2']['domain'] = [0, 1.01 - (y1_height / height)]
    return fig