def compute_boundary(trace, eps=0.1):
    refinements = mdt.volume_guided_refinement([rectangle], phi(trace))
    return list(
        fn.pluck(1, fn.first(
            fn.dropwhile(lambda x: -min(fn.pluck(0, x)) > eps, refinements))))
def get_reps(zip_code):
    representatives_ids_url = ("https://congress.api.sunlightfoundation.com/legislators/locate?zip="
                               + zip_code + "&fields=bioguide_id&apikey=a5871887a24348d1a40d969832721c91")
    # materialize so the membership test below can be repeated even if pluck returns an iterator
    representative_ids_list = list(pluck("bioguide_id",
                                         requests.get(representatives_ids_url).json()['results']))
    bill_ids_url = ("https://congress.api.sunlightfoundation.com/bills?query=NASA"
                    "&fields=bill_id&last_vote_at__exists=true&apikey=a5871887a24348d1a40d969832721c91")
    bill_ids = "|".join(pluck("bill_id", requests.get(bill_ids_url).json()['results']))
    all_votes_url = ("https://congress.api.sunlightfoundation.com/votes?bill_id__in=" + bill_ids
                     + "&fields=voters&vote_type=passage&apikey=a5871887a24348d1a40d969832721c91")
    all_votes = pluck("voters", requests.get(all_votes_url).json()['results'])
    scores = dict(total=0, voters={})
    for vote in all_votes:
        scores['total'] += 1
        for voter, value in vote.items():  # .iteritems() is Python 2 only
            if voter in representative_ids_list:
                if voter not in scores['voters']:
                    scores['voters'][voter] = value['voter']
                    scores['voters'][voter]['score'] = 0
                if value['vote'] == "Nay":
                    scores['voters'][voter]['score'] -= 1
                elif value['vote'] == "Yea":
                    scores['voters'][voter]['score'] += 1
    pprint.pprint(scores, width=1)
    return json.dumps(scores)
def top_k_custom_vector_similarity(query_docs, candidate_docs, k):
    """Get the k most similar docs by comparing custom-trained embedding means"""
    if not all('embedding_mean' in example
               for example in np.concatenate((query_docs, candidate_docs))):
        raise ValueError('Not all embedding means have been calculated')
    distances = cosine_distances(pluck('embedding_mean', query_docs),
                                 pluck('embedding_mean', candidate_docs))
    closest_docs = np.argsort(distances, axis=1)[:, :k]
    return closest_docs
def apply_model_stream(docs):
    global model
    ids = _.pluck('id', docs)
    msgs = map(lambda x: _.flatten([x])[0], _.pluck('msg', docs))
    z = model['vect'].transform(msgs)
    z = model['ch2'].transform(z)
    z = model['tfidf'].transform(z)
    pred = model['clf'].predict_proba(z)
    for i in range(0, len(ids)):
        yield {
            "_id": ids[i],
            "_type": config['elasticsearch']['_type'],
            "_index": config['elasticsearch']['_to_index'],
            "_op_type": "update",
            "doc": {
                '__meta__': {
                    'tri_pred': {
                        'neg': float(pred[i][0]),
                        'neut': float(pred[i][1]),
                        'pos': float(pred[i][2])
                    }
                }
            }
        }
def record_model_stats(self, predict_proba, model_index):
    """Add to a dataframe with data on each model's accuracy on each pool"""
    for pool_index, pool in enumerate(self.test_pools):
        prediction_df = results_df(predict_proba, pluck('content', pool), pluck('label', pool))
        prediction_df['model'] = model_index
        prediction_df['pool'] = pool_index
        self.model_results = pd.concat((self.model_results, prediction_df))
def test_make_signal():
    sig1 = signal(DATA1, start=0, end=4, tag='x')
    sig2 = signal(DATA1, start=1, end=2, tag='x')
    assert sig1 | sig2 == sig1
    assert set(fn.pluck(0, DATA1)) == set(sig1.times())
    assert set(fn.pluck(0, DATA1)) > set(sig2.times())
    assert len(sig2.times()) == 1
    assert set(fn.pluck(1, DATA1)) == {v['x'] for v in sig1.values()}
def find_bb(bounds):
    bounds_1D = reduce(operator.concat, bounds)
    lbs = [np.array([k.bot for k in fn.pluck(i, list(fn.pluck(0, bounds_1D)))]).min()
           for i in range(len(bounds_1D[0]))]
    ubs = [np.array([k.top for k in fn.pluck(i, list(fn.pluck(0, bounds_1D)))]).max()
           for i in range(len(bounds_1D[0]))]
    return np.array([[ubs[i] - lbs[i]] for i in range(len(lbs))]), lbs, ubs
def log_accuracy(self, predict_proba, model_number):
    """Log model accuracy"""
    train_accuracy = results_df(predict_proba,
                                pluck('content', self.training_data),
                                pluck('label', self.training_data)).correct.mean()
    test_accuracy = self.model_results[self.model_results.model == model_number].correct.mean()
    self.log('Train accuracy: {:.2f}'.format(train_accuracy))
    self.log('Test accuracy: {:.2f}'.format(test_accuracy))
def mysql_to_file(cls):
    conn = client.conn()
    try:
        sql = "show table status"
        tables_infos = conn.query(sql)
        data = {}
        os.popen('rm ./public/toml/* -f')
        os.popen('rm ./public/json/* -f')
        xls_table_map = get_xls_table_map()
        table_names = _.pluck("table_name", xls_table_map)
        for k in tables_infos:
            if k.Name in table_names:
                data["tableInfos"] = k
                sql = "SELECT * FROM " + k.Name
                result = conn.query(sql)
                data["record"] = result
                sql = "SHOW FULL FIELDS FROM " + k.Name
                result1 = conn.query(sql)
                data["fieldInfo"] = result1
                cls.toToml(data)
                cls.toJson(data)
        zipTool.zip_dir("./public/toml", "./public/toml.zip")
        zipTool.zip_dir("./public/json", "./public/json.zip")
        return True
    except Exception:
        traceback.print_exc()
        return False
def load_data(self, depth=1):
    if depth == 0:
        return
    self.json = self._client._get('paper/{}'.format(self.paperId))
    data = {
        'authors': [
            Author(id, self._client)
            for id in funcy.pluck('authorId', self.json['authors'])
            if id is not None
        ],
        'citationVelocity': self.json['citationVelocity'],
        'doi': self.json['doi'],
        'influentialCitationCount': self.json['influentialCitationCount'],
        'title': self.json['title'],
        'url': self.json['url'],
        'venue': self.json['venue'],
        'year': self.json['year'],
    }
    self.data = data
    if depth - 1 > 0:
        for author in self.data['authors']:
            author.load_data(depth=depth - 1)
def lexicographic_opt(func, ordering, tol):
    dim = len(ordering)
    assert set(fn.pluck(0, ordering)) == set(range(dim))
    tol /= dim  # Need to compensate for multiple binsearches.
    rec = refine.bounding_box(
        domain=mdtr.unit_rec(dim),
        oracle=func
    )
    # If polarity is True, set initial value at bounding.top.
    # O.w. use bounding.bot.
    base = tuple((rec.top if p else rec.bot)[i] for i, p in sorted(ordering))
    res_rec = mdtr.to_rec(zip(base, base))
    for idx, polarity in ordering:
        oracle = func
        rec = mdtr.to_rec(
            (0, 1) if i == idx else (p, p) for i, p in enumerate(base)
        )
        result_type, res_cand = binsearch(rec, oracle, eps=tol)
        if result_type == SearchResultType.NON_TRIVIAL:
            res_rec = res_cand
            base = res_rec.bot
    return res_rec
def load_data(self, depth=1):
    if depth == 0:
        return
    self.json = self._client._get('author/{}'.format(self.authorId))
    data = {
        'papers': [
            Paper(id, self._client)
            for id in funcy.pluck('paperId', self.json['papers'])
            if id is not None
        ],
        'citationVelocity': self.json['citationVelocity'],
        'influentialCitationCount': self.json['influentialCitationCount'],
        'name': self.json['name'],
        'url': self.json['url'],
    }
    self.data = data
    if depth - 1 > 0:
        for paper in self.data['papers']:
            paper.load_data(depth=depth - 1)
def to_signal(ts_mapping) -> DiscreteSignal:
    if isinstance(ts_mapping, DiscreteSignal):
        return ts_mapping
    start = min(fn.pluck(0, fn.cat(ts_mapping.values())))
    signals = (signal(v, start, OO, tag=k) for k, v in ts_mapping.items())
    return reduce(op.or_, signals)
def add_prediction_info(predict_proba, data):
    """Add entry for predictions and entropy to data"""
    prediction_df = results_df(predict_proba, pluck('content', data))
    for (row, predictions) in zip(data, prediction_df.to_dict(orient='records')):
        row.update(predictions)
    return data
def infer(concept_class, demos, brute_force=False):
    candidates = list(traverse(concept_class, demos))
    assert len(candidates) == 6
    if not brute_force:
        candidates = list(filter(is_candidate(demos), candidates))
    candidates = list(fn.pluck(0, candidates))
    print([score_candidate(demos)(c) for c in candidates])
    return max(candidates, key=score_candidate(demos))
def pool_n(n, comments):
    """Pool every n comments"""
    pool = lambda comments: ' '.join(pluck('body', comments))
    pools = []
    for i in range(0, len(comments), n):
        pools.append(pool(comments[i:min(len(comments), i + n)]))
    if len(pools[-1]) < n:
        pools = pools[:-1]
    return pools
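
# Illustrative usage sketch for pool_n above (not from the original source); it assumes
# pluck behaves like funcy.pluck over a list of comment dicts with a 'body' field.
comments = [{'body': 'first'}, {'body': 'second'}, {'body': 'third'}]
pools = pool_n(2, comments)
# Each pool joins n comment bodies into one string; note that the final trailing pool
# is only dropped when its *character* length is below n, as coded above.
assert pools[0] == 'first second'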
def _find_addresses(self, family):
    addresses = []
    for interface in interfaces():
        try:
            if_data = ifaddresses(interface)[family]
            addresses += pluck("addr", if_data)
        except KeyError:
            pass
    return [a.rsplit("%", 1)[0] for a in addresses]
def get_times(x, tau, lo=None, hi=None):
    end = min(v.domain.end() for v in x.values())
    hi = hi + tau if hi + tau <= end else end
    lo = lo + tau if lo + tau <= end else end
    if lo > hi:
        return []
    elif hi == lo:
        return [lo]
    all_times = fn.cat(v.slice(lo, hi).items() for v in x.values())
    return sorted(set(fn.pluck(0, all_times)))
def gather_stage(self, job):
    latest_url = _gl_url(job.source.url, 'latest')
    authors_url = _gl_url(job.source.url, 'authors')
    documents_url = _gl_url(job.source.url, 'document_author')
    latest_publication_date = model.Session.query(
        func.max(HarvestObject.metadata_modified_date)).filter(
            HarvestObject.source == job.source).scalar()
    if latest_publication_date:
        date_is_newer = F.partial(lt, latest_publication_date.isoformat())
        latest = must_be_ok(requests.get(latest_url), self._save_gather_error,
                            'Cannot fetch latest list', job).json()
        # list() so len() works on the filtered result under Python 3
        fresh_publications = list(filter(
            F.compose(date_is_newer, itemgetter('created')), latest))
        if len(fresh_publications) < len(latest):
            return self._create_harvest_objects(
                F.pluck('id', fresh_publications), job)
    authors = must_be_ok(requests.get(authors_url), self._save_gather_error,
                         'Cannot fetch authors list', job).json()
    ids = set()
    log.debug("Collecting documents from %d authors", len(authors))
    for i, author in enumerate(authors, 1):
        log.debug('Fetching %d of %d record: %s', i, len(authors), author['name'])
        documents = F.pluck(
            'id',
            must_be_ok(
                requests.get(documents_url, params={'id': author['name']}),
                self._save_gather_error,
                'Cannot fetch documents for author <%s>' % author['name'],
                job).json())
        for document in documents:
            ids.add(document)
    return self._create_harvest_objects(list(ids), job)
def iter_data(data):
    if force:
        # a generator cannot hand data back via `return`; yield it through instead
        yield from data
        return
    # Filter notes for excluding duplicates
    exist_notes = self.get_notes(pluck('user_id', data))
    for row in data:
        user_id, body = str(row['user_id']), row['body']
        if user_id not in exist_notes:
            yield row
            continue
        bodies = map(normalize_note, pluck('body', exist_notes[user_id]))
        if normalize_note(body) not in bodies:
            yield row
            continue
        logger.debug(
            'The note with this body already exists: %r', row)
def __init__(self, train, test):
    self.x_train, self.x_test = pluck('content', train), pluck('content', test)
    self.y_train, self.y_test = pluck('label', train), pluck('label', test)
    self.train_ids = pluck('id', train)
    self.test_ids = pluck('id', test)
    self.transform = DocToWordIndices().fit(self.x_train)
def get_all_tickers():
    query = {
        "size": 0,
        "aggs": {
            "ticker": {
                "terms": {
                    "field": "ticker",
                    "size": 0
                }
            }
        }
    }
    res = client.search(index=INDEX, doc_type=TYPE, body=query)
    return _.pluck('key', res['aggregations']['ticker']['buckets'])
def from_sklearn_model(cls, name, dataset, valid_percent, model,
                       split_random_seed=None, class_labels=None):
    """Constructor that splits data into train/valid and trains an sklearn model

    Args:
        name: (str) Name of the visualization (e.g. "movie review conv net")
        dataset: (dataset.TextDataset) Dataset container.
        valid_percent: (float) Percentage of data to be held out for visualization
        model: (sklearn pipeline) Model with .fit() and .predict_proba(x) methods
        split_random_seed: (int) Random seed to be used when splitting data
        class_labels: (list) List of class names
    """
    split_random_seed = split_random_seed or randint(0, 2**32 - 1)
    train_data, valid_data = train_test_split(
        dataset.data, test_size=valid_percent, random_state=split_random_seed)
    model.fit(pluck('content', train_data), pluck('label', train_data))
    return cls(name, dataset, valid_data, model.predict_proba, class_labels, train_data)
async def expand_multiple_items(item_list, relations):
    for field, (type_, subfields) in relations.items():
        path = field.split('.')
        field = path[-1]
        items_with_field = _find_items_having_path(item_list, path)
        related_ids = list(compact(pluck(field, items_with_field)))
        if not related_ids:
            continue
        related_items = await fetch_multiple_items(type_, related_ids, subfields)
        related_by_id = {item['id']: item for item in compact(related_items)}
        for item in items_with_field:
            item[field] = related_by_id.get(item[field])
    return item_list
def admin():
    if 'google_token' in session:
        auth_tok = session['google_token']
        me = session['message']
        flash("Welcome" + " " + me.get('name') + "!")
    else:
        session['message'] = {'email': ''}
        auth_tok = {'access_token': '', 'refresh_token': ''}
    superAdmins = flatten(pluck("adminUsers", app.config.get('STATIONS')))
    print(repr(superAdmins))
    users = []
    for user in mongo.db.users.find():
        users.append(user)
    return render_template('admin.html', access_token=auth_tok,
                           refresh_token=auth_tok, session=session['message'],
                           config=current_app.config, users=users,
                           superAdmins=superAdmins)
def _get_addresses(self, family):
    try:
        return self.__cached_addresses[family]
    except KeyError:
        from netifaces import interfaces, ifaddresses
        addresses = self.__cached_addresses[family] = set()
        for interface in interfaces():
            try:
                ifdata = ifaddresses(interface)[family]
                ifaddrs = map(lambda x: x.split("%")[0], pluck("addr", ifdata))
                addresses.update(ifaddrs)
            except KeyError:
                pass
        return addresses
def is_true_QBF(e, quantifiers):
    quantifiers = simplify_quantifier_prefix(quantifiers)
    aig = cmn.extract_aig(e)
    assert sum(map(len, fn.pluck(1, quantifiers))) == len(aig.inputs)
    if len(quantifiers) == 1:  # solve with SAT
        if quantifiers[0][0] == 'a':
            return is_valid(aig)
        else:
            assert quantifiers[0][0] == 'e'
            return is_satisfiable(aig)
    elif len(quantifiers) == 2:  # 2QBF
        true_return_code = CadetCodes.QBF_IS_TRUE.value
        if quantifiers[-1][0] == 'a':
            e = ~aiger.BoolExpr(aig)
            aig = e.aig
            true_return_code = CadetCodes.QBF_IS_FALSE.value
        return _call_cadet(aig, quantifiers[1][1]) == true_return_code
    else:
        raise NotImplementedError('Cannot handle general QBF at the moment')
def bounding_box(domain, oracle, eps=1e-5):
    """Compute Bounding box. TODO: clean up"""
    # TODO: remove r input and assume unit rec.
    edges = [mdts.binsearch(r2, oracle, eps=eps) for r2 in box_edges(domain)]
    # materialize: the result types are scanned twice below
    rtypes = list(fn.pluck(0, edges))
    if all(t == mdts.SearchResultType.TRIVIALLY_FALSE for t in rtypes):
        return domain
    elif all(t == mdts.SearchResultType.TRIVIALLY_TRUE for t in rtypes):
        return mdtr.to_rec(domain.dim*[[0, 0]])
    itvls = [r for t, r in edges if t == mdts.SearchResultType.NON_TRIVIAL]

    def box_to_include(r):
        return domain.backward_cone(r.top) & domain.forward_cone(r.bot)

    bbox, *recs = fn.lmap(box_to_include, itvls)
    for r in recs:
        bbox = bbox.sup(r)
    return bbox
def _find_address(self, address, family):
    localhost = None
    for interface in interfaces():
        try:
            if_data = ifaddresses(interface)[family]
            addresses = pluck("addr", if_data)
        except KeyError:
            pass
        else:
            if addresses:
                for a in addresses:
                    if a not in ("::1", "127.0.0.1"):
                        return a.rsplit("%", 1)[0]
                    if a:
                        localhost = a
    if localhost:
        return localhost
    raise Exception("Failed to guess host for interface '%s'" % address)
def __init__(self, dataset, model_fitting_func, sampling_strategy, pool_fractions,
             sample_fractions, test_frac, name=None, sort_by=None,
             sort_by_reverse=False, seed=1):
    """Instantiate experiment

    Args:
        dataset: TextDataSet for the experiment
        model_fitting_func: Function from data -> (predict_proba function) model
        sampling_strategy: Function from (n, data) -> the n examples selected for training
        pool_fractions: List of floats indicating pool sizes
        sample_fractions: List of floats indicating how many samples are taken from each pool
        test_frac: (float) Fraction of each pool to reserve for testing
        sort_by: (str) Key to sort by when creating pools (e.g. publication date)
        sort_by_reverse: (bool) If True, sort by descending order when creating pools
        seed: (int) Seed for random state
    """
    self.dataset = dataset
    self.name = name or self.dataset.name
    if sort_by is not None:
        self.dataset.data = sorted(self.dataset.data,
                                   key=lambda example: example[sort_by],
                                   reverse=sort_by_reverse)
    self.train_pools, self.test_pools = create_pools(
        self.dataset.data, pool_fractions, test_frac, seed)
    self.sample_from_pool = sampling_strategy
    self.fit_model = model_fitting_func
    self.sample_fractions = sample_fractions
    # initialize the training data with the first pool
    self.training_data = self.train_pools[0]
    self.model_results = pd.DataFrame()
def get_data(self):
    all_transactions = filter(
        lambda t: t["type"] in ("request-changes", "accept"),
        cat(pluck("transactions", self.raw_data)),
    )
    accept_transactions, reject_transactions = split(
        lambda t: t["type"] == "accept", all_transactions)
    most_accepting_author, most_accepting_count = Counter(
        count_by(itemgetter("authorPHID"), accept_transactions)).most_common(1)[0]
    most_rejecting_author, most_rejecting_count = Counter(
        count_by(itemgetter("authorPHID"), reject_transactions)).most_common(1)[0]
    return (
        {
            "author": self.users_mapping[most_accepting_author],
            "count": most_accepting_count,
        },
        {
            "author": self.users_mapping[most_rejecting_author],
            "count": most_rejecting_count,
        },
    )
def __init__(self, train, test, embedding_size=128, lstm_cell_size=64,
             embedding_dropout_prob=.5, dropout_prob=.5, kernel_l2=0.01):
    self.x_train, self.x_test = pluck('content', train), pluck('content', test)
    self.y_train, self.y_test = pluck('label', train), pluck('label', test)
    self.train_ids = pluck('id', train)
    self.test_ids = pluck('id', test)
    self.transform = DocToWordIndices().fit(self.x_train)
    self.x_train = self.transform.transform(self.x_train)
    self.x_test = self.transform.transform(self.x_test)
    vocab_size = np.max(self.x_train) + 1  # vocab and classes are 0 indexed
    n_labels = int(np.max(self.y_train)) + 1
    self.y_train, self.y_test = to_categorical(self.y_train), to_categorical(self.y_test)
    self.model = Sequential()
    self.model.add(Embedding(vocab_size, embedding_size, input_length=self.x_train.shape[1]))
    self.model.add(Dropout(embedding_dropout_prob))
    self.model.add(Bidirectional(LSTM(lstm_cell_size,
                                      kernel_regularizer=L1L2(l1=0.0, l2=kernel_l2))))
    self.model.add(Dropout(dropout_prob))
    self.model.add(Dense(n_labels, activation='softmax'))
    self.model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
def sample(self, Y, K=2.5e6, h=2e3, α_hat=None, Z_hat=None):
    K, h = int(K), int(h)
    result = defaultdict(list)

    def compute_acceptance_rates():
        # shared by the normal and interrupted exit paths below
        return {
            name: np.mean(list(funcy.pluck(
                'accepted', result['{}_detail'.format(name)])))
            for name in ['α', 'Z']
        }

    try:
        # compute MLE and center it at origin
        if α_hat is None or Z_hat is None:
            α_hat, Z_hat = self.mle(Y)
        Z_hat = Z_hat - np.mean(Z_hat, axis=1).reshape(-1, 1)
        result['α_hat'] = α_hat
        result['Z_hat'] = Z_hat
        Z_k = Z_hat.copy()
        α_k = α_hat.copy()

        def update_Z(α_k, Z_k, detail=False):
            Z_dag = self.qZ(Z_k)
            # compute acceptance probability
            n1 = self.loglikelihood(Y, α_k, Z_dag)
            d1 = self.loglikelihood(Y, α_k, Z_k)
            n2 = self.logpz(Z_dag)
            d2 = self.logpz(Z_k)
            p = np.exp(n1 + n2 - d1 - d2)
            a = min(1, p)
            # accept/reject step
            u = scipy.stats.uniform.rvs()
            accepted = u < a
            if accepted:
                Z_k = Z_dag
            if detail:
                _detail = {
                    'Z_dag': Z_dag, 'n1': n1, 'd1': d1, 'n2': n2, 'd2': d2,
                    'p': p, 'a': a, 'u': u, 'accepted': accepted,
                }
            else:
                _detail = None
            return Z_k, _detail

        def update_α(α_k, Z_k, detail=False):
            α_dag = self.qα(α_k)
            # compute acceptance probability
            n1 = self.loglikelihood(Y, α_dag, Z_k)
            d1 = self.loglikelihood(Y, α_k, Z_k)
            n2 = self.logpa(α_dag)
            d2 = self.logpa(α_k)
            p = np.exp(n1 + n2 - d1 - d2)
            a = min(1, p)
            # accept/reject step
            u = scipy.stats.uniform.rvs()
            accepted = u < a
            if accepted:
                α_k = α_dag
            if detail:
                _detail = {
                    'α_dag': α_dag, 'n1': n1, 'd1': d1, 'n2': n2, 'd2': d2,
                    'p': p, 'a': a, 'u': u, 'accepted': accepted,
                }
            else:
                _detail = None
            return α_k, _detail

        logger.info('Sampling...')
        for k in tqdm(range(K)):
            # record results every h iterations
            record_iter = k % h == 0
            Z_k, Z_detail = update_Z(α_k, Z_k, detail=record_iter)
            α_k, α_detail = update_α(α_k, Z_k, detail=record_iter)
            if record_iter:
                Z_k_star = self._compute_procrustean_transformation(Z_k, Z_hat)
                result['α'].append(α_k)
                result['Z'].append(Z_k_star)
                result['α_detail'].append(α_detail)
                result['Z_detail'].append(Z_detail)
        logger.info('Sampling...DONE')
        logger.info('Acceptance rates: {!r}'.format(compute_acceptance_rates()))
        return result
    except KeyboardInterrupt:
        logger.info('Sampling...ABORTED')
        logger.info('Acceptance rates: {!r}'.format(compute_acceptance_rates()))
        return result
def zip_pluck(d, keys, enumerate=False):
    args = [pluck(k, d) for k in keys]
    if enumerate:
        args = [count(), *args]
    return zip(*args)
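
# Illustrative usage sketch for zip_pluck above (not from the original source). It
# assumes pluck is funcy.pluck and count is itertools.count, which matches how the
# helper is written; adjust the imports if the surrounding module differs.
from itertools import count
from funcy import pluck

rows = [{'id': 1, 'label': 'pos'}, {'id': 2, 'label': 'neg'}]
assert list(zip_pluck(rows, ['id', 'label'])) == [(1, 'pos'), (2, 'neg')]
# With enumerate=True an index column from count() is prepended to each tuple.
assert list(zip_pluck(rows, ['id'], enumerate=True)) == [(0, 1), (1, 2)]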
def dense_data_and_labels(data_and_labels):
    X = hstack(pluck(0, data_and_labels)).T
    dense_labels = hstack([label * ones(data.shape[1])
                           for data, label in data_and_labels])
    return X, dense_labels
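
# Illustrative usage sketch for dense_data_and_labels above (not from the original
# source); it assumes hstack/ones come from numpy and pluck behaves like funcy.pluck,
# i.e. each pair is (features x samples array, label).
import numpy as np
from numpy import hstack, ones
from funcy import pluck

pairs = [(np.zeros((3, 2)), 0), (np.ones((3, 4)), 1)]
X, y = dense_data_and_labels(pairs)
assert X.shape == (6, 3)             # samples stacked along rows after the transpose
assert list(y) == [0, 0, 1, 1, 1, 1]  # one label per sample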
# The cleaning helper below is reconstructed: the snippet starts mid-function, but the
# `.map(clean)` call further down implies a function of this shape.
def clean(x):
    if isinstance(x['date'], list):
        x['date'] = x['date'][0]
    return x


def add_boarddate_and_indicator(x):
    try:
        x['boarddate'] = x['board_id'] + '_' + x['date'].split(' ')[0]
        x['hv'] = x['boarddate'] in boarddates
    except Exception:
        x['boarddate'] = None
        x['hv'] = None
    return x


hv = sc.textFile('data/high_volume_days.csv').map(lambda x: x.split(','))
hv = hv.map(lambda x: {'board': x[0], 'date': x[1], 'count': x[2],
                       'boarddate': x[0] + '_' + x[1]})
boarddates = _.pluck('boarddate', hv.collect())

inpath = 'data/post_dump_spark.json.txt'
raw = sc.textFile(inpath).map(lambda x: json.loads(x)).map(clean).cache()
rdd = raw.filter(lambda x: x.get('date', None))\
    .filter(lambda x: x.get('msg', None))\
    .map(add_boarddate_and_indicator)\
    .cache()
def __init__(self, train, test, **model_options):
    """Create a conv net with keras

    Args:
        train: List of train examples
        test: List of test (validation) examples
    """
    embedding_size = model_options.get('embedding_size', 128)
    filter_sizes = model_options.get('filter_sizes', [2, 3, 4])
    n_filters = model_options.get('n_filters', 25)
    pool_size = model_options.get('pool_size', 4)
    hidden_dims = model_options.get('hidden_dims', 128)
    dropout_prob = model_options.get('dropout_prob', .5)
    conv_l2 = model_options.get('conv_l2', .05)
    fc_l2 = model_options.get('fc_l2', .05)
    balance_classes = model_options.get('balance_classes', False)
    self.train_labels = pluck('label', train)
    self.x_train, self.x_test = pluck('content', train), pluck('content', test)
    self.y_train, self.y_test = pluck('label', train), pluck('label', test)
    self.train_ids = pluck('id', train)
    self.test_ids = pluck('id', test)
    self.transform = DocToWordIndices().fit(self.x_train)
    self.x_train = self.transform.transform(self.x_train)
    self.x_test = self.transform.transform(self.x_test)
    self.vocab_size = np.max(self.x_train) + 1  # vocab and classes are 0 indexed
    self.n_labels = int(np.max(self.y_train)) + 1
    self.y_train, self.y_test = to_categorical(self.y_train), to_categorical(self.y_test)
    self.sequence_length = self.x_train.shape[1]
    self.n_labels = self.y_train.shape[1]
    self.balance_classes = balance_classes

    conv_input = Input(shape=(self.sequence_length, embedding_size))
    convs = []
    for filter_size in filter_sizes:
        conv = Conv1D(activation="relu", padding="valid", strides=1,
                      filters=n_filters, kernel_size=filter_size,
                      kernel_regularizer=L1L2(l1=0.0, l2=conv_l2))(conv_input)
        pool = MaxPooling1D(pool_size=pool_size)(conv)
        flatten = Flatten()(pool)
        convs.append(flatten)
    if len(filter_sizes) > 1:
        conv_output = concatenate(convs)
    else:
        conv_output = convs[0]
    conv_layer = Model(inputs=conv_input, outputs=conv_output)

    self.model = Sequential()
    self.model.add(Embedding(self.vocab_size, embedding_size,
                             input_length=self.sequence_length, weights=None))
    self.model.add(conv_layer)
    self.model.add(Dense(hidden_dims, kernel_regularizer=L1L2(l1=0.0, l2=fc_l2)))
    self.model.add(Dropout(dropout_prob))
    self.model.add(Activation('relu'))
    self.model.add(Dense(self.n_labels, activation='softmax'))
    self.model.compile(optimizer='adam', loss='categorical_crossentropy',
                       metrics=['accuracy'])
def same_sources(s1, s2):
    '''
    Do two states have the same sources (same names and same arguments)?
    '''
    # list() so the comparison checks contents rather than iterator identity
    return (
        list(fn.pluck('name', s1['sources'])) == list(fn.pluck('name', s2['sources']))
        and list(fn.pluck('args', s1['sources'])) == list(fn.pluck('args', s2['sources']))
    )
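
# Illustrative check for same_sources above (not from the original source); it assumes
# fn is funcy and that a state's 'sources' is a list of dicts with 'name' and 'args'.
import funcy as fn

s1 = {'sources': [{'name': 'csv', 'args': {'path': 'a.csv'}}]}
s2 = {'sources': [{'name': 'csv', 'args': {'path': 'a.csv'}}]}
s3 = {'sources': [{'name': 'csv', 'args': {'path': 'b.csv'}}]}
assert same_sources(s1, s2)
assert not same_sources(s1, s3)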