Example #1
 def compute_boundary(trace, eps=0.1):
     refinements = mdt.volume_guided_refinement([rectangle], phi(trace))
     return list(
         fn.pluck(1,
                  fn.first(
                      fn.dropwhile(lambda x: -min(fn.pluck(0, x)) > eps,
                                   refinements))))
Example #2
def get_reps(zip_code):
    representatives_ids_url = "https://congress.api.sunlightfoundation.com/legislators/locate?zip=" + zip_code + "&fields=bioguide_id&apikey=a5871887a24348d1a40d969832721c91"
    representative_ids_list = pluck("bioguide_id", requests.get(representatives_ids_url).json()['results'])

    bill_ids_url = "https://congress.api.sunlightfoundation.com/bills?query=NASA&fields=bill_id&last_vote_at__exists=true&apikey=a5871887a24348d1a40d969832721c91"
    bill_ids = "|".join(pluck("bill_id", requests.get(bill_ids_url).json()['results']))

    all_votes_url = "https://congress.api.sunlightfoundation.com/votes?bill_id__in=" + bill_ids + "&fields=voters&vote_type=passage&apikey=a5871887a24348d1a40d969832721c91"
    all_votes = pluck("voters", requests.get(all_votes_url).json()['results'])

    scores = dict(total=0, voters={})
    for vote in all_votes:
        scores['total'] += 1
        for voter, value in vote.iteritems():
            if voter in representative_ids_list:
                if voter not in scores['voters']:
                    scores['voters'][voter] = value['voter']
                    scores['voters'][voter]['score'] = 0
                if value['vote'] == "Nay":
                    scores['voters'][voter]['score'] -= 1
                elif value['vote'] == "Yea":
                    scores['voters'][voter]['score'] += 1

    pprint.pprint(json.dumps(scores), width=1)
    return json.dumps(scores)
Example #3
def top_k_custom_vector_similarity(query_docs, candidate_docs, k):
    """Return the k most similar candidate docs by comparing custom-trained vector means."""
    if not all(['embedding_mean' in example for example in np.concatenate((query_docs, candidate_docs))]):
        raise ValueError('Not all embedding means have been calculated')
    distances = cosine_distances(pluck('embedding_mean', query_docs), pluck('embedding_mean', candidate_docs))
    closest_docs = np.argsort(distances, axis=1)[:, :k]
    return closest_docs
Example #4
def apply_model_stream(docs):
    global model
    
    ids  = _.pluck('id', docs)
    msgs = map(lambda x: _.flatten([x])[0], _.pluck('msg', docs))
    
    z    = model['vect'].transform(msgs)
    z    = model['ch2'].transform(z)
    z    = model['tfidf'].transform(z)
    pred = model['clf'].predict_proba(z)
    
    for i in range(0, len(ids)):
        yield {
            "_id"      : ids[i],
            "_type"    : config['elasticsearch']['_type'],
            "_index"   : config['elasticsearch']['_to_index'],
            "_op_type" : "update",
            "doc" : {
                '__meta__' : {
                    'tri_pred' : {
                        'neg'  : float(pred[i][0]),
                        'neut' : float(pred[i][1]),
                        'pos'  : float(pred[i][2])
                    }
                }
            }
        }
Example #5
def apply_model_stream(docs):
    global model

    ids = _.pluck('id', docs)
    msgs = map(lambda x: _.flatten([x])[0], _.pluck('msg', docs))

    z = model['vect'].transform(msgs)
    z = model['ch2'].transform(z)
    z = model['tfidf'].transform(z)
    pred = model['clf'].predict_proba(z)

    for i in range(0, len(ids)):
        yield {
            "_id": ids[i],
            "_type": config['elasticsearch']['_type'],
            "_index": config['elasticsearch']['_to_index'],
            "_op_type": "update",
            "doc": {
                '__meta__': {
                    'tri_pred': {
                        'neg': float(pred[i][0]),
                        'neut': float(pred[i][1]),
                        'pos': float(pred[i][2])
                    }
                }
            }
        }
Example #6
 def record_model_stats(self, predict_proba, model_index):
     """Record this model's accuracy on each test pool in the results dataframe."""
     for pool_index, pool in enumerate(self.test_pools):
         prediction_df = results_df(predict_proba, pluck('content', pool),
                                    pluck('label', pool))
         prediction_df['model'] = model_index
         prediction_df['pool'] = pool_index
         self.model_results = pd.concat((self.model_results, prediction_df))
Example #7
def test_make_signal():
    sig1 = signal(DATA1, start=0, end=4, tag='x')
    sig2 = signal(DATA1, start=1, end=2, tag='x')
    assert sig1 | sig2 == sig1

    assert set(fn.pluck(0, DATA1)) == set(sig1.times())
    assert set(fn.pluck(0, DATA1)) > set(sig2.times())
    assert len(sig2.times()) == 1
    assert set(fn.pluck(1, DATA1)) == {v['x'] for v in sig1.values()}
Example #8
 def find_bb(bounds):
     bounds_1D = reduce(operator.concat, bounds)
     lbs = [(np.array(
         [k.bot for k in fn.pluck(i, list(fn.pluck(0, bounds_1D)))])).min()
            for i in range(len(bounds_1D[0]))]
     ubs = [(np.array(
         [k.top for k in fn.pluck(i, list(fn.pluck(0, bounds_1D)))])).max()
            for i in range(len(bounds_1D[0]))]
     return np.array([[ubs[i] - lbs[i]] for i in range(len(lbs))]), lbs, ubs
Example #9
    def log_accuracy(self, predict_proba, model_number):
        """Log model accuracy"""
        train_accuracy = results_df(predict_proba,
                                    pluck('content', self.training_data),
                                    pluck('label',
                                          self.training_data)).correct.mean()
        test_accuracy = self.model_results[self.model_results.model ==
                                           model_number].correct.mean()

        self.log('Train accuracy: {:.2f}'.format(train_accuracy))
        self.log('Test accuracy: {:.2f}'.format(test_accuracy))
Example #10
 def mysql_to_file(cls):
     conn = client.conn()
     try:
         sql = "show table status"
         tables_infos = conn.query(sql)
         data = {}
         os.popen('rm ./public/toml/* -f')
         os.popen('rm ./public/json/* -f')
         xls_table_map = get_xls_table_map()
         table_names = _.pluck("table_name", xls_table_map)
         for k in tables_infos:
             if k.Name in table_names:
                 data["tableInfos"] = k
                 sql = "SELECT * FROM " + k.Name
                 result = (conn.query(sql))
                 data["record"] = result
                 sql = "SHOW FULL FIELDS FROM " + k.Name
                 result1 = conn.query(sql)
                 data["fieldInfo"] = result1
                 cls.toToml(data)
                 cls.toJson(data)
         zipTool.zip_dir("./public/toml", "./public/toml.zip")
         zipTool.zip_dir("./public/json", "./public/json.zip")
         return True
     except:
         traceback.print_exc()
         return False
Example #11
    def load_data(self, depth=1):
        if depth == 0:
            return

        self.json = self._client._get('paper/{}'.format(self.paperId))
        data = {
            'authors': [
                Author(id, self._client)
                for id in funcy.pluck('authorId', self.json['authors'])
                if id is not None
            ],
            'citationVelocity':
            self.json['citationVelocity'],
            'doi':
            self.json['doi'],
            'influentialCitationCount':
            self.json['influentialCitationCount'],
            'title':
            self.json['title'],
            'url':
            self.json['url'],
            'venue':
            self.json['venue'],
            'year':
            self.json['year'],
        }
        self.data = data

        if depth - 1 > 0:
            for author in self.data['authors']:
                author.load_data(depth=depth - 1)
Example #12
def lexicographic_opt(func, ordering, tol):
    dim = len(ordering)
    assert set(fn.pluck(0, ordering)) == set(range(dim))
    tol /= dim  # Need to compensate for multiple binsearches.

    rec = refine.bounding_box(
        domain=mdtr.unit_rec(dim),
        oracle=func
    )
    # If polarity is True, set initial value at bounding.top.
    # O.w. use bounding.bot.
    base = tuple((rec.top if p else rec.bot)[i] for i, p in sorted(ordering))

    res_rec = mdtr.to_rec(zip(base, base))
    for idx, polarity in ordering:
        oracle = func
        rec = mdtr.to_rec(
            (0, 1) if i == idx else (p, p) for i, p in enumerate(base)
        )
        result_type, res_cand = binsearch(rec, oracle, eps=tol)

        if result_type == SearchResultType.NON_TRIVIAL:
            res_rec = res_cand
            base = res_rec.bot

    return res_rec
Example #13
    def load_data(self, depth=1):
        if depth == 0:
            return

        self.json = self._client._get('author/{}'.format(self.authorId))
        data = {
            'papers': [
                Paper(id, self._client)
                for id in funcy.pluck('paperId', self.json['papers'])
                if id is not None
            ],
            'citationVelocity':
            self.json['citationVelocity'],
            'influentialCitationCount':
            self.json['influentialCitationCount'],
            'name':
            self.json['name'],
            'url':
            self.json['url'],
        }
        self.data = data

        if depth - 1 > 0:
            for paper in self.data['papers']:
                paper.load_data(depth=depth - 1)
Example #14
def to_signal(ts_mapping) -> DiscreteSignal:
    if isinstance(ts_mapping, DiscreteSignal):
        return ts_mapping

    start = min(fn.pluck(0, fn.cat(ts_mapping.values())))
    signals = (signal(v, start, OO, tag=k) for k, v in ts_mapping.items())
    return reduce(op.or_, signals)
Example #15
def add_prediction_info(predict_proba, data):
    """Add entry for predictions and entropy to data"""
    prediction_df = results_df(predict_proba, pluck('content', data))
    for (row, predictions) in zip(data,
                                  prediction_df.to_dict(orient='records')):
        row.update(predictions)
    return data
Example #16
def pool_n(n, comments):
    """Pool every n comments"""
    pool = lambda comments: ' '.join(pluck('body', comments))
    pools = []
    for i in range(0, len(comments), n):
        pools.append(pool(comments[i: min(len(comments), i+n)]))
    if len(pools[-1]) < n:
        pools = pools[:-1]
    return pools
Example #17
def infer(concept_class, demos, brute_force=False):
    candidates = list(traverse(concept_class, demos))
    assert len(candidates) == 6
    if not brute_force:
        candidates = list(filter(is_candidate(demos), candidates))

    candidates = list(fn.pluck(0, candidates))
    print([score_candidate(demos)(c) for c in candidates])
    return max(candidates, key=score_candidate(demos))
Example #18
def pool_n(n, comments):
    """Pool every n comments"""
    pool = lambda comments: ' '.join(pluck('body', comments))
    pools = []
    for i in range(0, len(comments), n):
        pools.append(pool(comments[i:min(len(comments), i + n)]))
    if len(pools[-1]) < n:
        pools = pools[:-1]
    return pools
Example #19
    def _find_addresses(self, family):
        addresses = []

        for interface in interfaces():
            try:
                if_data = ifaddresses(interface)[family]
                addresses += pluck("addr", if_data)
            except KeyError:
                pass

        return [a.rsplit("%", 1)[0] for a in addresses]
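For context, here is a minimal sketch of the netifaces structures that pluck("addr", if_data) reads in the example above. The interface name and addresses are made-up illustrations, and pluck is assumed to be the same helper used in the snippet:

from netifaces import AF_INET, ifaddresses, interfaces

iface = interfaces()[0]                        # e.g. 'eth0'
if_data = ifaddresses(iface).get(AF_INET, [])  # e.g. [{'addr': '192.168.1.23', 'netmask': '255.255.255.0'}]
addresses = list(pluck("addr", if_data))       # e.g. ['192.168.1.23']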
Example #20
def get_times(x, tau, lo=None, hi=None):
    end = min(v.domain.end() for v in x.values())
    hi = hi + tau if hi + tau <= end else end
    lo = lo + tau if lo + tau <= end else end

    if lo > hi:
        return []
    elif hi == lo:
        return [lo]

    all_times = fn.cat(v.slice(lo, hi).items() for v in x.values())
    return sorted(set(fn.pluck(0, all_times)))
Example #21
    def gather_stage(self, job):
        latest_url = _gl_url(job.source.url, 'latest')
        authors_url = _gl_url(job.source.url, 'authors')
        documents_url = _gl_url(job.source.url, 'document_author')
        latest_publication_date = model.Session.query(
            func.max(HarvestObject.metadata_modified_date)).filter(
                HarvestObject.source == job.source).scalar()

        if latest_publication_date:
            date_is_newer = F.partial(lt, latest_publication_date.isoformat())
            latest = must_be_ok(requests.get(latest_url),
                                self._save_gather_error,
                                'Cannot fetch latest list', job).json()

            fresh_publications = filter(
                F.compose(date_is_newer, itemgetter('created')), latest)
            if len(fresh_publications) < len(latest):
                return self._create_harvest_objects(
                    F.pluck('id', fresh_publications), job)
        authors = must_be_ok(requests.get(authors_url),
                             self._save_gather_error,
                             'Cannot fetch authors list', job).json()

        ids = set()
        log.debug("Collecting documents from %d authors", len(authors))
        for i, author in enumerate(authors, 1):
            log.debug('Fetching %d of %d record: %s', i, len(authors),
                      author['name'])
            documents = F.pluck(
                'id',
                must_be_ok(
                    requests.get(documents_url, params={'id': author['name']}),
                    self._save_gather_error,
                    'Cannot fetch documents for author <%s>' % author['name'],
                    job).json())

            for document in documents:
                ids.add(document)
        return self._create_harvest_objects(list(ids), job)
Example #22
        def iter_data(data):
            if force:
                return iter(data)

            # Filter notes to exclude duplicates
            exist_notes = self.get_notes(pluck('user_id', data))

            for row in data:
                user_id, body = str(row['user_id']), row['body']

                if user_id not in exist_notes:
                    yield row
                    continue

                bodies = map(normalize_note,
                             pluck('body', exist_notes[user_id]))
                if normalize_note(body) not in bodies:
                    yield row
                    continue

                logger.debug(
                    'The note with this body already exists: %r', row)
Example #23
    def __init__(self, train, test):
        self.x_train, self.x_test = pluck('content', train), pluck('content', test)
        self.y_train, self.y_test = pluck('label', train), pluck('label', test)

        self.train_ids = pluck('id', train)
        self.test_ids = pluck('id', test)

        self.transform = DocToWordIndices().fit(self.x_train)
Example #24
def get_all_tickers():
	query = {
		"size" : 0,
		"aggs" : {
			"ticker" : {
				"terms" : {
					"field" : "ticker",
					"size"  : 0
				}
			}
		}
	}
	res = client.search(index = INDEX, doc_type = TYPE, body = query)
	return _.pluck('key', res['aggregations']['ticker']['buckets'])
Example #25
def get_all_tickers():
    query = {
        "size": 0,
        "aggs": {
            "ticker": {
                "terms": {
                    "field": "ticker",
                    "size": 0
                }
            }
        }
    }
    res = client.search(index=INDEX, doc_type=TYPE, body=query)
    return _.pluck('key', res['aggregations']['ticker']['buckets'])
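For reference, a hedged sketch of the Elasticsearch terms-aggregation response that the pluck above reads. The tickers and counts are illustrative only, and _.pluck is assumed to take the key first, as in the snippets above:

res = {
    "aggregations": {
        "ticker": {
            "buckets": [
                {"key": "AAPL", "doc_count": 128},
                {"key": "MSFT", "doc_count": 97},
            ]
        }
    }
}
_.pluck('key', res['aggregations']['ticker']['buckets'])  # -> ['AAPL', 'MSFT']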
Example #26
 def from_sklearn_model(cls,
                        name,
                        dataset,
                        valid_percent,
                        model,
                        split_random_seed=None,
                        class_labels=None):
     """Constructor that splits data into train/valid and trains an sklearn model
     Args:
         name: (str) Name of the visualization (e.g. "movie review conv net")
         dataset: (dataset.TextDataset) Dataset container.
         valid_percent: (Float) Percentage of data to be held for visualization
         model: (sklearn pipeline) Model with .fit() and .predict_proba(x) methods
         split_random_seed: (int) Random seed to be used when splitting data
         class_labels: (List) List of class names
     """
     split_random_seed = split_random_seed or randint(0, 2**32 - 1)
     train_data, valid_data = train_test_split(
         dataset.data,
         test_size=valid_percent,
         random_state=split_random_seed)
     model.fit(pluck('content', train_data), pluck('label', train_data))
     return cls(name, dataset, valid_data, model.predict_proba,
                class_labels, train_data)
Example #27
async def expand_multiple_items(item_list, relations):
    for field, (type_, subfields) in relations.items():
        path = field.split('.')
        field = path[-1]
        items_with_field = _find_items_having_path(item_list, path)

        related_ids = list(compact(pluck(field, items_with_field)))
        if not related_ids:
            continue

        related_items = await fetch_multiple_items(type_, related_ids, subfields)
        related_by_id = {item['id']: item for item in compact(related_items)}

        for item in items_with_field:
            item[field] = related_by_id.get(item[field])

    return item_list
Example #28
def admin():
    if 'google_token' in session:
        auth_tok = session['google_token']
        me = session['message']
        flash("Welcome" + " " + me.get('name') + "!")
    else:
        session['message'] = {'email': ''}
        auth_tok = {'access_token': '', 'refresh_token': ''}
    superAdmins = flatten(pluck("adminUsers",
                                app.config.get('STATIONS')))
    print repr(superAdmins)
    users = []
    for user in mongo.db.users.find():
        users.append(user)
    return render_template('admin.html', access_token=auth_tok,
                           refresh_token=auth_tok, session=session['message'],
                           config=current_app.config, users=users,
                           superAdmins=superAdmins)
Example #29
    def _get_addresses(self, family):
        try:
            return self.__cached_addresses[family]
        except KeyError:
            from netifaces import interfaces, ifaddresses

            addresses = self.__cached_addresses[family] = set()

            for interface in interfaces():
                try:
                    ifdata = ifaddresses(interface)[family]
                    ifaddrs = map(lambda x: x.split("%")[0],
                                  pluck("addr", ifdata))
                    addresses.update(ifaddrs)
                except KeyError:
                    pass

            return addresses
Example #30
async def expand_multiple_items(item_list, relations):
    for field, (type_, subfields) in relations.items():
        path = field.split('.')
        field = path[-1]
        items_with_field = _find_items_having_path(item_list, path)

        related_ids = list(compact(pluck(field, items_with_field)))
        if not related_ids:
            continue

        related_items = await fetch_multiple_items(type_, related_ids,
                                                   subfields)
        related_by_id = {item['id']: item for item in compact(related_items)}

        for item in items_with_field:
            item[field] = related_by_id.get(item[field])

    return item_list
Example #31
def is_true_QBF(e, quantifiers):
    quantifiers = simplify_quantifier_prefix(quantifiers)
    aig = cmn.extract_aig(e)
    assert sum(map(len, fn.pluck(1, quantifiers))) == len(aig.inputs)
    if len(quantifiers) == 1:  # solve with SAT
        if quantifiers[0][0] == 'a':
            return is_valid(aig)
        else:
            assert quantifiers[0][0] == 'e'
            return is_satisfiable(aig)
    elif len(quantifiers) == 2:  # 2QBF
        true_return_code = CadetCodes.QBF_IS_TRUE.value
        if quantifiers[-1][0] == 'a':
            e = ~aiger.BoolExpr(aig)
            aig = e.aig
            true_return_code = CadetCodes.QBF_IS_FALSE.value
        return _call_cadet(aig, quantifiers[1][1]) == true_return_code
    else:
        raise NotImplementedError('Cannot handle general QBF at the moment')
Example #32
def bounding_box(domain, oracle, eps=1e-5):
    """Compute Bounding box. TODO: clean up"""
    # TODO: remove r input and assume unit rec.
    edges = [mdts.binsearch(r2, oracle, eps=eps) for r2 in box_edges(domain)]

    rtypes = list(fn.pluck(0, edges))  # materialize: rtypes is iterated twice below
    if all(t == mdts.SearchResultType.TRIVIALLY_FALSE for t in rtypes):
        return domain
    elif all(t == mdts.SearchResultType.TRIVIALLY_TRUE for t in rtypes):
        return mdtr.to_rec(domain.dim*[[0, 0]])

    itvls = [r for t, r in edges if t == mdts.SearchResultType.NON_TRIVIAL]

    def box_to_include(r):
        return domain.backward_cone(r.top) & domain.forward_cone(r.bot)

    bbox, *recs = fn.lmap(box_to_include, itvls)
    for r in recs:
        bbox = bbox.sup(r)

    return bbox
Example #33
    def _find_address(self, address, family):
        localhost = None

        for interface in interfaces():
            try:
                if_data = ifaddresses(interface)[family]
                addresses = pluck("addr", if_data)
            except KeyError:
                pass
            else:
                if addresses:
                    for a in addresses:
                        if a not in ("::1", "127.0.0.1"):
                            return a.rsplit("%", 1)[0]
                        if a:
                            localhost = a

        if localhost:
            return localhost

        raise Exception("Failed to guess host for interface '%s'" % address)
Example #34
    def __init__(self,
                 dataset,
                 model_fitting_func,
                 sampling_strategy,
                 pool_fractions,
                 sample_fractions,
                 test_frac,
                 name=None,
                 sort_by=None,
                 sort_by_reverse=False,
                 seed=1):
        """Instantiate experiment
        Args:
            dataset: TextDataSet for the experiment
            model_fitting_func: Function from data to the fitted model's predict_proba function
            sampling_strategy: Function from (n, data) -> the n examples selected for training
            pool_fractions: List of floats indicating pool sizes
            sample_fractions: List of floats indicating how many samples are taken from each pool
            test_frac: (float) Fraction of each pool to reserve for testing
            sort_by: (str) Key to sort by when creating pools (e.g. publication date)
            sort_by_reverse: (bool) If True, sort by descending order when creating pools
            seed: (int) Seed for random state
        """
        self.dataset = dataset
        self.name = name or self.dataset.name

        if sort_by is not None:
            self.dataset.data = sorted(self.dataset.data,
                                       key=pluck(sort_by),
                                       reverse=sort_by_reverse)
        self.train_pools, self.test_pools = create_pools(
            self.dataset.data, pool_fractions, test_frac, seed)
        self.sample_from_pool = sampling_strategy
        self.fit_model = model_fitting_func

        self.sample_fractions = sample_fractions

        # initialize the training data with the first pool
        self.training_data = self.train_pools[0]
        self.model_results = pd.DataFrame()
Example #35
    def get_data(self):
        all_transactions = filter(
            lambda t: t["type"] in ("request-changes", "accept"),
            cat(pluck("transactions", self.raw_data)),
        )
        accept_transactions, reject_transactions = split(
            lambda t: t["type"] == "accept", all_transactions)
        most_accepting_author, most_accepting_count = Counter(
            count_by(itemgetter("authorPHID"),
                     accept_transactions)).most_common(1)[0]
        most_rejecting_author, most_rejecting_count = Counter(
            count_by(itemgetter("authorPHID"),
                     reject_transactions)).most_common(1)[0]

        return (
            {
                "author": self.users_mapping[most_accepting_author],
                "count": most_accepting_count,
            },
            {
                "author": self.users_mapping[most_rejecting_author],
                "count": most_rejecting_count,
            },
        )
Example #36
    def __init__(self, train, test, embedding_size=128, lstm_cell_size=64, embedding_dropout_prob=.5,
                 dropout_prob=.5, kernel_l2=0.01):

        self.x_train, self.x_test = pluck('content', train), pluck('content', test)
        self.y_train, self.y_test = pluck('label', train), pluck('label', test)

        self.train_ids = pluck('id', train)
        self.test_ids = pluck('id', test)

        self.transform = DocToWordIndices().fit(self.x_train)
        self.x_train = self.transform.transform(self.x_train)
        self.x_test = self.transform.transform(self.x_test)

        vocab_size = np.max(self.x_train) + 1  # vocab and classes are 0 indexed
        n_labels = int(np.max(self.y_train)) + 1
        self.y_train, self.y_test = to_categorical(self.y_train), to_categorical(self.y_test)

        self.model = Sequential()
        self.model.add(Embedding(vocab_size, embedding_size, input_length=self.x_train.shape[1]))
        self.model.add(Dropout(embedding_dropout_prob))
        self.model.add(Bidirectional(LSTM(lstm_cell_size, kernel_regularizer=L1L2(l1=0.0, l2=kernel_l2))))
        self.model.add(Dropout(dropout_prob))
        self.model.add(Dense(n_labels, activation='softmax'))
        self.model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
Example #37
    def sample(self, Y, K=2.5e6, h=2e3, α_hat=None, Z_hat=None):
        K, h = int(K), int(h)
        result = defaultdict(list)

        try:
            # compute MLE and center it at origin
            if α_hat is None or Z_hat is None:
                α_hat, Z_hat = self.mle(Y)
            Z_hat = Z_hat - np.mean(Z_hat, axis=1).reshape(-1, 1)

            result['α_hat'] = α_hat
            result['Z_hat'] = Z_hat

            Z_k = Z_hat.copy()
            α_k = α_hat.copy()

            def update_Z(α_k, Z_k, detail=False):
                Z_dag = self.qZ(Z_k)

                # compute acceptance probability
                n1 = self.loglikelihood(Y, α_k, Z_dag)
                d1 = self.loglikelihood(Y, α_k, Z_k)
                n2 = self.logpz(Z_dag)
                d2 = self.logpz(Z_k)
                p = np.exp(n1 + n2 - d1 - d2)
                a = min(1, p)

                # accept/reject step
                u = scipy.stats.uniform.rvs()
                accepted = u < a
                if accepted:
                    Z_k = Z_dag
                else:
                    pass

                if detail:
                    _detail = {
                        'Z_dag': Z_dag, 'n1': n1, 'd1': d1, 'n2': n2,
                        'd2': d2, 'p': p, 'a': a, 'u': u, 'accepted': accepted,
                    }
                else:
                    _detail = None

                return Z_k, _detail

            def update_α(α_k, Z_k, detail=False):
                α_dag = self.qα(α_k)

                # compute acceptance probability
                n1 = self.loglikelihood(Y, α_dag, Z_k)
                d1 = self.loglikelihood(Y, α_k, Z_k)
                n2 = self.logpa(α_dag)
                d2 = self.logpa(α_k)
                p = np.exp(n1 + n2 - d1 - d2)
                a = min(1, p)

                # accept/reject step
                u = scipy.stats.uniform.rvs()
                accepted = u < a
                if accepted:
                    α_k = α_dag
                else:
                    pass

                if detail:
                    _detail = {
                        'α_dag': α_dag, 'n1': n1, 'd1': d1, 'n2': n2,
                        'd2': d2, 'p': p, 'a': a, 'u': u, 'accepted': accepted,
                    }
                else:
                    _detail = None

                return α_k, _detail

            logger.info('Sampling...')
            for k in tqdm(range(K)):
                # record results every h iterations
                record_iter = k % h == 0

                Z_k, Z_detail = update_Z(α_k, Z_k, detail=record_iter)
                α_k, α_detail = update_α(α_k, Z_k, detail=record_iter)

                if record_iter:
                    Z_k_star = self._compute_procrustean_transformation(
                        Z_k, Z_hat)
                    result['α'].append(α_k)
                    result['Z'].append(Z_k_star)
                    result['α_detail'].append(α_detail)
                    result['Z_detail'].append(Z_detail)

            acceptance_rates = {
                k: np.mean(list(funcy.pluck(
                    'accepted', result['{}_detail'.format(k)])))
                for k in ['α', 'Z']
            }
            logger.info('Sampling...DONE')
            logger.info('Acceptance rates: {!r}'.format(acceptance_rates))

            return result
        except KeyboardInterrupt:
            logger.info('Sampling...ABORTED')
            # acceptance_rates is only computed after a full run; recompute it
            # here from whatever was recorded before the interrupt
            acceptance_rates = {
                k: np.mean(list(funcy.pluck(
                    'accepted', result['{}_detail'.format(k)])))
                for k in ['α', 'Z']
            }
            logger.info('Acceptance rates: {!r}'.format(acceptance_rates))
            return result
Example #38
def zip_pluck(d, keys, enumerate=False):
    args = [pluck(k, d) for k in keys]
    if enumerate:
        args = [count(), *args]
    return zip(*args)
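A minimal usage sketch for zip_pluck, assuming pluck is the (key, mappings) helper used elsewhere in these examples and count is itertools.count; the rows data is hypothetical:

rows = [{'a': 1, 'b': 10}, {'a': 2, 'b': 20}]
list(zip_pluck(rows, ['a', 'b']))             # [(1, 10), (2, 20)]
list(zip_pluck(rows, ['a'], enumerate=True))  # [(0, 1), (1, 2)]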
Example #39
def dense_data_and_labels(data_and_labels):
    X = hstack(pluck(0, data_and_labels)).T
    dense_labels = hstack([label*ones(data.shape[1]) for data, label in
                           data_and_labels])
    return X, dense_labels
Example #40
	
	if type(x['date']) == type([]):
		x['date'] = x['date'][0]
	
	return x


def add_boarddate_and_indicator(x):
	try:
		x['boarddate'] = x['board_id'] + '_' + x['date'].split(' ')[0]
		x['hv']        = x['boarddate'] in boarddates
	except:
		x['boarddate'] = None
		x['hv']        = None
	
	return x


hv = sc.textFile('data/high_volume_days.csv').map(lambda x: x.split(','))
hv = hv.map(lambda x: {'board' : x[0], 'date' : x[1], 'count' : x[2], 'boarddate' : x[0] + '_' + x[1]})
boarddates = _.pluck('boarddate', hv.collect())

inpath = 'data/post_dump_spark.json.txt'
raw    = sc.textFile(inpath).map(lambda x: json.loads(x)).map(clean).cache()

rdd = raw.filter(lambda x: x.get('date', None))\
	.filter(lambda x: x.get('msg', None))\
	.map(add_boarddate_and_indicator)\
	.cache()

Example #41
    def __init__(self, train, test, **model_options):
        """Create a conv net with keras
        Args:
            train: List of train examples
            test: List of test (validation) examples
        """
        embedding_size = model_options.get('embedding_size', 128)
        filter_sizes = model_options.get('filter_sizes', [2, 3, 4])
        n_filters = model_options.get('n_filters', 25)
        pool_size = model_options.get('pool_size', 4)
        hidden_dims = model_options.get('hidden_dims', 128)
        dropout_prob = model_options.get('dropout_prob', .5)
        conv_l2 = model_options.get('conv_l2', .05)
        fc_l2 = model_options.get('fc_l2', .05)
        balance_classes = model_options.get('balance_classes', False)

        self.train_labels = pluck('label', train)
        self.x_train, self.x_test = pluck('content', train), pluck('content', test)
        self.y_train, self.y_test = pluck('label', train), pluck('label', test)

        self.train_ids = pluck('id', train)
        self.test_ids = pluck('id', test)

        self.transform = DocToWordIndices().fit(self.x_train)
        self.x_train = self.transform.transform(self.x_train)
        self.x_test = self.transform.transform(self.x_test)

        self.vocab_size = np.max(self.x_train) + 1  # vocab and classes are 0 indexed
        self.n_labels = int(np.max(self.y_train)) + 1
        self.y_train, self.y_test = to_categorical(self.y_train), to_categorical(self.y_test)

        self.sequence_length = self.x_train.shape[1]
        self.n_labels = self.y_train.shape[1]
        self.balance_classes = balance_classes

        conv_input = Input(shape=(self.sequence_length, embedding_size))
        convs = []
        for filter_size in filter_sizes:
            conv = Conv1D(activation="relu", padding="valid",
                          strides=1, filters=n_filters, kernel_size=filter_size,
                          kernel_regularizer=L1L2(l1=0.0, l2=conv_l2))(conv_input)
            pool = MaxPooling1D(pool_size=pool_size)(conv)
            flatten = Flatten()(pool)
            convs.append(flatten)

        if len(filter_sizes) > 1:
            conv_output = concatenate(convs)
        else:
            conv_output = convs[0]

        conv_layer = Model(inputs=conv_input, outputs=conv_output)

        self.model = Sequential()
        self.model.add(Embedding(self.vocab_size, embedding_size,
                                 input_length=self.sequence_length, weights=None))

        self.model.add(conv_layer)
        self.model.add(Dense(hidden_dims, kernel_regularizer=L1L2(l1=0.0, l2=fc_l2)))
        self.model.add(Dropout(dropout_prob))
        self.model.add(Activation('relu'))
        self.model.add(Dense(self.n_labels, activation='softmax'))

        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
Example #42
def same_sources(s1, s2):
    ''' Do two states have the same sources (same names and same arguments)?  '''
    return (
        # materialize pluck's results so values are compared, not iterator objects
        list(fn.pluck('name', s1['sources'])) == list(fn.pluck('name', s2['sources'])) and
        list(fn.pluck('args', s1['sources'])) == list(fn.pluck('args', s2['sources']))
    )
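A hedged illustration of the comparison with hypothetical state dicts, assuming each state carries a 'sources' list of mappings with 'name' and 'args' keys as the function implies:

s1 = {'sources': [{'name': 'db', 'args': ['ro']}, {'name': 'api', 'args': []}]}
s2 = {'sources': [{'name': 'db', 'args': ['ro']}, {'name': 'api', 'args': []}]}
s3 = {'sources': [{'name': 'db', 'args': ['rw']}, {'name': 'api', 'args': []}]}

same_sources(s1, s2)  # True: same names and same arguments
same_sources(s1, s3)  # False: the 'db' source has different args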