Example #1
0
def best(players, name, threshold=0.1):
    """Return the entry of ``players`` that is most similar to ``name``.

    Similarity is the Jaccard index between the sets of upper-cased
    character n-grams (bigrams by default) of the two strings, so the
    comparison is case-insensitive.

    Args:
        players: Iterable of candidate strings.
        name: The string to match against the candidates.
        threshold: Minimal similarity for a candidate to be accepted.

    Returns:
        The most similar candidate, or ``name`` itself when the best
        similarity is below ``threshold`` (a "Mismatch" message is printed
        in that case).
    """
    def str2grams(s, n=2, blank="#", noWordOrder=True):
        # Pad the string (and, with noWordOrder, every space-separated
        # word) with n-1 blanks so that word boundaries produce n-grams too.
        blanks = blank * (n - 1)
        s = s.upper()
        s = blanks + (blanks.join(s.split(' ')) if noWordOrder else s) + blanks

        if len(s) < n:
            return {s}
        # range (not xrange) keeps this runnable on Python 3.
        return {s[i:i + n] for i in range(len(s) - n + 1)}

    def jaccard(x, y):
        # Explicit float division: on Python 2 a plain `/` between two ints
        # truncates, which would collapse every similarity to 0 or 1.
        return float(len(x & y)) / len(x | y)

    # The n-grams of `name` do not change between candidates.
    nameGrams = str2grams(name)

    bestPlayer, bestSim = None, 0.0
    for player in players:
        sim = jaccard(str2grams(player), nameGrams)

        if sim > bestSim:
            bestPlayer, bestSim = player, sim

        # A perfect match cannot be improved upon.
        if sim == 1.0:
            break

    if bestSim < threshold:
        print2(colored("Mismatch: " + str((bestPlayer, name)), "yellow"))
        return name

    return bestPlayer
Example #2
0
def cache_function_last_result(
        f,
        msg1="[Evaluation] Reusing previous result of <name>",
        msg2="[Evaluation] Recalculating and memorizing result of <name>"):
    """Wrap ``f`` so that its most recent result is reused for identical args.

    Only the last call is remembered. Arguments are compared by identity
    (``is``), not by equality, and only positional arguments are supported.
    The result is stored in the module-level ``_last_result_cache`` under
    ``f.__name__``.

    NOTE: because the key is just ``f.__name__``, two wrapped functions that
    share a name also share one cache slot.

    Args:
        f: The function to wrap.
        msg1: Message printed on a cache hit (``<name>`` is replaced by the
            function name); ``None`` disables it.
        msg2: Message printed when the result is recalculated; ``None``
            disables it.

    Returns:
        The wrapping function, carrying the same ``__name__`` as ``f``.
    """
    key = f.__name__
    print2("[Evaluation] Results of %s will be cached." % key)

    def same_objects_on_lists(l1, l2):
        return len(l1) == len(l2) and all(
            e1 is e2 for e1, e2 in zip(l1, l2))

    def caching_function(*args):
        cached = _last_result_cache.get(key)
        # A missing entry must be told apart from "cached for no arguments":
        # an empty placeholder argument list would match an empty *args and
        # f would never be evaluated for zero-argument calls.
        if cached is not None and same_objects_on_lists(cached[0], args):
            if msg1 is not None:
                print2(msg1.replace("<name>", key))
            return cached[1]

        if msg2 is not None:
            print2(msg2.replace("<name>", key))
        result = f(*args)
        _last_result_cache[key] = (args, result)
        return result

    caching_function.__name__ = key
    return caching_function
Example #3
0
    def caching_function(*args):
        """Call ``f(*args)``, reusing the last result when args are the same objects.

        The previous ``(args, result)`` pair is kept in ``_last_result_cache``
        under ``key``; arguments are compared with ``same_objects_on_lists``.
        """
        cached = _last_result_cache.get(key)
        # A missing entry must be told apart from "cached for no arguments":
        # an empty placeholder argument list would match an empty *args and
        # f would never be evaluated for zero-argument calls.
        if cached is not None and same_objects_on_lists(cached[0], args):
            if msg1 is not None:
                print2(msg1.replace("<name>", key))
            return cached[1]

        if msg2 is not None:
            print2(msg2.replace("<name>", key))
        result = f(*args)
        _last_result_cache[key] = (args, result)
        return result
Example #4
0
 def optimal_h_for_risk(self, ys):
     """Return the decision h used for risk evaluation on the samples ``ys``.

     When ``self.RISK_OPTIMAL_H_NUMERICALLY`` is false the result of
     ``self.optimal_h_bayes_estimator(ys)`` is used; otherwise h comes from
     ``optimal_h_numerically``, which is given the negated loss as utility.
     The first 200 characters of ``str(h)`` are logged before returning.
     """
     if not self.RISK_OPTIMAL_H_NUMERICALLY:
         h = self.optimal_h_bayes_estimator(ys)
     else:
         print2(
             "[Evaluation] Numerically optimizing h for qRisk. May take time..."
         )
         # The optimizer is handed a utility, hence the sign flip on the loss.
         negated_loss = lambda h, y: -self.loss(h, y)
         # No explicit start point is passed; starting from the Bayes
         # estimator is a possible alternative that is not enabled.
         optimizer_options = dict(
             data_mask=self.y_mask,
             start=None,
             max_niter=self.EVAL_MAX_NITER,
             tol=self.EVAL_SGD_PREC,
             tol_goal=-1,
             debug=True,
             lr=self.EVAL_LR)
         h = optimal_h_numerically(ys, negated_loss, **optimizer_options)

     print2("[Evaluation:optimal_h_for_risk] h=%s" % str(h)[:200])
     return h
Example #5
0
 def optimal_h_for_gain(self, ys):
     """Return the decision h used for gain evaluation on the samples ``ys``.

     When ``self.GAIN_OPTIMAL_H_NUMERICALLY`` is false the result of
     ``self.optimal_h_bayes_estimator(ys)`` is used; otherwise h comes from
     ``optimal_h_numerically`` applied to ``self.utility``.
     The first 200 characters of ``str(h)`` are logged before returning.
     """
     if not self.GAIN_OPTIMAL_H_NUMERICALLY:
         h = self.optimal_h_bayes_estimator(ys)
     else:
         print2(
             "[Evaluation] Numerically optimizing h for qGain. May take time..."
         )
         # No explicit start point is passed; starting from the Bayes
         # estimator is a possible alternative that is not enabled.
         # (optimal_h_numerically_scipy with optimizer="COBYLA" is another
         # alternative that is not enabled here.)
         optimizer_options = dict(
             data_mask=self.y_mask,
             start=None,
             max_niter=self.EVAL_MAX_NITER,
             tol=self.EVAL_SGD_PREC,
             tol_goal=-1,
             debug=True,
             lr=self.EVAL_LR)
         h = optimal_h_numerically(ys, self.utility, **optimizer_options)

     print2("[Evaluation:optimal_h_for_gain] h=%s" % str(h)[:200])
     return h
Example #6
0
        Flattens the first two dimensions
        (samples of y for different thetas) from sample_predictive_y0.
    """
    return flatten_first_two_dims(
        sample_predictive_y0(qw, qz, nsamples_theta, nsamples_y))


# # Constructing losses and utilities

# In[19]:

# mask used to select points to the utility-dependent term: use only training data
utility_term_mask = training_mask

# Create the loss selected by LOSS. create() returns a pair that is unpacked
# into the loss function and the function giving its analytical
# (Bayes estimator) optimal h.
# NOTE: **globals() forwards every module-level name defined so far to the
# factory, so adding, removing or renaming globals above this line changes
# what the factory receives.
loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS)
print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" %
       (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__))

# Create the utility selected by UTIL, passing in the loss created above
# (the factory again receives the current module globals).
u = losses.UtilityFactory(**globals()).create(UTIL, loss)
print2("> utility: %s" % u.__name__)

# In[20]:

# Factory object for utility aggregators; presumably consumed further down
# in the notebook (not visible here) -- TODO confirm.
utility_term_factory = utility_term_estimation.UtilityAggregatorFactory()

# # Evaluation

# In[21]:

train_measures = evaluation.Measures(
    x,
    loss,
Example #7
0
def optimal_h_numerically_scipy(ys,
                                u,
                                weights=None,
                                utility_aggregator=gain_weighted,
                                data_mask=None,
                                max_niter=10000,
                                tol=1e-4,
                                tol_goal=-1,
                                lr=0.01,
                                start=None,
                                optimizer="COBYLA",
                                verbose=False,
                                debug=False,
                                sparse_verbose=True):
    """ Using numerical optimization (SciPy) finds optimal h for utility-dependent term expressed by utility_aggregator.

        Compatible with optimal_h_numerically.

        Args:
            ys: Samples tensor. Dimension 0 is summed over (with weights) to
                get the default start point; h has the shape ys.shape[1:].
            u:  Utility function, called as u(h, ys).
            weights: Weights of the same shape as ys (default: all ones).
                They are normalized over dimension 0. The tensor passed in
                by the caller is NOT modified.
            utility_aggregator: A function taking exactly 3 params:
                utilities, weights and data_mask. Its negated value is
                minimized.
            data_mask: A mask passed to utility_aggregator (default: all ones).
            max_niter, tol, tol_goal, lr, debug: Forwarded to _scipy_minimize.
            start: Start point for h. When None or invalid (according to
                is_valid) the weighted sum of ys over dimension 0 is used.
            optimizer: Passed to _scipy_minimize as `method`.
            verbose, sparse_verbose: Control printing of the summary line.

        Returns:
            A tensor with the optimizer's solution, of shape ys.shape[1:]
            and dtype ys.dtype.
    """
    printv = lambda txt: (print2("[optimal_h_numerically_scipy]" + txt)
                          if verbose else None)
    if sparse_verbose and not verbose:
        printv = lambda txt: sparse_print(
            "optimal_h_numerically_scipy", "[optimal_h_numerically_scipy]" +
            txt, 100)
    printd = lambda txt: (print2("[optimal_h_numerically_scipy]" + txt)
                          if debug else None)

    assert len( signature(utility_aggregator).parameters )==3, \
            "[optimal_h_numerically_scipy] Your utility_aggregator=%s takes wrong number of params! does not support weights?" % utility_aggregator

    env = torch if "cpu" in str(ys.device) else torch.cuda
    if data_mask is None:
        data_mask = (torch.ones(ys.shape[1:]) if len(ys.shape) > 1 else
                     torch.tensor(1)).type(env.ByteTensor)

    if weights is None:
        weights = torch.ones_like(ys)
    # Out-of-place division on purpose: `weights /= ...` would overwrite the
    # tensor owned by the caller.
    weights = weights / weights.sum(0)  #enforce normalization
    weights = torch.tensor(tonumpy(weights), requires_grad=False)

    y = torch.tensor(tonumpy(ys), requires_grad=False)

    if start is None:
        h = (y * weights).sum(0)  #start from E(y)
    elif not is_valid(torch.tensor(start)):
        printd("start point is invalid. ignoring!")
        h = (y * weights).sum(0)  #start from E(y)
    else:
        h = start

    # Separate name for the timer so the `start` argument is not clobbered.
    started = time.time()
    x0 = tonumpy(h).flatten()
    # SciPy works on flat numpy vectors: reshape back to the shape of h and
    # negate the aggregated utility to turn maximization into minimization.
    fun = lambda h: -utility_aggregator(
        u(torch.tensor(h.reshape(y.shape[1:]), dtype=y.dtype), y), weights,
        data_mask).item()
    result = _scipy_minimize(fun,
                             x0,
                             method=optimizer,
                             max_niter=max_niter,
                             tol=tol,
                             tol_goal=tol_goal,
                             lr=lr,
                             debug=debug)
    # NOTE: the summary is emitted only when verbose is set, so the
    # sparse_print variant of printv defined above is never used here.
    if verbose:
        printv(
            "[%.4f][optimizer=%s ys=%s max_niter=%i tol=%s tol_goal=%s lr=%s u=%s->%s] %s"
            % (time.time() - started, optimizer, tuple(ys.shape), max_niter,
               tol, tol_goal, lr, u.__name__, utility_aggregator.__name__,
               str(result).replace("\n", ";")[:200]))
    return torch.tensor(result["x"].reshape(y.shape[1:]), dtype=ys.dtype)
Example #8
0
def optimal_h_numerically_ty(ys,
                             u,
                             utility_aggregator=gain,
                             data_mask=None,
                             max_niter=10000,
                             tol=1e-4,
                             tol_goal=-1,
                             lr=0.01,
                             start=None,
                             optimizer=torch.optim.Adam,
                             verbose=False,
                             debug=False,
                             sparse_verbose=True):
    """ Using numerical optimization finds optimal h for utility-dependent term expressed by utility_aggregator.

        Args:
            ys: Samples matrix. The dimensionality should match what utility_aggregator takes:
                #y-samples x #theta-samples x data-size.
            u:  Utility function u(h, ys) -> utilities matrix (the same shape as ys).
            utility_aggregator: A function that calculate utility-dependent term.
                                Should take exactly 2 params: utilites and data_mask.
            data_mask: A mask passed to utility_aggregator.
            max_niter: Maximal number of optimizer steps.
            tol: Stop when the largest absolute change of h in one step is <= tol.
            tol_goal: Stop when the absolute change of the objective in one step is <= tol_goal.
            lr: Learning rate passed to the optimizer.
            start: Start point for h. When None or invalid (according to
                   is_valid) the mean of ys over its first two dimensions is used.
            optimizer: Optimizer constructor, called as optimizer([h], lr=lr).
            verbose, sparse_verbose, debug: Control progress printing.

        Returns:
            The optimized h (a tensor with requires_grad=True).
    """
    printv = lambda txt: (print2("[optimal_h_numerically_ty] " + txt)
                          if verbose else None)
    if sparse_verbose and not verbose:
        printv = lambda txt: sparse_print(
            "optimal_h_numerically_ty", "[optimal_h_numerically_ty]" + txt, 100
        )
    printd = lambda txt: (print2("[optimal_h_numerically_ty] " + txt)
                          if debug else None)

    assert len( signature(utility_aggregator).parameters )==2, \
            "[optimal_h_numerically_ty] Your utility_aggregator=%s takes wrong number of params! perhaps requires weights?" % utility_aggregator

    env = torch if "cpu" in str(ys.device) else torch.cuda
    if data_mask is None:  #No data mask provided. Using all data points
        data_mask = (torch.ones(ys.shape[2:]) if len(ys.shape) > 2 else
                     torch.tensor(1)).type(env.ByteTensor)

    y = torch.tensor(tonumpy(ys), requires_grad=False)

    if start is None:
        h = y.mean(0).mean(0).clone().detach().requires_grad_(
            True)  #start from E(y)
    elif not is_valid(torch.tensor(start)):
        printd("start point is invalid. ignoring!")
        h = y.mean(0).mean(0).clone().detach().requires_grad_(
            True)  #start from E(y)
    else:
        h = torch.tensor(tonumpy(start), requires_grad=True)

    def finish_message(reason, i, goal, step):
        # One place for the three (otherwise identical) termination reports.
        return (
            "[%.2f][ys=%s max_niter=%i tol=%s tol_goal=%s lr=%s u=%s->%s] "
            "Finished at %i. iter (" + reason + "): "
            "obj=%.4f max-err=%.8f mean-err=%.8f"
        ) % (time.time() - started, tuple(ys.shape), max_niter, tol,
             tol_goal, lr, u.__name__, utility_aggregator.__name__, i + 1,
             goal.item(), step.max(), step.mean())

    # Report progress roughly ten times per run. Clamped to 1 because
    # max_niter // 10 is 0 for max_niter < 10 and `i % 0` would raise
    # ZeroDivisionError.
    report_every = max(1, max_niter // 10)

    opt = optimizer([h], lr=lr)
    prev_h, prev_goal = torch.tensor(tonumpy(h)), float("inf")
    started = time.time()
    for i in range(max_niter):

        # Minimize the negated aggregated utility.
        goal = -utility_aggregator(u(h, y), data_mask)
        opt.zero_grad()
        goal.backward(retain_graph=False)
        opt.step()

        # Absolute change of h caused by this step.
        step = (prev_h - h).abs()

        #check for convergence:
        if step.max() <= tol:
            printv(finish_message("tolerance reached", i, goal, step))
            break
        if abs(prev_goal - goal.item()) <= tol_goal:
            printv(
                finish_message("objective tolerance reached", i, goal, step))
            break
        if i >= max_niter - 1:
            printv(finish_message("max number reached", i, goal, step))
            break

        if i % report_every == 0:
            printd("[%.2f] iter %i: objective=%.4f err=%.6f" %
                   (time.time() - started, i, goal.item(), step.max()))

        prev_h = torch.tensor(tonumpy(h))
        prev_goal = goal.item()

    return h
Example #9
0
        Flattens the first two dimensions
        (samples of y for different thetas) from sample_predictive_y0.
    """
    ys = sample_predictive_y0(data, q_theta, nsamples_theta, nsamples_y)
    return flatten_first_two_dims(ys)


# # Constructing losses and utilities

# In[181]:

# include all (training) data points in utility-dependent term
# (an all-ones tensor of size schools_dat["J"], cast to env.ByteTensor)
utility_term_mask = torch.ones(schools_dat["J"]).type(env.ByteTensor)

# Create the loss selected by LOSS. create() returns a pair that is unpacked
# into the loss function and the function giving its analytical
# (Bayes estimator) optimal h.
# NOTE: **globals() forwards every module-level name defined so far to the
# factory, so adding, removing or renaming globals above this line changes
# what the factory receives.
loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS)
print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" %
       (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__))

# Create the utility selected by UTIL, passing in the loss created above
# (the factory again receives the current module globals).
u = losses.UtilityFactory(**globals()).create(UTIL, loss)
print2("> utility: %s" % u.__name__)

# In[182]:

# Factory object for utility aggregators; presumably consumed further down
# in the notebook (not visible here) -- TODO confirm.
utility_term_factory = utility_term_estimation.UtilityAggregatorFactory()

# # Evaluation

# In[183]:

measures = evaluation.Measures(
    torch.tensor(schools_dat["y"], dtype=torch.float32),
    loss,
        Flattens the first two dimensions
        (samples of y for different thetas) from sample_predictive_y0.
    """
    return flatten_first_two_dims(sample_predictive_y0(qw, qz, nsamples_theta, nsamples_y))


# # Constructing losses and utilities

# In[19]:


# mask used to select points to the utility-dependent term: use only training data
utility_term_mask = training_mask 
 
# Create the loss selected by LOSS. create() returns a pair that is unpacked
# into the loss function and the function giving its analytical
# (Bayes estimator) optimal h.
# NOTE: **globals() forwards every module-level name defined so far to the
# factory, so adding, removing or renaming globals above this line changes
# what the factory receives.
loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS)
print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" % 
        (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__))
        
# Create the utility selected by UTIL, passing in the loss created above
# (the factory again receives the current module globals).
u = losses.UtilityFactory(**globals()).create(UTIL, loss)
print2("> utility: %s" % u.__name__)            


# In[20]:


# Factory object for utility aggregators; presumably consumed further down
# in the notebook (not visible here) -- TODO confirm.
utility_term_factory = utility_term_estimation.UtilityAggregatorFactory()


# # Evaluation

# In[21]:
Example #11
0
def crawl(sport, year, division, org, game, url, neutral=False):
    """Download the tables of one game to CSV files, then build combined parsed files.

    The work is split into two phases, each guarded by an empty flag file in
    the game's directory ``<data>/<org>/<gamename>/``:

    * ``.done``   -- fetch the game page, follow its box-score and
      play-by-play header links and dump the Score, Info, Box Score and
      Play by Play tables to ``<title>.csv`` files.
    * ``.parsed`` -- reload the Score CSV and write
      ``Box Score - All (Parsed).csv`` and ``Play by Play - All (Parsed).csv``.

    Any exception raised inside a phase is printed and swallowed; the phase's
    flag is not written in that case, so the phase runs again on the next call.

    Args:
        sport, year, division: Substituted into the module-level ``data``
            string and included in the log line.
        org: Name of the sub-directory of ``data`` holding the game directory.
        game: Game identifier; ``/`` is replaced by ``.`` to form the
            directory name.
        url: Game URL, joined with the module-level ``domain``.
        neutral: When true, a ``.neutral`` flag file is created for the game.
    """
    global data
    # NOTE: this overwrites the module-level `data` with its formatted value,
    # so later calls format the already-formatted string (presumably `data`
    # starts as a path template with three placeholders -- TODO confirm).
    data = data.format(sport, year, division)

    # '/' cannot be part of a directory name.
    gamename = game.replace('/', '.')

    def readFlag(flag):
        # Side effect: creates the game directory when it does not exist yet
        # (os.mkdir requires the parent <data>/<org> to exist already).
        if not os.path.exists(os.path.join(data, org, gamename)):
            os.mkdir(os.path.join(data, org, gamename))

        return os.path.exists(os.path.join(data, org, gamename, flag))

    def setFlag(flag):
        # A flag is just an empty file.
        with open(os.path.join(data, org, gamename, flag), 'w') as f:
            pass

    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    # Template for all CSV files of this game; "{}" is the table title.
    filename = os.path.join(data, org, gamename, "{}.csv")

    # ---- Phase 1: download the raw tables ----
    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))

            gs = parseURL(gamelink)

            # Pause between requests.
            sleep(2)

            gamescore = None
            gameinfo = None

            periods = []
            teams = []
            # Index into `periods` of the next per-period box score page.
            nextPeriod = 0
            for table in gs.select("div.header_menu a"):
                # Only box-score and play-by-play links are followed.
                if (
                        table["href"] == "#" or
                        not (
                            table["href"].startswith("/game/box_score") or
                            table["href"].startswith("/game/play_by_play")
                        )
                    ):
                    continue

                tablelink = urljoin(domain, table["href"])
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))

                ts = parseURL(tablelink)

                # Score and Info tables are taken from the first followed
                # page only and dumped once.
                if gamescore is None:
                    gamescore = parseTable(ts.select("table:nth-of-type(1)")[0])
                    dumpTable(
                        gamescore,
                        filename.format("Score")
                    )

                if gameinfo is None:
                    gameinfo = transposeTable(
                        parseTable(ts.select("table:nth-of-type(3)")[0]) +
                        parseTable(ts.select("table:nth-of-type(4)")[0])
                    )
                    dumpTable(
                        gameinfo,
                        filename.format("Info")
                    )

                # Score table layout as used here: row 0 holds the period
                # names (after the first cell), rows 1 and 2 start with the
                # team names.
                teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                periods = [v.text.strip() for v in gamescore[0][1:]]

                if table["href"].startswith("/game/box_score"):
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        # Any other box-score link is taken to be the next
                        # period, in order of appearance.
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1

                    # Tables 5 and 6 are dumped as the two teams' box scores.
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(5)")[0], header=1),
                        sfilename.format(teams[0])
                    )
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(6)")[0], header=1),
                        sfilename.format(teams[1])
                    )
                elif table["href"].startswith("/game/play_by_play"):
                    sfilename = filename.format("Play by Play - {}")

                    # The last Score column is skipped (presumably the total
                    # column -- TODO confirm); period i is read from table
                    # number 6 + 2*i.
                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(ts.select("table:nth-of-type({})".format(6 + 2 * i))[0], header=0),
                            sfilename.format(period)
                        )

                sleep(2)

            # True only when both are still None, i.e. no link was followed.
            if gamescore == gameinfo == None:
                raise Exception("Not a game.")

            setFlag(".done")

            sleep(2)
        except Exception as e:
            # Best effort: report and continue; ".done" stays unset.
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    # ---- Phase 2: build the combined, parsed files ----
    # NOTE: this phase is attempted even when phase 1 failed above.
    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")

            gamescore = loadTable(filename.format("Score"))

            sfilename = filename.format("Box Score - {}")
            teams = [gamescore[1][0], gamescore[2][0]]
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(
                        sfilename.format(team),
                        filename.format("Info"),
                        team,
                        "All"
                    )

                    # The first row is written for the first team only
                    # (presumably a header row -- TODO confirm).
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(
                        sfilename.format(period),
                        period,
                        filename.format("Info")
                    )

                    # The first row is written for the first period only
                    # (presumably a header row -- TODO confirm).
                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            # Best effort: report and continue; ".parsed" stays unset.
            print2(colored("Error: ", "red"), e)
        finally:
            print2()
Example #12
0
    def __init__(self,
                 y,
                 loss,
                 u,
                 sample_predictive_y,
                 optimal_h_bayes_estimator=None,
                 y_mask=None,
                 GAIN_OPTIMAL_H_NUMERICALLY=True,
                 RISK_OPTIMAL_H_NUMERICALLY=False,
                 EVAL_NSAMPLES_UTILITY_TERM_THETA=1000,
                 EVAL_NSAMPLES_UTILITY_TERM_Y=1,
                 EVAL_MAX_NITER=10000,
                 EVAL_SGD_PREC=0.0001,
                 EVAL_LR=0.01,
                 EVAL_RESAMPLE_EVERY_TIME=False):
        """
            Store the evaluation setup and optionally wrap the expensive methods in a last-result cache.

            Args:
              y       Evaluation data.
              y_mask  A mask selecting data points for evaluation (default: all).
              loss    A function y x h -> loss used to calculate risks.
              u       A function y x h -> utility used to calculate gains.
              sample_predictive_y  A function that for each data point from y, generates samples from predictive posterior.
              optimal_h_bayes_estimator  A function ys -> h. When missing while it would be needed,
                                         both optimal decisions are obtained numerically instead.
              GAIN_OPTIMAL_H_NUMERICALLY, RISK_OPTIMAL_H_NUMERICALLY  Should optimal h be found numerically?
              EVAL_*  Settings stored on the instance under the same names.
              EVAL_RESAMPLE_EVERY_TIME  Can results of sample_predictive_y, optimal_h_for_gain and optimal_h_for_risk be cached?
        """
        # NOTE: attributes are assigned in a fixed order because the
        # configuration line printed below iterates vars(self).
        self.y = y
        if y_mask is None:
            print2(
                "[Evaluation] WARNING: using default all data points in evaluation."
            )
            on_cpu = "cpu" in str(y.device).lower()
            tensor_types = torch if on_cpu else torch.cuda
            y_mask = torch.ones_like(y).type(tensor_types.ByteTensor)
        self.y_mask = y_mask

        self.loss = loss
        self.utility = u
        self.sample_predictive_y = sample_predictive_y

        # Without an estimator, any non-numerical setting cannot be honoured.
        estimator_needed = not (GAIN_OPTIMAL_H_NUMERICALLY
                                and RISK_OPTIMAL_H_NUMERICALLY)
        if optimal_h_bayes_estimator is None and estimator_needed:
            print2(
                "[Evaluation] WARNING: Optimal decisions h for both Risk and Gain will be obtained numerically."
            )
            optimal_h_bayes_estimator = lambda ys: None
            GAIN_OPTIMAL_H_NUMERICALLY = RISK_OPTIMAL_H_NUMERICALLY = True
        self.optimal_h_bayes_estimator = optimal_h_bayes_estimator

        self.GAIN_OPTIMAL_H_NUMERICALLY = GAIN_OPTIMAL_H_NUMERICALLY
        self.RISK_OPTIMAL_H_NUMERICALLY = RISK_OPTIMAL_H_NUMERICALLY

        self.EVAL_NSAMPLES_UTILITY_TERM_THETA = EVAL_NSAMPLES_UTILITY_TERM_THETA
        self.EVAL_NSAMPLES_UTILITY_TERM_Y = EVAL_NSAMPLES_UTILITY_TERM_Y
        self.EVAL_MAX_NITER = EVAL_MAX_NITER
        self.EVAL_SGD_PREC = EVAL_SGD_PREC
        self.EVAL_LR = EVAL_LR

        settings = [
            "%s=%s" % (k, format_value(v)) for k, v in vars(self).items()
        ]
        print("[Evaluation] Configuration: %s" % " ".join(settings))

        if not EVAL_RESAMPLE_EVERY_TIME:
            # Replace the bound methods by caching wrappers on this instance.
            for method_name in ("optimal_h_for_gain", "optimal_h_for_risk",
                                "sample_predictive_posterior"):
                setattr(self, method_name,
                        cache_function_last_result(getattr(self, method_name)))
Example #13
0
)

sleep(2)

for org in s.select("table a"):
    orgname = org.text.strip()

    if filterOrg != None and filterOrg != orgname:
        continue

    try:
        if not os.path.exists(os.path.join(data, orgname)):
            os.mkdir(os.path.join(data, orgname))

        orglink = urljoin(domain, org['href'])
        print2()
        log("{} {} {} {} {}".format(sport, year, division, orgname, dumpURL(orglink)))

        cs = parseURL(orglink)

        for link in cs.select("#contentarea > a"):
            if link.text.strip() == "Roster":
                tq.enqueue(
                    crawlTeam,
                    sport, year, division,
                    org.text.strip(),
                    "Roster",
                    urljoin(domain, link["href"]),
                    1,
                    at_front=atFront
                )