Example #1
def train(OpType, X, y, nepochs=150):
    progress = ProgressBar('Training')
    np.random.seed(0)
    #Features transformation
    X = np.log2(X)
    #Model
    model = kr.models.Sequential()
    for i, L in enumerate([64, 48, 32, 16, 8]):
        model.add(kr.layers.Dense(L, input_dim=X.shape[1]))
        model.add(kr.layers.Activation('relu'))
    model.add(kr.layers.Dense(1))
    model.add(kr.layers.Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    #Train
    history = model.fit(
        X,
        y,
        validation_split=0.1,
        batch_size=32,
        epochs=nepochs,
        verbose=1,
        callbacks=[
            kr.callbacks.LambdaCallback(
                on_epoch_end=lambda i, _: progress.update(i, nepochs))
        ])
    return model
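Example #1 (like Examples #7 and #12 below) drives the bar through a two-argument update(current, total) call. The ProgressBar class itself is not part of these listings, so the following is only a minimal sketch of a class exposing that interface, written as an assumption about the API rather than the project's actual implementation:

import sys

class ProgressBar:
    # Minimal stand-in: a labelled text bar driven by update(current, total).
    def __init__(self, label, width=40):
        self.label = label
        self.width = width

    def update(self, current, total):
        # current is 0-based, so current + 1 items are complete.
        frac = min(max(float(current + 1) / total, 0.0), 1.0)
        filled = int(round(self.width * frac))
        bar = '#' * filled + '-' * (self.width - filled)
        sys.stderr.write('\r%s [%s] %5.1f%%' % (self.label, bar, 100 * frac))
        if frac >= 1.0:
            sys.stderr.write('\n')
        sys.stderr.flush()

Plugged into Example #1, the LambdaCallback calls update(epoch, nepochs) once per epoch, so the bar reaches 100% on the final epoch.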
Example #2
def show_progress(progress, nsamples):
    bar = ProgressBar('Benchmarks')
    while True:
        sleep(0.1)
        current = np.sum(progress.values())
        bar.update(current, nsamples)
        if (current > nsamples - 1):
            break
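Example #2 polls a shared mapping whose values are per-worker sample counters, which is why the running total is np.sum(progress.values()). A hypothetical driver for it (the worker logic and names below are assumptions, not part of the original project; it assumes show_progress, ProgressBar and numpy from Example #2 are in scope) could pair the monitor with multiprocessing workers and a Manager dict:

import multiprocessing as mp
from time import sleep

def worker(progress, key, n):
    # Each worker only increments its own slot in the shared dict.
    for _ in range(n):
        sleep(0.01)                        # simulate one benchmark sample
        progress[key] = progress[key] + 1

if __name__ == '__main__':
    nsamples = 200
    with mp.Manager() as manager:
        progress = manager.dict({0: 0, 1: 0})
        workers = [mp.Process(target=worker, args=(progress, k, nsamples // 2))
                   for k in progress.keys()]
        for w in workers:
            w.start()
        show_progress(progress, nsamples)  # returns once all samples are counted
        for w in workers:
            w.join()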
Example #3
def show_progress(progress, nsamples):
    bar = ProgressBar('Benchmarks')
    while True:
        sleep(0.1)
        current = np.sum(progress.values())
        bar.update(current, nsamples)
        if(current > nsamples - 1):
            break
Example #4
    def train(self, env, config, batch_size=128, updates=500, max_seconds=30):
        models = config.get()
        for model in models:
            model.compile(optimizer=ko.RMSprop(lr=self.lr),
                          loss=[self._logits_loss, self._value_loss])
        # Storage helpers for a single batch of data.
        actions = np.empty((batch_size, config.num), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_size, config.num))
        observations = np.empty(
            (batch_size, config.window_size, env.observations_size))

        # Training loop: collect samples, send to optimizer, repeat updates times.
        deaths = {}
        for model in models:
            deaths[model.label] = 0
        obs_window = env.reset()
        episodes = []
        steps = 0
        pb = ProgressBar(f'{config.label}')
        total_progress = updates * batch_size
        progress = 0
        pb.reset()
        for _ in range(updates):
            for step in range(batch_size):
                steps += 1
                progress += 1
                observations[step] = obs_window
                for m_i, model in enumerate(models):
                    actions[step, m_i], values[step, m_i] = model.action_value(obs_window)
                obs_window, rewards[step], dones[step] = env.step(actions[step])
                if any(dones[step]) or max_seconds < steps * env.dt:
                    obs_window = env.reset()
                    episodes.append(steps * env.dt)
                    steps = 0
                    for dead, model in zip(dones[step], models):
                        if dead:
                            deaths[model.label] += 1
            losses = []
            for m_i, model in enumerate(models):
                _, next_value = model.action_value(obs_window)
                returns, advs = self._returns_advantages(
                    rewards[:, m_i], dones[:, m_i], values[:, m_i], next_value)
                # A trick to input actions and advantages through same API.
                acts_and_advs = np.concatenate(
                    [actions[:, m_i, None], advs[:, None]], axis=-1)
                loss = model.train_on_batch(
                    observations[:, -model.input_size:, :],
                    [acts_and_advs, returns])
                losses.append(loss[0])
            pb(progress / total_progress,
               f' loss: {sum(losses)/len(losses):6.3f}')
        return episodes, deaths
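Example #4 relies on a self._returns_advantages helper that is not shown in the listing. A sketch of what such a helper typically computes in an advantage actor-critic setup (an assumption based on the standard formulation, not the project's code) is:

import numpy as np

def _returns_advantages(rewards, dones, values, next_value, gamma=0.99):
    # Bootstrapped discounted returns, filled in backwards from next_value;
    # a terminal step (done) cuts off the bootstrap. gamma is an assumed discount factor.
    returns = np.append(np.zeros_like(rewards), next_value)
    for t in reversed(range(rewards.shape[0])):
        returns[t] = rewards[t] + gamma * returns[t + 1] * (1 - dones[t])
    returns = returns[:-1]
    # Advantages are returns minus the critic's value estimates.
    advantages = returns - values
    return returns, advantages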
Example #5
def permute_log_odds(clf,
                     boot_n,
                     feature_names=None,
                     region_names=None,
                     n_jobs=1):
    """ Given a fitted RegionalClassifier object, permute the column "importances" (i.e. log odds ratios)
    by resampling across studies. The function returns a pandas dataframe with z-score and p-values for each
    combination between a region and a topic in the Dataset """
    def z_score_array(arr, dist):
        return np.array([(v - dist[dist.region == i + 1].lor.mean()) /
                         dist[dist.region == i + 1].lor.std()
                         for i, v in enumerate(arr.tolist())])

    pb = ProgressBar(len(clf.data), start=True)
    overall_results = []

    if feature_names is None:
        feature_names = clf.feature_names

    if region_names is None:
        region_names = range(1, len(clf.data) + 1)

    # For each region, run boot_n number of permutations in parallel, and save to a list
    for reg, (X, y) in enumerate(clf.data):
        results = Parallel(n_jobs=n_jobs)(delayed(permutation_parallel)(
            X, y, clf.classifier, feature_names, reg, i)
                                          for i in range(boot_n))
        for result in results:
            for res in result:
                overall_results.append(res)
        pb.next()

    # Combine permuted data to a dataframe
    perm_results = pd.DataFrame(
        overall_results, columns=['region', 'perm_n', 'topic_name', 'lor'])

    # Reshape observed log odds ratios with real data, z-score observed value on permuted null distribution
    lor = pd.DataFrame(clf.importance,
                       index=range(1, clf.importance.shape[0] + 1),
                       columns=feature_names)
    lor_z = lor.apply(lambda x: z_score_array(
        x, perm_results[perm_results.topic_name == x.name]))
    lor_z.index = region_names

    # Transform to long format and add p-values
    all_roi_z = pd.melt(pd.concat([lor_z]).reset_index(),
                        value_name='lor_z',
                        id_vars='index')
    all_roi_z = all_roi_z.rename(columns={'index': 'ROI'})
    all_roi_z['p'] = (1 - norm.cdf(all_roi_z.lor_z.abs())) * 2

    return all_roi_z
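The final step converts each permutation z-score into a two-tailed p-value under a standard normal null. As a quick worked check of that formula:

from scipy.stats import norm

z = 2.1
p = (1 - norm.cdf(abs(z))) * 2   # equivalent to 2 * norm.sf(abs(z)), approx. 0.036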
Example #6
def test(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Testing')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    valid_f1 = AverageMeter()
    model.eval()
    count = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            b_features, b_target, b_idx = batch['features'].to(
                DEVICE), batch['target'].to(DEVICE), batch['idx'].to(DEVICE)
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target).item()
            pred = probs.argmax(
                dim=1,
                keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(b_target.view_as(pred)).sum().item()
            f1 = f1_score(pred.to("cpu").numpy(),
                          b_target.to("cpu").numpy(),
                          average='macro')
            valid_f1.update(f1, n=b_features.size(0))
            valid_loss.update(loss, n=b_features.size(0))
            valid_acc.update(correct, n=1)
            count += b_features.size(0)
            pbar(step=batch_idx)
    return {
        'valid_loss': valid_loss.avg,
        'valid_acc': valid_acc.sum / count,
        'valid_f1': valid_f1.avg
    }
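This example, and several of the training loops below, accumulate metrics in an AverageMeter driven by update(value, n) and read back through .sum and .avg. A minimal implementation consistent with that usage (an assumption; the project's own class may differ) is:

class AverageMeter:
    # Tracks the latest value plus a running, n-weighted sum, count and average.
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count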
Example #7
def prune(OpType, model, init_cuda):
    progress = ProgressBar('Pruning')
    device, ctx, stream = init_cuda()
    #Restore progress
    X = np.empty((0, OpType.Nshapes))
    Y = np.empty((0, OpType.Nparams - OpType.Nshapes), dtype=np.uint32)
    V = valid_configurations(OpType, device)
    #Update
    i = Y.shape[0]
    S = bench_shapes(OpType, device)
    for i, x in enumerate(S):
        perf, y = maximize(OpType, model, x, V, device, ctx, stream)
        X = np.vstack((X, x))
        Y = np.vstack((Y, y))
        progress.update(i, len(S))
        print(x, perf)
    #Remove duplicates
    Y = np.vstack(set(map(tuple, Y)))
    return Y
Example #8
def prune(OpType, model, init_cuda):
    progress = ProgressBar('Pruning')
    device, ctx, stream = init_cuda()
    #Restore progress
    X = np.empty((0, OpType.Nshapes))
    Y = np.empty((0, OpType.Nparams - OpType.Nshapes), dtype=np.uint32)
    V = valid_configurations(OpType, device)
    #Update
    i = Y.shape[0]
    S = bench_shapes(OpType, device)
    for i, x in enumerate(S):
        perf, y = maximize(OpType, model, x, V, device, ctx, stream)
        X = np.vstack((X, x))
        Y = np.vstack((Y, y))
        progress.update(i, len(S))
        print(x, perf)
    #Remove duplicates
    Y = np.vstack(set(map(tuple, Y)))
    return Y
Example #9
def bootstrap_log_odds(clf,
                       boot_n,
                       feature_names=None,
                       region_names=None,
                       n_jobs=1):
    def percentile(n):
        def percentile_(x):
            return np.percentile(x, n)

        percentile_.__name__ = 'percentile_%s' % n
        return percentile_

    pb = ProgressBar(len(clf.data), start=True)

    if feature_names is None:
        feature_names = clf.feature_names

    if region_names is None:
        region_names = range(1, len(clf.data) + 1)

    # For each region, calculate in parallel bootstrapped lor estimates
    overall_boot = []
    for reg, (X, y) in enumerate(clf.data):
        results = Parallel(n_jobs=n_jobs)(delayed(bootstrap_parallel)(
            X, y, clf.classifier, feature_names, region_names[reg], i)
                                          for i in range(boot_n))
        for result in results:
            for res in result:
                overall_boot.append(res)
        pb.next()

    overall_boot = pd.DataFrame(
        overall_boot, columns=['region', 'perm_n', 'topic_name', 'fi'])

    # Calculate the 95% confidence intervals from the bootstrapped samples
    return overall_boot.groupby(['region', 'topic_name'])['fi'].agg({
        'mean':
        np.mean,
        'low_ci':
        percentile(0.05),
        'hi_ci':
        percentile(99.95)
    }).reset_index()
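Note that the dict-of-callables form of .agg() used above relies on column renaming that was deprecated and later removed in pandas. An equivalent aggregation written with named aggregation (a sketch assuming pandas 0.25 or later, reusing the same percentile helper and overall_boot frame) would be:

summary = (overall_boot
           .groupby(['region', 'topic_name'])['fi']
           .agg(mean='mean', low_ci=percentile(0.05), hi_ci=percentile(99.95))
           .reset_index())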
Example #10
def _valid_step(model: tf.keras.Model,
                dataset: tf.data.Dataset,
                progress_bar: ProgressBar,
                loss_metric: tf.keras.metrics.Mean,
                max_train_steps: Any = -1) -> Dict:
    """ 验证步

    :param model: 验证模型
    :param dataset: 验证数据集
    :param progress_bar: 进度管理器
    :param loss_metric: 损失计算器
    :param max_train_steps: 验证步数
    :return: 验证指标
    """
    print("验证轮次")
    start_time = time.time()
    loss_metric.reset_states()

    for (batch, (train_enc, train_dec, month_enc, month_dec,
                 labels)) in enumerate(dataset.take(max_train_steps)):
        train_enc = tf.squeeze(train_enc, axis=0)
        train_dec = tf.squeeze(train_dec, axis=0)
        outputs = model(inputs=[train_enc, train_dec, month_enc, month_dec])
        treat_outputs = tf.squeeze(input=outputs[:, -24:, :], axis=-1)
        loss = tf.keras.losses.MSE(labels, treat_outputs)

        loss_metric(loss)

        progress_bar(
            current=batch + 1,
            metrics=get_dict_string(data={"valid_loss": loss_metric.result()}))

    progress_bar(
        current=progress_bar.total,
        metrics=get_dict_string(data={"valid_loss": loss_metric.result()}))

    progress_bar.done(step_time=time.time() - start_time)

    return {"valid_loss": loss_metric.result()}
Example #11
    def classify(self, scoring='accuracy', n_jobs=1, importance_function=None):
        """
        scoring -  scoring function or type (str)
        n_jobs - Number of parallel jobs
        importance_function - Function to extract importance vectors from classifiers (differs by algorithm)
        """
        if importance_function is None:
            importance_function = log_odds_ratio

        if self.data is None:
            self.load_data()
            self.initalize_containers()

        print("Classifying...")
        pb = ProgressBar(self.n_regions, start=True)

        for index, output in enumerate(
                Parallel(n_jobs=n_jobs)(delayed(classify_parallel)(
                    self.classifier, scoring, region_data, importance_function)
                                        for region_data in self.data)):
            self.class_score[index] = output['score']
            self.importance[index] = output['importance']
            self.predictions[index] = output['predictions']
            pb.next()
Example #12
def train(OpType, X, y, nepochs=150):
    progress = ProgressBar('Training')
    np.random.seed(0)
    #Features transformation
    X = np.log2(X)
    #Model
    model = kr.models.Sequential()
    for i, L in enumerate([64, 48, 32, 16, 8]):
        model.add(kr.layers.Dense(L, input_dim=X.shape[1]))
        model.add(kr.layers.Activation('relu'))
    model.add(kr.layers.Dense(1))
    model.add(kr.layers.Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    #Train
    history = model.fit(X, y, validation_split=0.1, batch_size=32, epochs=nepochs,
                        verbose=1, callbacks=[kr.callbacks.LambdaCallback(on_epoch_end=lambda i, _: progress.update(i, nepochs))])
    return model
Example #13
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    model.train()
    for batch_idx, batch in enumerate(dataloader):
        b_features, b_target, b_idx = batch['features'].to(
            DEVICE), batch['target'].to(DEVICE), batch['idx'].to(DEVICE)
        optimizer.zero_grad()
        with autocast():
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        pbar(step=batch_idx, info={'loss': loss.item()})
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
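Example #13 assumes module-level model, optimizer, scheduler, scaler and DEVICE objects. A typical setup consistent with that mixed-precision loop (illustrative only; the actual model, learning rate and schedule are not given in the listing) is:

import torch
from torch import nn
from torch.cuda.amp import GradScaler

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2)).to(DEVICE)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
# One-cycle schedule stepped once per batch; total_steps = epochs * batches per epoch (assumed numbers).
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, total_steps=10 * 100)
scaler = GradScaler()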
Example #14
def test(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Testing')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    count = 0
    for batch_idx, batch in enumerate(dataloader):
        # forward -- skip backward prop
        probas = model.forward(batch['features'])
        # record loss
        loss = model._logit_cost(batch['target'], probas)
        # get predictions
        prediction = torch.where(probas > 0.5, torch.tensor(1, device=device),
                                 torch.tensor(0, device=device)).view(-1)
        # compare
        correct = prediction.eq(batch['target']).sum().item()
        valid_loss.update(loss.item(), n=batch['features'].size(0))
        valid_acc.update(correct, n=1)
        count += batch['features'].size(0)
        pbar(step=batch_idx)
    return {'valid_loss': valid_loss.avg, 'valid_acc': valid_acc.sum / count}
Example #15
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    for batch_idx, batch in enumerate(dataloader):
        # forward
        probas = model.forward(batch['features'])
        # backward
        grad_w, grad_b = model.backward(batch['features'], batch['target'],
                                        probas)
        # manual regularization -- account for mini-batches
        l2_reg = model.LAMBDA * model.weights / len(dataloader)
        # update weights
        model.weights -= learning_rate * (grad_w + l2_reg)
        model.bias -= learning_rate * grad_b
        # record loss
        loss = model._logit_cost(batch['target'], probas)
        # update meter
        train_loss.update(loss.item(), n=1)
        # update progress bar
        pbar(step=batch_idx, info={'batch_loss': loss.item()})
    return {'train_loss': train_loss.avg}
Example #16
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    for batch_idx, batch in enumerate(dataloader):
        # forward
        y_hat = model.forward(batch['features'].float())
        # backward
        grad_w, grad_b = model.backward(batch['features'], batch['target'],
                                        y_hat)
        # manual regularization
        l2_reg = model.LAMBDA * model.weights
        l2_reg = l2_reg.reshape(2, 1)
        # update weights
        model.weights -= learning_rate * (grad_w + l2_reg).view(-1)
        model.bias -= (learning_rate * grad_b).view(-1)
        # record loss
        loss = model.loss(batch['target'], y_hat)
        # update meter
        train_loss.update(loss.item(), n=1)
        # update progress bar
        pbar(step=batch_idx, info={'batch_loss': loss.item()})
    return {'train_loss': train_loss.avg}
Example #17
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    model.train()
    for batch_idx, batch in enumerate(dataloader):
        b_features, b_target, b_idx = batch['features'].to(
            DEVICE), batch['target'].to(DEVICE), batch['idx'].to(DEVICE)
        optimizer.zero_grad()
        with autocast():
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target)
            # regularize loss -- but not the intercept
            LAMBDA, L2 = 2, 0.
            for name, p in model.named_parameters():
                if 'weight' in name:
                    L2 = L2 + (p**2).sum()
            loss = loss + 2. / b_target.size(0) * LAMBDA * L2
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        pbar(step=batch_idx, info={'loss': loss.item()})
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
Example #18
def main(argv):

    pathFinder = PathFinder()
    numTravellers = 2
    combinationLimit = 1000000
    minPathLength = -1
    maxPathLength = -1
    maxEdgeRedundancy = -1
    guiFormat = False
    fileName = ""

    try:                                
        opts, args = getopt.getopt(argv, "?gst:l:f:i:a:chr:", ["help", "guiFormat", "silent", "travellers=", "limit=", "filename=", "min=", "max=", "cyclic", "allowHomes", "allowhomes", "redundancy="])

        for opt, arg in opts:
            if opt in ("-?", "--help"):
                usage()                     
                sys.exit(1)
            if opt in ("-t", "--travellers"):
                numTravellers = int(arg)
            if opt in ("-l", "--limit"):
                combinationLimit = int(arg) * 1000000

            if opt in ("-f", "--filename"):
                fileName = arg

            if opt in ("-i", "--min"):
                minPathLength = int(arg) + 1

            if opt in ("-a", "--max"):
                maxPathLength = int(arg) + 1

            if opt in ("-r", "--redundancy"):
                maxEdgeRedundancy = int(arg)

            if opt in ("-c", "--cyclic"):
                pathFinder.useCyclicBFS = True

            if opt in ("-h", "--allowHomes", "--allowhomes"):
                pathFinder.canPassHomeNodes = True

            if opt in ("-g", "--guiFormat", "--guiformat"):
                guiFormat = True

            if opt in ("-s", "--silent"):
                global SILENT_MODE 
                SILENT_MODE = True

        if len(fileName) == 0:
            usage()
            sys.exit(2)
            
    except getopt.GetoptError:
        usage()
        sys.exit(2) 

    progressBar = ProgressBar()
    testGraph = parseGraph(fileName)

    Message("\n* Solving for " + str(numTravellers) + " traveller(s)")

    if combinationLimit > 0:
        Message("* Considering at most " + str(combinationLimit) + " combinations.")
    else:
        Message("* Attempting to solve all combinations.")

    homeNodeIds = testGraph.GetHomeNodeIds()
    homeNodePairs = itertools.combinations(homeNodeIds, 2)

    solutions = []


    # FindAllPaths for every pair of home nodes
    for p in homeNodePairs:
        for s in pathFinder.FindAllPaths(testGraph, p[0], p[1]):
            if(minPathLength == -1 or len(s) >= minPathLength) and (maxPathLength == -1 or len(s) <= maxPathLength):
                solutions.append(s)


    #generate solution sets
    solutions.sort()
    
    Message("Discovered " + str(len(solutions)) + " paths for all home nodes.")
    combinations = itertools.combinations(solutions, numTravellers)
    
    solutionSets = []

    numMillions = 1
  
   # if combinationLimit > 0:
    currentCombination = 0
    for c in combinations:
        if currentCombination == combinationLimit and combinationLimit > 0:
            break

        if currentCombination > numMillions*1000000:
            Warning("** WARNING: over " + str(numMillions) + " million combinations.")
            numMillions = numMillions + 1

        solutionSets.append(c)
        currentCombination = currentCombination + 1
   # else:
   #     solutionSets = list(combinations)

    Message("* Spawned " + str(len(solutionSets)) + " combinations.")
    
    # get rid of gazillions duplicate entries
    Message("* Filtering combinations, this may take a while...")
    solutionSets.sort()
    solutionSets = list(solutionSets for solutionSets,_ in itertools.groupby(solutionSets))

    totalNumSets = len(solutionSets)

    Message("* Will check " + str(totalNumSets) + " unique sets")

    possibleSolutions = []
    currentSetNum = 0
    solutionNum = 1

    for s in solutionSets:

        if not SILENT_MODE:
            progressBar.draw(currentSetNum, totalNumSets)

        currentSetNum = currentSetNum + 1

        testGraph.Reset()
        possibleSolution = testGraph.IsSolvableForSet(s)

        if possibleSolution is not None:
            Message("\rSolution " + str(solutionNum) + " " + str(possibleSolution))

            # check how many edges are left unused, the less the better
            unusedEdges = testGraph.GetFreeEdges()

            possibleSolutions.append((possibleSolution, unusedEdges))
            solutionNum = solutionNum + 1

        if not SILENT_MODE:
            progressBar.draw(currentSetNum, totalNumSets)

    Message("\n")

    # sort solutions by number of unused edges
    possibleSolutions.sort(key=lambda possibleSolutions: len(possibleSolutions[1]))

    numSolutionsListed = 0

    guiFormatDataList = [] # container of guiFormatData

    for s in possibleSolutions:
        solutionString = str(s[0]) + " "

        guiFormatData = dict()
        guiFormatData['Paths'] = s[0]
        guiFormatData['PathEndNodes'] = []
        guiFormatData['MoveLimits'] = []
        for element in s[0]:
            startPoint = "(SP: " + str(element[0]) + "|" + str(element[len(element)-1]) + " ML: " + str(len(element)-1) + ") "
            solutionString += startPoint
            guiFormatData['PathEndNodes'].append((element[0], element[len(element)-1]))
            guiFormatData['MoveLimits'].append(len(element)-1)
                                                 

        solutionString += "RE: " + str(len(s[1])) + " "

        redundantEdgeIdList = []

        for e in s[1]:
            redundantEdgeIdList.append(e.id)

        guiFormatData['RedundantEdgeIds'] = redundantEdgeIdList
 
        if len(s[1]) > 0:
            unusedEdgesStr = ""
            for ue in s[1]:
                unusedEdgesStr += "(" + str(ue.connectedNodes[0].id) + "-" + str(ue.connectedNodes[1].id) + ")"

            solutionString += "[" + unusedEdgesStr + "]"        

        if maxEdgeRedundancy < 0 or len(s[1]) <= maxEdgeRedundancy:
            numSolutionsListed = numSolutionsListed + 1
            guiFormatDataList.append(guiFormatData)
            print(solutionString)


    guiDataOutput = open('output.txt', 'wb')
    pickle.dump(guiFormatDataList, guiDataOutput, -1)
    guiDataOutput.close()

    if len(possibleSolutions) == 0:
        Warning("*** NO SOLUTIONS FOUND. ***\n")
        sys.exit(1)
    else:
        Message("\nFound " + str(len(possibleSolutions)) + " solutions. ")
        Message("\nListed " + str(numSolutionsListed) + " solutions. ")
Example #19
def train(model: tf.keras.Model,
          checkpoint: tf.train.CheckpointManager,
          batch_size: Any,
          epochs: Any,
          train_dataset: Any,
          valid_dataset: AnyStr = None,
          max_train_steps: Any = -1,
          checkpoint_save_freq: Any = 2,
          *args,
          **kwargs) -> Dict:
    """ 训练器

    :param model: 训练模型
    :param checkpoint: 检查点管理器
    :param batch_size: batch 大小
    :param epochs: 训练周期
    :param train_dataset: 训练数据集
    :param valid_dataset: 验证数据集
    :param max_train_steps: 最大训练数据量,-1为全部
    :param checkpoint_save_freq: 检查点保存频率
    :return:
    """
    print("训练开始,正在准备数据中")
    # learning_rate = CustomSchedule(d_model=embedding_dim)
    loss_metric = tf.keras.metrics.Mean(name="train_loss_metric")
    optimizer = tf.optimizers.Adam(learning_rate=2e-5,
                                   beta_1=0.9,
                                   beta_2=0.999,
                                   name="optimizer")

    train_steps_per_epoch = max_train_steps if max_train_steps != -1 else (
        40000 // batch_size)
    valid_steps_per_epoch = 3944 // batch_size

    progress_bar = ProgressBar()
    for epoch in range(epochs):
        print("Epoch {}/{}".format(epoch + 1, epochs))
        start_time = time.time()
        loss_metric.reset_states()
        progress_bar.reset(total=train_steps_per_epoch, num=batch_size)

        train_metric = None
        for (batch,
             (train_enc, train_dec, month_enc, month_dec,
              labels)) in enumerate(train_dataset.take(max_train_steps)):
            train_metric, prediction = _train_step(model=model,
                                                   optimizer=optimizer,
                                                   loss_metric=loss_metric,
                                                   train_enc=train_enc,
                                                   train_dec=train_dec,
                                                   month_enc=month_enc,
                                                   month_dec=month_dec,
                                                   labels=labels)

            progress_bar(current=batch + 1,
                         metrics=get_dict_string(data=train_metric))
        progress_bar(current=progress_bar.total,
                     metrics=get_dict_string(data=train_metric))

        progress_bar.done(step_time=time.time() - start_time)

        if (epoch + 1) % checkpoint_save_freq == 0:
            checkpoint.save()

            if valid_steps_per_epoch == 0 or valid_dataset is None:
                print("验证数据量过小,小于batch_size,已跳过验证轮次")
            else:
                progress_bar.reset(total=valid_steps_per_epoch, num=batch_size)
                valid_metrics = _valid_step(model=model,
                                            dataset=valid_dataset,
                                            progress_bar=progress_bar,
                                            loss_metric=loss_metric,
                                            **kwargs)
    print("训练结束")
    return {}
Example #20
def download(tickers: list,
             start: Union[str, int] = None,
             end: Union[str, int] = None,
             interval: str = "1d") -> dict:
    """
    Download historical data for tickers in the list.

    Parameters
    ----------
    tickers: list
        Tickers for which to download historical information.
    start: str or int
        Start download data from this date.
    end: str or int
        End download data at this date.
    interval: str
        Frequency between data.

    Returns
    -------
    data: dict
        Dictionary including the following keys:
        - tickers: list of tickers;
        - dates: dates corresponding to the downloaded prices;
        - price: array of adjusted closing prices, shape=(num stocks, length period);
        - volume: array of volumes, shape=(num stocks, length period);
        - currencies: list of the currency each ticker is quoted in;
        - exchange_rates: exchange rates from each currency to the default currency;
        - default_currency: most common currency across the downloaded tickers;
        - sectors: dictionary of stock sector for each ticker;
        - industries: dictionary of stock industry for each ticker.
    """
    tickers = tickers if isinstance(tickers,
                                    (list, set, tuple)) else tickers.replace(
                                        ',', ' ').split()
    tickers = list(set([ticker.upper() for ticker in tickers]))

    data = {}
    si_columns = ["SYMBOL", "CURRENCY", "SECTOR", "INDUSTRY"]
    si_filename = "stock_info.csv"
    if not os.path.exists(si_filename):
        # create a .csv to store stock information
        with open(si_filename, 'w') as file:
            wr = csv.writer(file)
            wr.writerow(si_columns)
    # load stock information file
    si = pd.read_csv(si_filename)
    missing_tickers = [
        ticker for ticker in tickers if ticker not in si['SYMBOL'].values
    ]
    missing_si, na_si = {}, {}
    currencies = {}

    if end is None:
        end = int(dt.datetime.timestamp(dt.datetime.today()))
    elif type(end) is str:
        end = int(dt.datetime.timestamp(dt.datetime.strptime(end, '%Y-%m-%d')))
    if start is None:
        start = int(
            dt.datetime.timestamp(dt.datetime.today() - dt.timedelta(365)))
    elif type(start) is str:
        start = int(
            dt.datetime.timestamp(dt.datetime.strptime(start, '%Y-%m-%d')))

    @multitasking.task
    def _download_one_threaded(ticker: str,
                               start: str,
                               end: str,
                               interval: str = "1d"):
        """
        Download historical data for a single ticker using multithreading; it also scrapes missing stock information.

        Parameters
        ----------
        ticker: str
            Ticker for which to download historical information.
        interval: str
            Frequency between data.
        start: str
            Start download data from this date.
        end: str
            End download data at this date.
        """
        data_one = _download_one(ticker, start, end, interval)

        try:
            data_one = data_one["chart"]["result"][0]
            data[ticker] = _parse_quotes(data_one)

            if ticker in missing_tickers:
                currencies[ticker] = data_one['meta']['currency']
                try:
                    html = requests.get(
                        url='https://finance.yahoo.com/quote/' + ticker).text
                    json_str = html.split('root.App.main =')[1].split(
                        '(this)')[0].split(';\n}')[0].strip()
                    info = json.loads(json_str)['context']['dispatcher'][
                        'stores']['QuoteSummaryStore']['summaryProfile']
                    assert (len(info['sector']) > 0) and (len(info['industry'])
                                                          > 0)
                    missing_si[ticker] = dict(sector=info["sector"],
                                              industry=info["industry"])
                except:
                    pass
        except:
            pass
        progress.animate()

    num_threads = min([len(tickers), multitasking.cpu_count() * 2])
    multitasking.set_max_threads(num_threads)

    progress = ProgressBar(len(tickers), 'completed')

    for ticker in tickers:
        _download_one_threaded(ticker, start, end, interval)
    multitasking.wait_for_tasks()

    progress.completed()

    if len(data) == 0:
        raise Exception("No symbol with full information is available.")

    data = pd.concat(data.values(), keys=data.keys(), axis=1, sort=True)
    data.drop(
        columns=data.columns[data.isnull().sum(0) > 0.33 * data.shape[0]],
        inplace=True)
    data = data.fillna(method='bfill').fillna(method='ffill').drop_duplicates()

    info = zip(list(missing_si.keys()),
               [currencies[ticker] for ticker in missing_si.keys()],
               [v['sector'] for v in missing_si.values()],
               [v['industry'] for v in missing_si.values()])
    with open(si_filename, 'a+', newline='') as file:
        wr = csv.writer(file)
        for row in info:
            wr.writerow(row)
    si = pd.read_csv('stock_info.csv').set_index("SYMBOL").to_dict(
        orient='index')

    missing_tickers = [
        ticker for ticker in tickers
        if ticker not in data.columns.get_level_values(0)[::2].tolist()
    ]
    tickers = data.columns.get_level_values(0)[::2].tolist()
    if len(missing_tickers) > 0:
        print(
            '\nRemoving {} from list of symbols because we could not collect full information.'
            .format(missing_tickers))

    # download exchange rates and convert to most common currency
    currencies = [
        si[ticker]['CURRENCY'] if ticker in si else currencies[ticker]
        for ticker in tickers
    ]
    ucurrencies, counts = np.unique(currencies, return_counts=True)
    default_currency = ucurrencies[np.argmax(counts)]
    xrates = get_exchange_rates(currencies, default_currency, data.index,
                                start, end, interval)

    return dict(tickers=tickers,
                dates=pd.to_datetime(data.index),
                price=data.iloc[:,
                                data.columns.get_level_values(1) ==
                                'Adj Close'].to_numpy().T,
                volume=data.iloc[:,
                                 data.columns.get_level_values(1) ==
                                 'Volume'].to_numpy().T,
                currencies=currencies,
                exchange_rates=xrates,
                default_currency=default_currency,
                sectors={
                    ticker:
                    si[ticker]['SECTOR'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                },
                industries={
                    ticker:
                    si[ticker]['INDUSTRY'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                })
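A hypothetical call to this download() (tickers and dates are illustrative; it needs network access plus the _download_one, _parse_quotes and get_exchange_rates helpers it references):

data = download(["AAPL", "MSFT", "GOOGL"], start="2023-01-01", end="2023-12-31")
print(data["tickers"])
print(data["price"].shape)        # (num stocks, length of period)
print(data["default_currency"])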
Example #21
def download(tickers: list, interval: str = "1d", period: str = "1y"):
    """
    Download historical data for tickers in the list.

    Parameters
    ----------
    tickers: list
        Tickers for which to download historical information.
    interval: str
        Frequency between data.
    period: str
        Data period to download.

    Returns
    -------
    data: dict
        Dictionary including the following keys:
        - tickers: list of tickers
        - logp: array of log-adjusted closing prices, shape=(num stocks, length period);
        - volume: array of volumes, shape=(num stocks, length period);
        - sectors: list of stock sectors;
        - industries: list of stock industries.
    """
    tickers = tickers if isinstance(tickers,
                                    (list, set, tuple)) else tickers.replace(
                                        ',', ' ').split()
    tickers = list(set([ticker.upper() for ticker in tickers]))

    data = {}
    si_columns = ["SYMBOL", "SECTOR", "INDUSTRY"]
    si_filename = "stock_info.csv"
    if not os.path.exists(si_filename):
        # create a .csv to store stock information
        with open(si_filename, 'w') as file:
            wr = csv.writer(file)
            wr.writerow(si_columns)
    # load stock information file
    si = pd.read_csv(si_filename)
    missing_tickers = [
        ticker for ticker in tickers if ticker not in si['SYMBOL'].values
    ]
    missing_si, na_si = {}, {}

    @multitasking.task
    def _download_one_threaded(ticker: str,
                               interval: str = "1d",
                               period: str = "1y"):
        """
        Download historical data for a single ticker using multithreading; it also scrapes missing stock information.

        Parameters
        ----------
        ticker: str
            Ticker for which to download historical information.
        interval: str
            Frequency between data.
        period: str
            Data period to download.
        """
        data_one = _download_one(ticker, interval, period)

        try:
            data[ticker] = parse_quotes(data_one["chart"]["result"][0])

            if ticker in missing_tickers:
                try:
                    html = requests.get(
                        url='https://finance.yahoo.com/quote/' + ticker).text
                    json_str = html.split('root.App.main =')[1].split(
                        '(this)')[0].split(';\n}')[0].strip()
                    info = json.loads(json_str)['context']['dispatcher'][
                        'stores']['QuoteSummaryStore']['summaryProfile']
                    assert (len(info['sector']) > 0) and (len(info['industry'])
                                                          > 0)
                    missing_si[ticker] = dict(sector=info["sector"],
                                              industry=info["industry"])
                except:
                    pass
        except:
            pass
        progress.animate()

    num_threads = min([len(tickers), multitasking.cpu_count() * 2])
    multitasking.set_max_threads(num_threads)

    progress = ProgressBar(len(tickers), 'completed')

    for ticker in tickers:
        _download_one_threaded(ticker, interval, period)
    multitasking.wait_for_tasks()

    progress.completed()

    if len(data) == 0:
        raise Exception("No symbol with full information is available.")

    data = pd.concat(data.values(), keys=data.keys(), axis=1)
    data.drop(
        columns=data.columns[data.isnull().sum(0) > 0.33 * data.shape[0]],
        inplace=True)
    data = data.fillna(method='bfill').fillna(method='ffill').drop_duplicates()

    info = zip(list(missing_si.keys()),
               [v['sector'] for v in missing_si.values()],
               [v['industry'] for v in missing_si.values()])
    with open(si_filename, 'a+', newline='') as file:
        wr = csv.writer(file)
        for row in info:
            wr.writerow(row)
    si = pd.read_csv('stock_info.csv').set_index("SYMBOL").to_dict(
        orient='index')

    missing_tickers = [
        ticker for ticker in tickers
        if ticker not in data.columns.get_level_values(0)[::2].tolist()
    ]
    tickers = data.columns.get_level_values(0)[::2].tolist()
    if len(missing_tickers) > 0:
        print(
            '\nRemoving {} from list of symbols because we could not collect full information.'
            .format(missing_tickers))

    return dict(tickers=tickers,
                dates=pd.to_datetime(data.index),
                logp=np.log(data.iloc[:,
                                      data.columns.get_level_values(1) ==
                                      'Adj Close'].to_numpy().T),
                volume=data.iloc[:,
                                 data.columns.get_level_values(1) ==
                                 'Volume'].to_numpy().T,
                sectors=[
                    si[ticker]['SECTOR'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                ],
                industries=[
                    si[ticker]['INDUSTRY'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                ])
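A hypothetical call to this older variant (tickers illustrative): one year of daily data, returned as log-adjusted closing prices.

data = download(["AAPL", "MSFT"], interval="1d", period="1y")
print(data["logp"].shape)         # (num stocks, length of period)
print(data["sectors"])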