import sys
from collections import defaultdict

import numpy as np


def Off_policy_MC_Control(env, episode_nums, discount_factor=1.0):
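    """Off-policy Monte Carlo control with weighted importance sampling.

    The behaviour policy distribution comes from ``sample_policy`` and the
    target policy is the greedy policy with respect to Q (the helper and the
    Blackjack environment are assumed to be defined elsewhere in this module).
    Returns the estimated action-value function Q mapping state -> array of
    action values.
    """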

    # Action-value estimates and the greedy target policy derived from them.
    Q = defaultdict(lambda: np.zeros(env.nA))
    target_policy = defaultdict(float)

    # Cumulative sum of importance-sampling weights C(s, a) for each pair.
    return_count = defaultdict(float)


    for i_episode in range(1,1+episode_nums):
        env._reset()
        state = env.observation()
        episode=[]
        prob_b=[]
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, episode_nums))
            sys.stdout.flush()
        # Generate one episode by following the behaviour policy, capping the
        # episode length at 100 steps as a safeguard.
        for _ in range(100):
            # Behaviour policy: probability distribution over actions in this state.
            probs = sample_policy(Q, state, env.nA)
            action = np.random.choice(np.arange(env.nA), p=probs)

            next_state, reward, done = env._step(action)
            episode.append((state, action, reward))
            prob_b.append(probs[action])
            if done:
                break
            state = next_state

        # Work backwards through the episode, updating Q with the incremental
        # weighted importance-sampling estimator.
        G = 0.0
        W = 1.0
        prob_b = prob_b[::-1]
        for idx, step in enumerate(episode[::-1]):
            state, action, reward = step
            pair = (state, action)
            G = discount_factor * G + reward
            # C(s, a): cumulative sum of the importance-sampling weights.
            return_count[pair] += W
            Q[state][action] += (W / return_count[pair]) * (G - Q[state][action])
            target_policy[state] = np.argmax(Q[state])
            # The greedy target policy is deterministic, so once it disagrees
            # with the action actually taken the weight becomes zero: stop early.
            if target_policy[state] != action:
                break
            # pi(a|s) = 1 for the greedy action, so W *= 1 / b(a|s).
            W = W * 1.0 / prob_b[idx]

    return Q
def MC_Control_with_epsilon_greedy(env, episode_nums, discount_factor=1.0, epsilon=0.1):
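    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    The acting policy is derived from the current Q by ``epsilon_greedy_policy``
    (assumed to be defined elsewhere in this module). Returns the estimated
    action-value function Q mapping state -> array of action values.
    """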

    # Action-value estimates plus per-(state, action) return statistics used
    # for the running-average first-visit Monte Carlo estimate.
    Q = defaultdict(lambda: np.zeros(env.nA))
    return_sum = defaultdict(float)
    return_count = defaultdict(float)

    for i_episode in range(1,1+episode_nums):
        env._reset()
        state = env.observation()
        episode=[]
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, episode_nums))
            sys.stdout.flush()
        # Generate one episode by following the epsilon-greedy policy derived
        # from the current Q, capping the episode length at 100 steps.
        for _ in range(100):
            probs = epsilon_greedy_policy(Q, state, env.nA, epsilon)
            action = np.random.choice(np.arange(env.nA), p=probs)

            next_state, reward, done = env._step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # First-visit MC: update each (state, action) pair once per episode,
        # using the return that follows its first occurrence.
        state_action_pairs = set([(tuple(x[0]), x[1]) for x in episode])

        for state, action in state_action_pairs:
            # Index of the first visit to this (state, action) pair.
            first_visit_idx = next(
                idx for idx, e in enumerate(episode)
                if e[0] == state and e[1] == action
            )
            pair = (state, action)
            # Discounted return following the first visit.
            G = sum(e[2] * (discount_factor ** i)
                    for i, e in enumerate(episode[first_visit_idx:]))
            # Running average of the sampled returns estimates Q(s, a).
            return_sum[pair] += G
            return_count[pair] += 1.0
            Q[state][action] = return_sum[pair] / return_count[pair]
    return Q
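

# Example usage (a minimal sketch): assumes a Blackjack environment exposing
# nA, _reset(), observation() and _step(action), and that sample_policy /
# epsilon_greedy_policy are defined elsewhere in this module. The episode
# counts below are illustrative only.
if __name__ == "__main__":
    env = Blackjack()
    Q_off_policy = Off_policy_MC_Control(env, episode_nums=500000)
    Q_eps_greedy = MC_Control_with_epsilon_greedy(env, episode_nums=500000, epsilon=0.1)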