def gensummary_gpt2(template_vec,
                    ge,
                    vocab,
                    LMModel,
                    word_list,
                    subvocab,
                    clustermask=None,
                    mono=True,
                    renorm=True,
                    temperature=1,
                    bpe2word='last',
                    max_step=20,
                    beam_width=10,
                    beam_width_start=10,
                    alpha=0.1,
                    alpha_start=0.1,
                    begineos=True,
                    stopbyLMeos=False,
                    devid=0,
                    **kwargs):
    """
    Unsupervised sentence summary generation using beam search, by contextual matching and a summary-style language model.
    The contextual matching here is on top of pretrained GPT-2 embeddings.
    
    Input:
        template_vec: forward GPT-2 embeddings of the source sentence, as produced by 'ge'. 'torch.Tensor'.
        ge: 'gpt2_sequential_embedder.GPT2Embedder' object.
        vocab: 'torchtext.vocab.Vocab' object. Should be the same one used for the pretrained language model.
        LMModel: a pretrained language model on the summary sentences.
        word_list: a list of words in the vocabulary to work with. 'List'.
        subvocab: 'torch.LongTensor' consisting of the indices of the words corresponding to 'word_list'.
        clustermask: a binary mask for each sub-vocabulary word. 'torch.ByteTensor' of size (len(sub-vocabulary), len(vocabulary)). Default: None.
        mono: whether to keep the monotonicity constraint. Default: True.
        renorm: whether to renormalize the probabilities over the sub-vocabulary. Default: True.
        temperature: temperature applied to the softmax in the language model. Default: 1.
        bpe2word: how to turn the BPE vectors into word vectors. Choose from ['last', 'avg']. Default: 'last'.
        max_step: maximum number of beam steps.
        beam_width: beam width.
        beam_width_start: beam width of the first step.
        alpha: the weight of the language model score in the mixture. The total score is: (1 - alpha) * similarity_logscore + alpha * LM_logscore.
        alpha_start: same as 'alpha', but applied only at the first step.
        begineos: whether to begin with the special '<eos>' token, as was done when training the language model. Default: True.
        stopbyLMeos: whether to stop a sentence solely by the language model predicting '<eos>' as the top possibility. Default: False.
        devid: device id on which to run the algorithm and the LSTM language model. 'int', default: 0; use -1 for CPU.
        **kwargs: other keyword arguments passed to <Beam.beamstep>.
            E.g. normalized: whether to normalize the dot product when computing the similarity, making it a cosine similarity. Default: True.
                 ifadditive: whether to use an additive model for mixing the probability scores. Default: False.
    
    Output:
        beam: 'Beam' object, recording all the generated sequences.
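
    Example:
        A minimal usage sketch (illustrative; assumes 'src_vec', 'ge', 'vocab', 'LMModel',
        'word_list', and 'subvocab' have been prepared as described above):

            beam = gensummary_gpt2(src_vec, ge, vocab, LMModel, word_list, subvocab,
                                   beam_width=10, max_step=20, alpha=0.1, alpha_start=0.1)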
        
    """
    device = 'cpu' if devid == -1 else f'cuda:{devid}'
    
    # Beam Search: initialization
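    # (a single initial hypothesis with zero similarity/LM scores and empty recurrent states;
    #  with begineos=True it starts from the '<eos>' start symbol the summary LM was trained with)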
    if begineos:
        beam = Beam(1, vocab, init_ids=[vocab.stoi['<eos>']], device=device,
                sim_score=0, lm_score=0, lm_state=None, gpt2_state=None, align_loc=None)
    else:
        beam = Beam(1, vocab, init_ids=[None], device=device,
                sim_score=0, lm_score=0, lm_state=None, gpt2_state=None, align_loc=None)
    
    # first step: start with 'beam_width_start' best matched words
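    # (candidates are scored by (1 - alpha) * similarity_logscore + alpha * LM_logscore;
    #  the first step uses its own mixing weight 'alpha_start' and width 'beam_width_start',
    #  e.g. with alpha_start=0.1 the mixture is 0.9 * similarity + 0.1 * LM)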
    beam.beamstep(beam_width_start,
                  beam.combscoreK_GPT2,
                  template_vec=template_vec,
                  ge=ge,
                  LMModel=LMModel,
                  word_list=word_list,
                  subvocab=subvocab,
                  clustermask=clustermask, 
                  alpha=alpha_start,
                  renorm=renorm,
                  temperature=temperature,
                  bpe2word=bpe2word,
                  # normalized=True,   # <Beam.beamstep> defaults; passing them here as well
                  # ifadditive=False,  # as via **kwargs would raise a duplicate-keyword error
                  **kwargs)
    
    # run beam search, until all sentences hit <EOS> or max_step reached
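    # (each step extends every unfinished hypothesis over the sub-vocabulary and keeps the top
    #  'beam_width'; with mono=True the source-side alignment must advance monotonically, and
    #  with stopbyLMeos=True a hypothesis also terminates once the LM ranks '<eos>' first)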
    for s in range(max_step):
        print(f'beam step {s+1} ' + '-' * 50 + '\n')
        beam.beamstep(beam_width,
                      beam.combscoreK_GPT2,
                      template_vec=template_vec,
                      ge=ge,
                      LMModel=LMModel,
                      word_list=word_list,
                      subvocab=subvocab,
                      clustermask=clustermask,
                      mono=mono,
                      alpha=alpha,
                      renorm=renorm,
                      temperature=temperature,
                      stopbyLMeos=stopbyLMeos,
                      bpe2word=bpe2word,
                      # normalized=True,
                      # ifadditive=False,
                      **kwargs)
        # all beams reach termination
        if beam.endall:
            break
    
    return beam


def gensummary_elmo(template_vec,
                    ee,
                    vocab,
                    LMModel,
                    word_list,
                    subvocab,
                    clustermask=None,
                    mono=True,
                    renorm=True,
                    temperature=1,
                    elmo_layer='avg',
                    max_step=20,
                    beam_width=10,
                    beam_width_start=10,
                    alpha=0.1,
                    alpha_start=0.1,
                    begineos=True,
                    stopbyLMeos=False,
                    devid=0,
                    **kwargs):
    """
    Unsupervised sentence summary generation using beam search, by contextual matching and a summary-style language model.
    The contextual matching here is on top of pretrained ELMo embeddings.
    
    Input:
        - template_vec (torch.Tensor): forward-only ELMo embeddings of the source sentence,
            of size (3, seq_len, 512).
        - ee (elmo_sequential_embedder.ElmoEmbedderForward): the ELMo forward embedder object.
        - vocab (torchtext.vocab.Vocab): the vocabulary; should be the same one used for the
            pretrained language model.
        - LMModel (torch.nn.Module): a language model pretrained on the summary sentences.
        - word_list (list): a list of words in the vocabulary to work with.
        - subvocab (torch.LongTensor): the indices of the words corresponding to `word_list`.
        - clustermask (torch.ByteTensor): a binary mask for each sub-vocabulary word,
            of size (len(sub-vocabulary), len(vocabulary)). Default: None.
        - mono (bool): whether to keep the monotonicity constraint. Default: True.
        - renorm (bool): whether to renormalize the probabilities over the sub-vocabulary. Default: True.
        - temperature (float): temperature applied to the softmax in the language model. Default: 1.
        - elmo_layer (str): which ELMo layer to use as the word type representation.
            Choose from ['avg', 'cat', 'bot', 'mid', 'top']. Default: 'avg'.
        - max_step (int): maximum number of beam steps.
        - beam_width (int): beam width.
        - beam_width_start (int): beam width of the first step.
        - alpha (float): the weight of the language model score in the mixture. The total score is:
            (1 - alpha) * similarity_logscore + alpha * LM_logscore.
        - alpha_start (float): same as `alpha`, but applied only at the first step.
        - begineos (bool): whether to begin with the special '<eos>' token, as was done when
            training the language model. Note that ELMo has its own special beginning token. Default: True.
        - stopbyLMeos (bool): whether to stop a sentence solely by the language model predicting '<eos>' as the
            top possibility. Default: False.
        - devid (int): device id on which to run the algorithm and the LSTM language model.
            Default: 0; use -1 for CPU.
        **kwargs: other keyword arguments passed to <Beam.beamstep>.
            E.g. - normalized (bool): whether to normalize the dot product when computing the
                     similarity, making it a cosine similarity. Default: True.
                 - ifadditive (bool): whether to use an additive model for mixing the probability
                     scores. Default: False.
    
    Output:
        - beam (beam_search.Beam): 'Beam' object, recording all the generated sequences.
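
    Example:
        A minimal usage sketch (illustrative; assumes 'src_vec', 'ee', 'vocab', 'LMModel',
        'word_list', and 'subvocab' have been prepared as described above):

            beam = gensummary_elmo(src_vec, ee, vocab, LMModel, word_list, subvocab,
                                   elmo_layer='avg', beam_width=10, max_step=20, alpha=0.1)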
        
    """
    device = 'cpu' if devid == -1 else f'cuda:{devid}'

    # Beam Search: initialization
    if begineos:
        beam = Beam(1,
                    vocab,
                    init_ids=[vocab.stoi['<eos>']],
                    device=device,
                    sim_score=0,
                    lm_score=0,
                    lm_state=None,
                    elmo_state=None,
                    align_loc=None)
    else:
        beam = Beam(1,
                    vocab,
                    init_ids=[None],
                    device=device,
                    sim_score=0,
                    lm_score=0,
                    lm_state=None,
                    elmo_state=None,
                    align_loc=None)

    # first step: start with 'beam_width_start' best matched words
    beam.beamstep(
        beam_width_start,
        beam.combscoreK,
        template_vec=template_vec,
        ee=ee,
        LMModel=LMModel,
        word_list=word_list,
        subvocab=subvocab,
        clustermask=clustermask,
        alpha=alpha_start,
        renorm=renorm,
        temperature=temperature,
        elmo_layer=elmo_layer,
        # normalized=True,
        # ifadditive=False,
        **kwargs)

    # run beam search, until all sentences hit <EOS> or max_step reached
    for s in range(max_step):
        print(f'beam step {s + 1} ' + '-' * 50 + '\n')
        beam.beamstep(
            beam_width,
            beam.combscoreK,
            template_vec=template_vec,
            ee=ee,
            LMModel=LMModel,
            word_list=word_list,
            subvocab=subvocab,
            clustermask=clustermask,
            mono=mono,
            alpha=alpha,
            renorm=renorm,
            temperature=temperature,
            stopbyLMeos=stopbyLMeos,
            elmo_layer=elmo_layer,
            # normalized=True,
            # ifadditive=False,
            **kwargs)
        # all beams reach termination
        if beam.endall:
            break

    return beam
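

# Illustrative input-preparation sketch (an assumption about intended usage, not part of
# the original module): 'subvocab' holds the vocabulary indices of the words in
# 'word_list', and 'clustermask' -- when supplied -- has one row per sub-vocabulary word
# over the full vocabulary. The minimal mask below marks each word only as its own cluster.
#
#     import torch
#     subvocab = torch.LongTensor([vocab.stoi[w] for w in word_list])
#     clustermask = torch.zeros(len(subvocab), len(vocab), dtype=torch.uint8)
#     clustermask[torch.arange(len(subvocab)), subvocab] = 1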