Ejemplo n.º 1
0
 def rank_by_spanbert(phrase_cand, sgs, drug_formal):
     """Rank candidate phrases by masked-LM score accumulated over sentences.

     A BERT masked-LM and its tokenizer are loaded from
     ``data/BERT_model_reddit`` and wrapped in ``FitBert``. Every sentence in
     ``sgs`` that mentions one of ``drug_formal`` (or the literal ``'drug'``)
     is scored against ``phrase_cand`` plus four fixed extra phrases. The
     scores of the (at most) 50 best candidates per sentence are summed per
     candidate across sentences.

     Args:
         phrase_cand: list of candidate phrases to rank.
         sgs: sized iterable of masked sentences; a ``tqdm`` progress bar is
             shown when it has 10 or more items.
         drug_formal: list of formal drug names. They select which sentences
             are scored and are themselves excluded from the result.

     Returns:
         A pair ``(out, out_tuple)``: ``out`` is the list of surviving
         candidates sorted by descending accumulated score, ``out_tuple`` is
         the same ranking as ``[candidate, score]`` pairs.

     Note:
         The stop-word list is loaded once up front, so the NLTK stop-word
         data must be available even when no sentence ends up being scored.
     """
     from transformers import BertForMaskedLM, BertTokenizer
     bert_tokenizer = BertTokenizer.from_pretrained(
         'data/BERT_model_reddit/vocab.txt')
     bert_model = BertForMaskedLM.from_pretrained(
         'data/BERT_model_reddit').to(device)
     fb = FitBert(model=bert_model,
                  tokenizer=bert_tokenizer,
                  mask_token='[MASK]')

     # Loop invariants, built once instead of per sentence / per word.
     extra_cand = ['cbd oil', 'hash oil', 'charlie horse', 'lunch money']
     trigger_terms = drug_formal + ['drug']
     # Exact-match exclusions: stop words, the formal drug names and the
     # generic words 'drug' / 'drugs'.
     excluded = set(stopwords.words('english'))
     excluded.update(drug_formal)
     excluded.update(('drug', 'drugs'))

     mlm_score = defaultdict(float)
     sentences = sgs if len(sgs) < 10 else tqdm(sgs)
     for sgs_i in sentences:
         # Only sentences that mention a drug term are informative.
         if not any(term in sgs_i for term in trigger_terms):
             continue
         # NOTE(review): rank_multi is assumed to return (words, scores) as
         # two parallel sequences, best candidate first -- confirm in FitBert.
         ranked = fb.rank_multi(sgs_i, phrase_cand + extra_cand)
         words, raw_scores = ranked[0], ranked[1]

         # Scale by the best raw score (computed once), then softmax.
         if len(raw_scores):
             peak = max(raw_scores)
             scores = [s / peak for s in raw_scores]
         else:
             scores = []
         scores = fb.softmax(torch.tensor(scores).unsqueeze(0)).tolist()[0]

         # Keep at most the first 50 candidates of this sentence.
         top_words = [[w, p] for w, p in zip(words[:50], scores)]
         for word, prob in top_words:
             # Substring test on purpose (unchanged behaviour): drops single
             # punctuation characters and the empty string.
             if word in string.punctuation:
                 continue
             if word in excluded:
                 continue
             # A leading '##' marks a WordPiece continuation fragment,
             # i.e. not a standalone word.
             if word.startswith('##'):
                 continue
             mlm_score[word] += prob
         print(sgs_i)
         print([pair[0] for pair in top_words[:20]])

     out = sorted(mlm_score, key=mlm_score.get, reverse=True)
     out_tuple = [[word, mlm_score[word]] for word in out]
     return out, out_tuple
Ejemplo n.º 2
0
# Demo script: compare FitBert.rank_multi with FitBert.new_rank_multi.
#
# Supported models at the moment: bert-large-uncased and
# distilbert-base-uncased. Constructing FitBert is slow and keeps a full
# BERT model in memory.
fb = FitBert()

# The bare string below is a disabled single-option example (rank / fitb);
# it is never executed.
"""
masked_string = "Why ***mask***, you're looking ***mask*** today!"
options = ['buff', 'handsome', 'strong']

ranked_options = fb.rank(masked_string, options=options)
print(ranked_options)
# >>> ['handsome', 'strong', 'buff']
# or
filled_in = fb.fitb(masked_string, options=options)
# >>> "Why Bert, you're looking handsome today!"

print(filled_in)
"""

# Candidate words; only options1 is used by the calls below.
options1 = ['looking', 'catching', 'master', 'handsome']
options2 = ['rank', 'book', 'strong']

# Sentence containing two ***mask*** placeholders.
masked_string = "Hello  ***mask*** test ***mask*** today!"

# Rank the same candidates with both multi-mask methods and show the results.
filled_in = fb.rank_multi(masked_string, options=options1)
print("rank_multi", filled_in)

filled_in1 = fb.new_rank_multi(masked_string, words=options1)
print("new_rank_multi", filled_in1)