Example #1
                        # lowercase candidate scores at least as high: remember it
                        count = cos(v, self.model.wv[ps])
                        best_syn = ps
                try:
                    # also check the capitalized form of the candidate, which may
                    # have its own vector in the pretrained model
                    if ps.capitalize() in self.model.vocab:
                        cap = ps.capitalize()
                        if count <= cos(v, self.model.wv[cap]):
                            count = cos(v, self.model.wv[cap])
                            best_syn = ps  # still report the lowercase candidate
                except:
                    # capitalized form could not be looked up; skip it
                    pass
        return best_syn
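

# Note: the method above calls a helper cos(...) that is not defined in this
# snippet. A minimal numpy-based sketch consistent with how it is used here
# (a similarity score, where larger means more similar) could be:

import numpy as np   # would normally sit at the top of the file

def cos(v1, v2):
    # cosine similarity between two dense word vectors
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))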


if __name__ == "__main__":

    # At submission time, this program should run your best predictor (part 6).

    W2VMODEL_FILENAME = 'GoogleNews-vectors-negative300.bin.gz'
    predictor = Word2VecSubst(W2VMODEL_FILENAME)

    for context in read_lexsub_xml(sys.argv[1]):

        #print(context)  # useful for debugging
        #prediction = wn_simple_lesk_predictor(context)
        #print(prediction)
        prediction = predictor.improved_predictor(context)
        print("{}.{} {} :: {}".format(context.lemma, context.pos, context.cid,
                                      prediction))
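
# Note: the Word2VecSubst class is only partially shown in this example; its
# constructor presumably loads the pretrained GoogleNews vectors named above.
# A minimal sketch of such a constructor, assuming gensim's KeyedVectors API
# (the attribute name `model` matches its use in the method above), might be:
#
#     def __init__(self, filename):
#         self.model = gensim.models.KeyedVectors.load_word2vec_format(
#             filename, binary=True)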
Example #2
    def __init__(self, filename, maxlen=80):
        # BertTokenizer is typically imported from the transformers library
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen
        # materialize the parsed lexsub contexts so they can be indexed repeatedly
        self.file = list(read_lexsub_xml(filename))
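
    # A hedged sketch (not part of the original example) of how such a wrapper
    # might turn one lexsub context into fixed-length BERT input ids with the
    # standard transformers tokenizer API. The `left_context`/`right_context`
    # attributes and the use of BERT's [MASK] token here are assumptions.
    def encode_context(self, context):
        text = ' '.join(context.left_context + ['[MASK]'] + context.right_context)
        return self.tokenizer.encode(text,
                                     add_special_tokens=True,
                                     max_length=self.maxlen,
                                     truncation=True,
                                     padding='max_length')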