Example 1
import joblib
import torch

# config, dataset and EntityModel come from the surrounding project.
import config
import dataset
from model import EntityModel


def predict(sentence):
    # The label encoders fitted at training time are stored in meta.bin.
    meta_data = joblib.load("meta.bin")
    enc_pos = meta_data["enc_pos"]
    enc_tag = meta_data["enc_tag"]

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))
    tokenized_sentence = config.TOKENIZER.encode(sentence)

    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)

    # Dummy POS/tag labels: only the texts matter at inference time.
    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         pos=[[0] * len(sentence)],
                                         tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH), strict=False)
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

        # Map argmax class ids back to label strings and trim the padding
        # the dataset adds up to its fixed max length.
        geo_tag_sentence = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]
        pos_sentence = enc_pos.inverse_transform(
            pos.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]
        # print(geo_tag_sentence)
        print(pos_sentence)
        return tokenized_sentence, pos_sentence, geo_tag_sentence
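Example 1 expects a meta.bin file holding the label encoders fitted during training. Below is a minimal sketch of how such a file could be produced, assuming sklearn LabelEncoders and toy labels (the real labels come from the NER corpus):

import joblib
from sklearn.preprocessing import LabelEncoder

# Hypothetical training-time labels; the real ones come from the corpus.
pos_labels = ["NN", "VB", "DT", "NN"]
tag_labels = ["B-per", "I-per", "O", "B-geo"]

enc_pos = LabelEncoder().fit(pos_labels)
enc_tag = LabelEncoder().fit(tag_labels)

# Persist both encoders so predict() can map model outputs back to strings.
joblib.dump({"enc_pos": enc_pos, "enc_tag": enc_tag}, "meta.bin")

# inverse_transform reverses the integer encoding used during training.
print(enc_tag.inverse_transform([0, 1]))  # ['B-geo' 'B-per']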
Example 2
def predict(sentence):
    # num_tag, num_pos and enc_tag are expected from the enclosing scope
    # (e.g. loaded from the training metadata as in Example 1).
    tokenized_sentence = config.TOKENIZER.encode(sentence)

    sentence = sentence.split()
    #print(sentence)
    #print(tokenized_sentence)

    test_dataset = EntityDataset(texts=[sentence],
                                 pos=[[0] * len(sentence)],
                                 tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

        tags = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]

        i = 0
        names = []
        while i < len(tags):
            if tags[i] == "B-per":
                # Collect the run of consecutive B-per positions, guarding
                # against running past the end of the predictions.
                start = i
                while i < len(tags) and tags[i] == "B-per":
                    i += 1
                tokenized_name = tokenized_sentence[start:i]
                name = config.TOKENIZER.decode(tokenized_name)
                names.append(name)
            else:
                i += 1

        return names
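The span-grouping step above is easier to reason about as a standalone helper. Here is a minimal sketch with a hypothetical name, group_person_spans; the decode argument stands in for config.TOKENIZER.decode:

def group_person_spans(tags, token_ids, decode):
    """Collect consecutive 'B-per' positions and decode each run of
    token ids back into a name string, mirroring the loop in Example 2."""
    names, i = [], 0
    while i < len(tags):
        if tags[i] == "B-per":
            start = i
            while i < len(tags) and tags[i] == "B-per":
                i += 1
            names.append(decode(token_ids[start:i]))
        else:
            i += 1
    return names

# Toy usage with an identity "decoder" over plain strings:
tags = ["O", "B-per", "B-per", "O", "B-per"]
tokens = ["said", "John", "Smith", "met", "Mary"]
print(group_person_spans(tags, tokens, lambda ts: " ".join(ts)))
# ['John Smith', 'Mary']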
Example 3
    sentence = sentence.split()  # This is currently the starting point for my dataset
    print(sentence)
    print(len(sentence))
    print(tokenized_sentence)
    print(len(tokenized_sentence))

    print(config.TOKENIZER.decode(tokenized_sentence))
    print(len(config.TOKENIZER.decode(tokenized_sentence).split()))

    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)

        print(
            enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
            [:len(tokenized_sentence)])
        print(
            len(
                enc_tag.inverse_transform(
                    tag.argmax(2).cpu().numpy().reshape(-1))
                [:len(tokenized_sentence)]))
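The length prints in this example highlight that the whitespace split and the BERT encoding disagree: WordPiece adds [CLS]/[SEP] and splits rare words into '##' pieces. A short illustration, assuming the project's TOKENIZER is a Hugging Face BERT tokenizer:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sentence = "Angela Merkel visited Washington"

ids = tokenizer.encode(sentence)  # adds [CLS] and [SEP] by default
print(len(sentence.split()))      # 4 words
print(len(ids))                   # more ids: special tokens plus subwords
print(tokenizer.convert_ids_to_tokens(ids))
# e.g. ['[CLS]', 'angela', 'merkel', 'visited', 'washington', '[SEP]'],
# with rare words split into '##'-prefixed pieces.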
Example 4
    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)
    print(config.TOKENIZER.decode(tokenized_sentence))
    #print(config.TOKENIZER.convert_ids_to_tokens(tokenized_sentence))
    bert_tokens = config.TOKENIZER.convert_ids_to_tokens(tokenized_sentence)

    test_dataset = dataset.EntityDataset(
        texts=[sentence], 
        pos=[[0] * len(sentence)], 
        tags=[[0] * len(sentence)]
    )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=torch.device(device)))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

        resultTags = list(enc_tag.inverse_transform(
                tag.argmax(2).cpu().numpy().reshape(-1)
            )[:len(tokenized_sentence)])

        # print([i[0] for i in groupby(resultTags)])  # collapse repeated runs
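The commented-out line hints at collapsing runs of repeated tags with itertools.groupby; a minimal sketch of that idiom:

from itertools import groupby

resultTags = ["O", "O", "B-per", "B-per", "I-per", "O"]

# groupby yields one (key, group) pair per run of equal values;
# keeping only the keys collapses consecutive duplicates.
collapsed = [key for key, _ in groupby(resultTags)]
print(collapsed)  # ['O', 'B-per', 'I-per', 'O']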
Example 5
                indices = []
                if item == "B-per":
                    # Guard against indexing past the end of tags.
                    while i < len(tags) and tags[i] == "B-per":
                        indices.append(i)
                        i += 1
                    tokenized_name = tokenized_sentence[
                        indices[0]:indices[-1] + 1]
                    name = config.TOKENIZER.decode(tokenized_name)
                    names.append(name)
                indices = []
                i += 1

            resp = ','.join(names)

            return [resp]


if __name__ == "__main__":

    device = torch.device("cpu")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location='cpu'))
    model.to(device)

    # 2) `pack` it with required artifacts
    bento_svc = PyTorchModel()
    bento_svc.pack('ner', model)

    # 3) save your BentoService
    saved_path = bento_svc.save()
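The PyTorchModel service class is defined elsewhere in the project. A rough sketch of what it might look like, assuming the legacy BentoML 0.x API; the service body is illustrative only:

from bentoml import BentoService, api, artifacts, env
from bentoml.adapters import JsonInput
from bentoml.frameworks.pytorch import PytorchModelArtifact


@env(infer_pip_packages=True)
@artifacts([PytorchModelArtifact("ner")])
class PyTorchModel(BentoService):
    @api(input=JsonInput())
    def predict(self, parsed_json):
        # self.artifacts.ner is the packed EntityModel; a real service
        # would tokenize parsed_json["text"] and decode the predicted tags.
        return {"model": str(type(self.artifacts.ner))}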
Example 6
    num_tag = len(list(enc_tag.classes_))

    sentence = """
    President Trump has addressed the nation on US supremacy over the world.
    """
    tokenized_sentence = config.params["TOKENIZER"].encode(sentence)

    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)
    print(config.params["TOKENIZER"].convert_ids_to_tokens(tokenized_sentence))

    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)],
                                         O_tag_id=enc_tag.transform(["O"])[0])

    model = EntityModel(num_tag=num_tag)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(
        torch.load(config.params["MODEL_PATH"], map_location=device))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)

        print(
            enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
            [:len(tokenized_sentence)])
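The O_tag_id=enc_tag.transform(["O"])[0] argument gives the dataset the integer id of the outside tag, presumably to label special tokens and padding. A small demonstration of the sklearn LabelEncoder calls involved, with assumed tag classes:

from sklearn.preprocessing import LabelEncoder

enc_tag = LabelEncoder().fit(["B-geo", "B-per", "I-per", "O"])

# transform maps label strings to the integer ids the model was trained on;
# the dataset can use the "O" id for special tokens like [CLS]/[SEP].
o_tag_id = enc_tag.transform(["O"])[0]
print(o_tag_id)          # 3 with these classes (classes_ are sorted)
print(enc_tag.classes_)  # ['B-geo' 'B-per' 'I-per' 'O']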
Example 7
def predict(sentence):
    meta_data = joblib.load(config.METADATA_PATH)
    enc_tag = meta_data["enc_tag"]

    num_tag = len(list(enc_tag.classes_))

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    # config.tok is assumed to be a fast tokenizer whose encode() returns an
    # encoding object exposing the token strings via .tokens.
    ofToken = config.tok
    offSets = ofToken.encode(sentence)
    sentence = sentence.split()
    text = offSets.tokens

    print(sentence)
    print(tokenized_sentence)
    print(text)

    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)
        decodedText = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]
        print(decodedText)
        # print(config.TOKENIZER.encode())
    # Strip the [CLS]/[SEP] special tokens and their predicted tags.
    text = text[1:-1]
    decodedText = decodedText[1:-1]

    # Glue '##' subword pieces back onto the preceding word, recording the
    # index pairs that were merged so their tags can be reconciled below.
    finalText = ''
    listNum = []
    for i, tex in enumerate(text):
        if tex.startswith('##'):
            finalText = finalText + tex[2:]
            listNum.append([(i - 1), i])
        else:
            finalText = finalText + ' ' + tex
    if len(listNum) > 0:
        listNum.append([0, 0])
        print(f'finalText {finalText}')
        print(f'listNum {listNum}')
        finalNum = []

        for eachListNum in range(len(listNum) - 1):
            if listNum[eachListNum][1] == listNum[eachListNum + 1][0]:
                tempList = listNum[eachListNum]
                tempList.extend(listNum[eachListNum + 1])
                finalNum.append(tempList)
            else:
                finalNum.append(listNum[eachListNum])

        finalNum = [list(set(i)) for i in finalNum]

        finalNumList = [j for i in finalNum for j in i]
        print(f'finalNum {finalNum}')

        # Guard: finalNum holds a single span when only one merge occurred,
        # in which case finalNum[-2] would raise an IndexError.
        if len(finalNum) > 1:
            for i in finalNum[-2]:
                if i in finalNum[-1]:
                    finalNum = finalNum[:-1]
                    break

        finalIntent = []
        for i in range(len(decodedText)):
            if not i in finalNumList:
                finalIntent.append(decodedText[i])
            else:
                # index = (eachList if i == eachList[0] else False for enu, eachList in enumerate(finalNum))
                index = []
                for enu, eachList in enumerate(finalNum):
                    if i == eachList[0]:
                        index = eachList
                if index:
                    tempToken = decodedText[index[0]:(index[-1] + 1)]
                    print(f'temp token {tempToken}')
                    tempToken = list(set(tempToken))
                    if len(tempToken) > 1:
                        if 'O' in tempToken:
                            tempToken = ' '.join(tempToken)
                            tempToken = tempToken.replace("O",
                                                          '').strip().split()
                    tempToken = tempToken[-1]
                    finalIntent.append(tempToken)
    else:
        finalText = ' '.join(text)
        finalIntent = decodedText

    intentDict = {}

    for i, inte in enumerate(finalIntent):
        if inte != 'O':
            intentDict[finalText.strip().split(' ')[i]] = inte

    # Drop the 'O' (outside) tags, keeping only entity labels.
    withOutZeroList = ' '.join(finalIntent)
    withOutZeroList = withOutZeroList.replace('O', '').strip().split()

    return withOutZeroList, intentDict
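The finalText loop in this example is re-assembling whole words from WordPiece pieces. A compact standalone version of that merge, with a hypothetical name:

def merge_wordpieces(pieces):
    """Rebuild whole words from WordPiece tokens: a '##' piece is glued
    onto the previous word, anything else starts a new word. This is a
    simplified version of the finalText loop in Example 7."""
    words = []
    for piece in pieces:
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return words

print(merge_wordpieces(["angela", "mer", "##kel", "visited", "wash", "##ington"]))
# ['angela', 'merkel', 'visited', 'washington']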