Example #1
0
def predict(sentence):
    """Predict a POS label and an entity label for every token of ``sentence``.

    The label encoders are read from ``meta.bin`` and the model weights from
    ``config.MODEL_PATH``. The sentence is encoded twice: once with
    ``config.TOKENIZER.encode`` (to know how many token positions are real)
    and once, whitespace-split, through ``dataset.EntityDataset`` to build
    the model inputs.

    Args:
        sentence: Raw input text.

    Returns:
        A ``(tokenized_sentence, pos_sentence, geo_tag_sentence)`` tuple:
        the token ids produced by ``config.TOKENIZER.encode`` and the decoded
        POS / entity labels, both cut to ``len(tokenized_sentence)`` entries.
    """
    meta_data = joblib.load("meta.bin")
    enc_pos = meta_data["enc_pos"]
    enc_tag = meta_data["enc_tag"]

    num_pos = len(enc_pos.classes_)
    num_tag = len(enc_tag.classes_)
    tokenized_sentence = config.TOKENIZER.encode(sentence)

    words = sentence.split()
    print(words)
    print(tokenized_sentence)

    # The dataset requires label lists; zeros are placeholders at inference.
    placeholder = [0] * len(words)
    test_dataset = dataset.EntityDataset(texts=[words],
                                         pos=[placeholder],
                                         tags=[list(placeholder)])

    # Fall back to the CPU instead of crashing when CUDA is unavailable, and
    # remap the checkpoint tensors onto whichever device was selected.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(
        torch.load(config.MODEL_PATH, map_location=device), strict=False)
    model.to(device)
    # Inference mode: turns off train-only layers (e.g. dropout), if any, so
    # repeated calls give the same prediction.
    model.eval()

    with torch.no_grad():
        # Each item is a dict of tensors; add a batch dimension of size 1.
        data = {
            key: value.to(device).unsqueeze(0)
            for key, value in test_dataset[0].items()
        }
        tag, pos, _ = model(**data)

    # Keep only the positions that belong to real tokens.
    n_tokens = len(tokenized_sentence)
    geo_tag_sentence = enc_tag.inverse_transform(
        tag.argmax(2).cpu().numpy().reshape(-1))[:n_tokens]
    pos_sentence = enc_pos.inverse_transform(
        pos.argmax(2).cpu().numpy().reshape(-1))[:n_tokens]
    print(pos_sentence)
    return tokenized_sentence, pos_sentence, geo_tag_sentence
Example #2
0
def predict(sentence):
    """Return the person names the entity model detects in ``sentence``.

    The sentence is tagged token by token; every maximal run of consecutive
    ``"B-per"`` tags is decoded back to text with ``config.TOKENIZER.decode``
    and collected as one name.

    Relies on the module-level ``num_tag``, ``num_pos`` and ``enc_tag``.

    Args:
        sentence: Raw input text.

    Returns:
        A list of decoded name strings, in order of appearance (empty when no
        ``"B-per"`` tag is predicted).
    """
    tokenized_sentence = config.TOKENIZER.encode(sentence)
    words = sentence.split()

    # The dataset requires label lists; zeros are placeholders at inference.
    test_dataset = EntityDataset(texts=[words],
                                 pos=[[0] * len(words)],
                                 tags=[[0] * len(words)])

    # Fall back to the CPU instead of crashing when CUDA is unavailable, and
    # remap the checkpoint tensors onto whichever device was selected.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))
    model.to(device)
    # Inference mode: turns off train-only layers (e.g. dropout), if any.
    model.eval()

    with torch.no_grad():
        # Each item is a dict of tensors; add a batch dimension of size 1.
        data = {
            key: value.to(device).unsqueeze(0)
            for key, value in test_dataset[0].items()
        }
        tag, _, _ = model(**data)

    tags = enc_tag.inverse_transform(
        tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]

    names = []
    total = len(tags)
    i = 0
    while i < total:
        if tags[i] != "B-per":
            i += 1
            continue
        start = i
        # Bounds-check while extending the run: a run that reaches the final
        # tag must end cleanly instead of indexing past the end of ``tags``.
        while i < total and tags[i] == "B-per":
            i += 1
        names.append(config.TOKENIZER.decode(tokenized_sentence[start:i]))

    return names
Example #3
0
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = dataset.EntityDataset(
        texts=test_sentences, pos=test_pos, tags=test_tag
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    
    #device = torch.device("cuda")
    device = torch.device("cpu")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
def run():
    """Train the entity model and keep the checkpoint with the lowest eval loss.

    Steps:
      1. Load sentences, POS labels, tags and their encoders via
         ``utils.process_data(config.DATA_FILE)`` and persist the encoders to
         ``meta.bin`` so inference can decode predictions later.
      2. Hold out 10% of the data (``random_state=42``) for evaluation.
      3. Optimise with AdamW (lr 3e-5); weight decay 0.001 is applied to every
         parameter except biases and LayerNorm parameters.
      4. After each epoch, save ``model.state_dict()`` to
         ``config.MODEL_SAVE_PATH`` whenever the evaluation loss improves.
    """
    sentences, pos, tag, enc_pos, enc_tag = utils.process_data(config.DATA_FILE)

    joblib.dump({"enc_pos": enc_pos, "enc_tag": enc_tag}, "meta.bin")

    num_pos = len(enc_pos.classes_)
    num_tag = len(enc_tag.classes_)

    (
        train_sentences,
        test_sentences,
        train_pos,
        test_pos,
        train_tag,
        test_tag,
    ) = model_selection.train_test_split(
        sentences, pos, tag, random_state=42, test_size=0.1
    )

    train_dataset = dataset.EntityDataset(
        texts=train_sentences, pos=train_pos, tags=train_tag
    )
    test_dataset = dataset.EntityDataset(
        texts=test_sentences, pos=test_pos, tags=test_tag
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    # Use the GPU when there is one, but do not crash on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)
    optimizer_param = [
        {"params": decayed, "weight_decay": 0.001},
        {"params": undecayed, "weight_decay": 0.0},
    ]

    # Total optimiser steps over all epochs; drives the linear LR decay.
    num_train_steps = int(
        len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS
    )
    optimizer = AdamW(optimizer_param, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_loss = np.inf

    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(
            train_data_loader, model, optimizer, device, scheduler
        )
        test_loss = engine.eval_fn(test_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
        # Checkpoint only when the held-out loss improves.
        if test_loss < best_loss:
            torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
            best_loss = test_loss
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.params["TRAIN_BATCH_SIZE"], num_workers=4
    )

    valid_dataset = dataset.EntityDataset(
        texts=test_sentences, tags=test_tag, classification_tags=test_classify_tag, O_tag_id= enc_tag.transform(["O"])[0]
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.params["VALID_BATCH_SIZE"], num_workers=1
    )
    
    model = EntityModel(num_tag=num_tag, num_classify_tag= num_classify_tag)
    device = torch.device("cuda" if config.params["CUDA"] else "cpu")
    model.to(device) #BioBERT is taking alot of space

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
Example #6
0
    num_tag = len(list(enc_tag.classes_))

    sentence = """
    President Trump has addressed the nation on US supremacy over the world.
    """
    tokenized_sentence = config.params["TOKENIZER"].encode(sentence)

    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)
    print(config.params["TOKENIZER"].convert_ids_to_tokens(tokenized_sentence))

    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)],
                                         O_tag_id=enc_tag.transform(["O"])[0])

    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.params["MODEL_PATH"]))
    device = torch.device("cuda")
    if torch.cuda.is_available(): model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)

        print(
            enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
            [:len(tokenized_sentence)])
Example #7
0
def predic(sentence):
    """Tag ``sentence`` with the entity model and return its non-'O' labels.

    The sentence is tokenised, run through ``EntityModel``, and the per-token
    predictions are decoded with the ``enc_tag`` encoder loaded from
    ``config.METADATA_PATH``. Tokens that start with ``'##'`` are glued back
    onto the preceding token so that each rebuilt word ends up with a single
    label.

    Args:
        sentence: Raw input text.

    Returns:
        A ``(withOutZeroList, intentDict)`` tuple: the list of labels left
        after 'O' is stripped out, and a dict mapping each rebuilt word whose
        label is not 'O' to that label.
    """
    meta_data = joblib.load(config.METADATA_PATH)
    enc_tag = meta_data["enc_tag"]

    num_tag = len(list(enc_tag.classes_))

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    # A second tokenizer object; its encode() result exposes the token
    # strings through `.tokens`, which the word re-assembly below needs.
    ofToken = config.tok
    offSets = ofToken.encode(sentence)
    sentence = sentence.split()
    text = offSets.tokens

    print(sentence)
    print(tokenized_sentence)
    print(text)

    # Tags are placeholders (all zeros); only the text matters at inference.
    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        # Move every tensor to the device and add a batch dimension of 1.
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)
        # Most likely class per position, decoded to label strings and cut to
        # the number of token ids.
        decodedText = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]
        print(decodedText)
        # print(config.TOKENIZER.encode())
    # Drop the first and last entries of both sequences — presumably the
    # special start/end tokens added by the tokenizer; TODO confirm.
    text = text[1:-1]
    decodedText = decodedText[1:-1]

    # Rebuild the text: '##' pieces are appended to the previous token without
    # a space, and each such piece records the index pair [previous, current].
    finalText = ''
    listNum = []
    for i, tex in enumerate(text):
        if tex.startswith('##'):
            finalText = finalText + tex[2:]
            listNum.append([(i - 1), i])
        else:
            finalText = finalText + ' ' + tex
    if len(listNum) > 0:
        # Sentinel so the pairwise loop below can always look one entry ahead.
        listNum.append([0, 0])
        print(f'finalText {finalText}')
        print(f'listNum {listNum}')
        finalNum = []

        # Chain pairs that share an index (a word split into 3+ pieces).
        for eachListNum in range(len(listNum) - 1):
            if listNum[eachListNum][1] == listNum[eachListNum + 1][0]:
                # tempList is the same list object as listNum[eachListNum],
                # so extend() also mutates that listNum entry in place.
                tempList = listNum[eachListNum]
                tempList.extend(listNum[eachListNum + 1])
                finalNum.append(tempList)
            else:
                finalNum.append(listNum[eachListNum])

        # De-duplicate the indices inside every group.
        # NOTE(review): set() gives no ordering guarantee, yet eachList[0] and
        # index[-1] below are used as the group's first/last index — confirm.
        finalNum = [list(set(i)) for i in finalNum]

        # Flat list of every index that belongs to some group; built before
        # finalNum is trimmed below.
        finalNumList = [j for i in finalNum for j in i]
        print(f'finalNum {finalNum}')

        # Drop the last group when it overlaps the one before it.
        # NOTE(review): finalNum[-2] raises IndexError when finalNum holds a
        # single group (e.g. exactly one '##' piece in the sentence).
        for i in finalNum[-2]:
            if i in finalNum[-1]:
                finalNum = finalNum[:-1]
                break

        finalIntent = []
        for i in range(len(decodedText)):
            if not i in finalNumList:
                # Token is a whole word on its own: keep its label unchanged.
                finalIntent.append(decodedText[i])
            else:
                # index = (eachList if i == eachList[0] else False for enu, eachList in enumerate(finalNum))
                # Emit a label only at the position that is a group's first
                # element; other positions of the group append nothing, so
                # each rebuilt word contributes one label.
                index = []
                for enu, eachList in enumerate(finalNum):
                    if i == eachList[0]:
                        index = eachList
                if index:
                    tempToken = decodedText[index[0]:(index[-1] + 1)]
                    print(f'temp token {tempToken}')
                    # Distinct labels predicted across the word's pieces.
                    tempToken = list(set(tempToken))
                    if len(tempToken) > 1:
                        if 'O' in tempToken:
                            # Prefer a real label over 'O' when both occur.
                            # NOTE(review): replace("O", '') deletes every 'O'
                            # character, including inside other label names.
                            tempToken = ' '.join(tempToken)
                            tempToken = tempToken.replace("O",
                                                          '').strip().split()
                    # With several labels left, the pick follows the set's
                    # iteration order.
                    tempToken = tempToken[-1]
                    finalIntent.append(tempToken)
    else:
        # No word pieces: tokens and labels already line up one-to-one.
        finalText = ' '.join(text)
        finalIntent = decodedText

    intentDict = {}

    # Map each rebuilt word to its label, skipping 'O'; a word that occurs
    # more than once keeps the label of its last occurrence.
    for i, inte in enumerate(finalIntent):
        if not inte == 'O':
            intentDict[finalText.strip().split(' ')[i]] = inte

    # NOTE(review): replace('O', '') removes every 'O' character, not only
    # standalone 'O' labels — labels containing 'O' would be altered.
    withOutZeroList = ' '.join(finalIntent)
    withOutZeroList = withOutZeroList.replace('O', '').strip().split()

    return withOutZeroList, intentDict