Example #1
def predict(sentence):
    # Load the label encoders that were fitted and saved during training.
    meta_data = joblib.load("meta.bin")
    enc_pos = meta_data["enc_pos"]
    enc_tag = meta_data["enc_tag"]

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))
    tokenized_sentence = config.TOKENIZER.encode(sentence)

    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)

    # Placeholder POS/tag labels: only the tokens matter at inference time.
    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         pos=[[0] * len(sentence)],
                                         tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH), strict=False)
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

        geo_tag_sentence = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]
        pos_sentence = enc_pos.inverse_transform(
            pos.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]
        # print(geo_tag_sentence)
        print(pos_sentence)
        return tokenized_sentence, pos_sentence, geo_tag_sentence
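
A minimal usage sketch for the predict() helper above, assuming the project's config, dataset, and model modules are importable and that meta.bin and the trained checkpoint at config.MODEL_PATH already exist:

# Illustrative driver for predict(); it assumes meta.bin and the checkpoint at
# config.MODEL_PATH were produced by a prior training run.
if __name__ == "__main__":
    tokens, pos_labels, tag_labels = predict(
        "Hung uses the ADNI dataset for his research at the University."
    )
    # One POS label and one entity tag per token id in the encoded sentence.
    for token_id, pos_label, tag_label in zip(tokens, pos_labels, tag_labels):
        print(token_id, pos_label, tag_label)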
Example #2
    joblib.dump(meta_data, "meta.bin")

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        test_sentences,
        train_pos,
        test_pos,
        train_tag,
        test_tag
    ) = model_selection.train_test_split(sentences, pos, tag, random_state=42, test_size=0.1)

    train_dataset = dataset.EntityDataset(
        texts=train_sentences, pos=train_pos, tags=train_tag
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = dataset.EntityDataset(
        texts=test_sentences, pos=test_pos, tags=test_tag
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    
    sentence = """
    Hung uses the ADNI dataset for his research at the University.
    """
    tokenized_sentence = config.TOKENIZER.encode(sentence)

    sentence = sentence.split()  # This is currently the starting point for my dataset
    print(sentence)
    print(len(sentence))
    print(tokenized_sentence)
    print(len(tokenized_sentence))

    print(config.TOKENIZER.decode(tokenized_sentence))
    print(len(config.TOKENIZER.decode(tokenized_sentence).split()))

    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)

        print(
            enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
            [:len(tokenized_sentence)])
def run():
    sentences, pos, tag, enc_pos, enc_tag = utils.process_data(config.DATA_FILE)

    meta_data = {
        "enc_pos": enc_pos,
        "enc_tag": enc_tag
    }

    joblib.dump(meta_data, "meta.bin")

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        test_sentences,
        train_pos,
        test_pos,
        train_tag,
        test_tag
    ) = model_selection.train_test_split(sentences, pos, tag, random_state=42, test_size=0.1)

    train_dataset = dataset.EntityDataset(
        texts=train_sentences, pos=train_pos, tags=train_tag
    )

    test_dataset = dataset.EntityDataset(
        texts=test_sentences, pos=test_pos, tags=test_tag
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cuda")

    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    # Bias and LayerNorm parameters are kept out of weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_param = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_param, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_loss = np.inf

    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss = engine.eval_fn(test_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
        if test_loss < best_loss:
            # Keep only the checkpoint with the best validation loss seen so far.
            torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
            best_loss = test_loss
    
    train_sentences, train_tag, train_classify_tag, enc_tag, classify_enc_tag = read_prof_bilou(config.params["TRAINING_FILE"])
    test_sentences, test_tag, test_classify_tag, _, _ = read_prof_bilou(config.params["VALIDATION_FILE"])

    meta_data = {
        "enc_tag": enc_tag,
        "classify_enc_tag": classify_enc_tag
    }

    joblib.dump(meta_data, "meta.bin")

    num_tag = len(list(enc_tag.classes_))
    num_classify_tag = len(list(classify_enc_tag.classes_))

    train_dataset = dataset.EntityDataset(
        texts=train_sentences, tags=train_tag, classification_tags=train_classify_tag,
        O_tag_id=enc_tag.transform(["O"])[0]
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.params["TRAIN_BATCH_SIZE"], num_workers=4
    )

    valid_dataset = dataset.EntityDataset(
        texts=test_sentences, tags=test_tag, classification_tags=test_classify_tag,
        O_tag_id=enc_tag.transform(["O"])[0]
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.params["VALID_BATCH_SIZE"], num_workers=1
    )
    
    model = EntityModel(num_tag=num_tag, num_classify_tag=num_classify_tag)
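
The run() function above delegates each epoch to engine.train_fn and engine.eval_fn, which are not shown in these snippets. A minimal sketch of what such helpers could look like, assuming the model's forward pass returns its loss as the last output (matching the tag, pos, _ = model(**data) unpacking used in the prediction examples) and that each helper returns the mean loss over its loader:

import torch


# Hedged sketch of the engine.train_fn / engine.eval_fn helpers called by run();
# the real engine module is not included here, so the loss handling is an assumption.
def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    total_loss = 0.0
    for data in data_loader:
        data = {k: v.to(device) for k, v in data.items()}
        optimizer.zero_grad()
        *_, loss = model(**data)  # loss is assumed to be the model's last output
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for data in data_loader:
            data = {k: v.to(device) for k, v in data.items()}
            *_, loss = model(**data)
            total_loss += loss.item()
    return total_loss / len(data_loader)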
Example #6
    titles, tags, enc_tag = process_data(config.TRAINING_FILE)

    meta_data = {
        'enc_tag': enc_tag
    }
    joblib.dump(meta_data, 'meta.bin')
    num_tag = len(list(enc_tag.classes_))
    
    (
        train_titles,
        valid_titles,
        train_tag,
        valid_tag
    ) = model_selection.train_test_split(titles, tags, random_state=42, test_size=0.1)

    train_dataset = dataset.EntityDataset(texts=train_titles, tags=train_tag)
    print(train_dataset[0])
    print(train_dataset[1])
    print(train_dataset[9])
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )
  
    valid_dataset = dataset.EntityDataset(texts=valid_titles, tags=valid_tag)
    print(valid_dataset[3])
    print(valid_dataset[8])
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Example #7
def read_data(name):
    with open(f'../data/{name}/train.p', 'rb') as f:
        train_data = pickle.load(f)
    with open(f'../data/{name}/test.p', 'rb') as f:
        test_data = pickle.load(f)
    with open(f'../data/{name}/valid.p', 'rb') as f:
        valid_data = pickle.load(f)
    with open(f'../data/{name}/encoder.p', 'rb') as f:
        tag_enc = pickle.load(f)

    return train_data, test_data, valid_data, len(list(tag_enc.classes_))

train_data, test_data, valid_data, n_tags = read_data(config.DATASET)

train_dataset = dataset.EntityDataset(train_data)
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=1)

test_dataset = dataset.EntityDataset(test_data)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=1)

valid_dataset = dataset.EntityDataset(valid_data)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=1)

model = EntityModel_crf(num_tag=n_tags)
#model = torch.load('model.pt')
print(model)
model.to(config.DEVICE)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
Example #8
import joblib
import seaborn as sns
import torch
from tqdm import tqdm

import config
import dataset
from model import EntityModel
from utils import process_data_class

if __name__ == "__main__":

    meta_data = joblib.load("meta.bin")
    enc_tag = meta_data["enc_tag"]
    idx2class = list(enc_tag.classes_)
    num_tag = len(list(enc_tag.classes_))
    sentences, tags, enc_label_ = process_data_class(config.TESTING_FILE,
                                                     enc_tag)
    # Use a distinct name so the `dataset` module is not shadowed by the instance.
    test_dataset = dataset.EntityDataset(texts=sentences, tags=tags)
    data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)
    final_loss = 0
    model.eval()
    y_pred_list = []

    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        tag, _ = model(**data)
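
The snippet above is cut off before y_pred_list or seaborn are actually used. A hedged sketch of how the evaluation could be finished, collecting predicted and gold tag ids and drawing a confusion matrix; it assumes each batch dict exposes the gold labels under a "target_tag" key (the exact name depends on this project's EntityDataset) and adds sklearn as an extra dependency:

    # Hedged completion sketch: gather predicted and gold tag ids over the loader,
    # then plot a confusion matrix with the label names from the encoder.
    from sklearn.metrics import confusion_matrix

    y_pred_list, y_true_list = [], []
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            tag, _ = model(**data)
            y_pred_list.extend(tag.argmax(2).cpu().numpy().reshape(-1).tolist())
            # "target_tag" is an assumed key name for the gold labels in the batch.
            y_true_list.extend(data["target_tag"].cpu().numpy().reshape(-1).tolist())

    cm = confusion_matrix(y_true_list, y_pred_list, labels=list(range(num_tag)))
    sns.heatmap(cm, xticklabels=idx2class, yticklabels=idx2class)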
Example #9
    meta_data = joblib.load('meta.bin')
    # enc_pos = meta_data['enc_pos']
    enc_tag = meta_data['enc_tag']
    # num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    title = 'police'

    encoded_title = config.TOKENIZER.encode(title)
    tokenized_title = config.TOKENIZER.tokenize(title)
    # title = title.split()
    # print(title)
    # print(encoded_title)
    # print(tokenized_title)

    test_dataset = dataset.EntityDataset(texts=[[title]],
                                         tags=[[0] * len(title)])
    print(test_dataset[0])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)
    print(title)
    print(encoded_title)
    print(tokenized_title)
    print(
        enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
        [:len(encoded_title)])
Example #10
    enc_tag = meta_data["enc_tag"]

    num_tag = len(list(enc_tag.classes_))

    sentence = """
    President Trump has addressed the nation on US supremacy over the world.
    """
    tokenized_sentence = config.params["TOKENIZER"].encode(sentence)

    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)
    print(config.params["TOKENIZER"].convert_ids_to_tokens(tokenized_sentence))

    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)],
                                         O_tag_id=enc_tag.transform(["O"])[0])

    model = EntityModel(num_tag=num_tag)
    # Pick the device up front so the checkpoint and the input tensors always end
    # up on the same device, even when CUDA is unavailable.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load(config.params["MODEL_PATH"], map_location=device))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)

        print(
            enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
            [:len(tokenized_sentence)])
Example #11
def predic(sentence):
    meta_data = joblib.load(config.METADATA_PATH)
    enc_tag = meta_data["enc_tag"]

    num_tag = len(list(enc_tag.classes_))

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    ofToken = config.tok
    offSets = ofToken.encode(sentence)
    sentence = sentence.split()
    text = offSets.tokens

    print(sentence)
    print(tokenized_sentence)
    print(text)

    test_dataset = dataset.EntityDataset(texts=[sentence],
                                         tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)
        decodedText = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]
        print(decodedText)
        # print(config.TOKENIZER.encode())
    # Drop the special tokens (typically [CLS] and [SEP]) added at either end.
    text = text[1:-1]
    decodedText = decodedText[1:-1]

    finalText = ''
    listNum = []
    # Merge WordPiece continuation tokens ('##...') back into whole words and
    # record which neighbouring indices were joined.
    for i, tex in enumerate(text):
        if tex.startswith('##'):
            finalText = finalText + tex[2:]
            listNum.append([(i - 1), i])
        else:
            finalText = finalText + ' ' + tex
    if len(listNum) > 0:
        listNum.append([0, 0])
        print(f'finalText {finalText}')
        print(f'listNum {listNum}')
        finalNum = []

        for eachListNum in range(len(listNum) - 1):
            if listNum[eachListNum][1] == listNum[eachListNum + 1][0]:
                tempList = listNum[eachListNum]
                tempList.extend(listNum[eachListNum + 1])
                finalNum.append(tempList)
            else:
                finalNum.append(listNum[eachListNum])

        finalNum = [list(set(i)) for i in finalNum]

        finalNumList = [j for i in finalNum for j in i]
        print(f'finalNum {finalNum}')

        for i in finalNum[-2]:
            if i in finalNum[-1]:
                finalNum = finalNum[:-1]
                break

        finalIntent = []
        for i in range(len(decodedText)):
            if i not in finalNumList:
                finalIntent.append(decodedText[i])
            else:
                # index = (eachList if i == eachList[0] else False for enu, eachList in enumerate(finalNum))
                index = []
                for enu, eachList in enumerate(finalNum):
                    if i == eachList[0]:
                        index = eachList
                if index:
                    tempToken = decodedText[index[0]:(index[-1] + 1)]
                    print(f'temp token {tempToken}')
                    tempToken = list(set(tempToken))
                    if len(tempToken) > 1:
                        if 'O' in tempToken:
                            tempToken = ' '.join(tempToken)
                            tempToken = tempToken.replace("O",
                                                          '').strip().split()
                    tempToken = tempToken[-1]
                    finalIntent.append(tempToken)
    else:
        finalText = ' '.join(text)
        finalIntent = decodedText

    intentDict = {}

    for i, inte in enumerate(finalIntent):
        if not inte == 'O':
            intentDict[finalText.strip().split(' ')[i]] = inte

    withOutZeroList = ' '.join(finalIntent)
    withOutZeroList = withOutZeroList.replace('O', '').strip().split()

    return withOutZeroList, intentDict
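
A short usage sketch for predic() above, assuming the same config, dataset, and model modules as the earlier prediction snippets and a trained checkpoint at config.MODEL_PATH:

# Illustrative call to the predic() helper above; it requires meta.bin and the
# trained checkpoint at config.MODEL_PATH from a prior training run.
labels, entity_map = predic(
    "President Trump has addressed the nation on US supremacy over the world."
)
print(labels)      # predicted labels with the "O" tag filtered out
print(entity_map)  # {word: label} for every word that received a non-"O" label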