Example #1
def _word_seg(text, word_seg_crf_model):
    """Segment raw text into words with a trained CRF word-segmentation model."""
    sentence = tokenize(text).split()
    X_test = [sent2features(sentence, 'raw')]

    start = time.time()
    IOBtag = word_seg_crf_model.predict(X_test)
    print("Elapsed time for segmenting: " + str(time.time() - start))

    output = []
    for tag, token in zip(IOBtag[0], sentence):
        if tag == "I_W":
            # I_W: this token continues the previous word, so append it there
            output[-1] = output[-1] + u" " + token
        else:
            # B_W (or any other tag): this token starts a new word
            output.append(token)

    return output
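# --- Usage sketch (not part of the original snippet) ---
# Assumes `tokenize`, `sent2features` and a pickled CRF word-segmentation
# model exist as in the other examples; the path "./models/ws.pkl" and the
# input string are illustrative assumptions only.
import pickle

with open("./models/ws.pkl", "rb") as f:
    ws_model = pickle.load(f)

# Each element of the result is one segmented word (multi-syllable words
# keep their internal spaces, as produced by _word_seg above).
print(_word_seg("example input sentence", ws_model))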
Example #2
    # args = parse_argument()

    # MODEL_NAME = args.n + pkl
    MODEL_NAME = "ws.pkl"
    model_path = "./models/"
    print("=======================================")
    print("Reading testing data ....")
    test_sents = []
    filename_test = "test.txt"

    with open(filename_test, "r") as ftrue:
        raw_data = ftrue.read().strip("\n").strip(" ").split("\n\n")
        for sent in raw_data:
            # One "token<TAB>label" line per token; unify label spellings on the
            # raw line (B-W/I-W -> B_W/I_W, o -> O) before splitting on tabs
            test_sents.append([
                xxx.replace("B-W", "B_W").replace("I-W", "I_W").replace("o", "O").split("\t")
                for xxx in sent.split("\n")
            ])

    X_test = [sent2features(sent, 'test') for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]
    y_test = [sent2labels(sent) for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]

    print("Done!")
    print("=======================================")
    print("Load the model ...")
    # Load the pickled CRF model; the context manager closes the file afterwards
    with open(os.path.join(model_path, MODEL_NAME), "rb") as fmodel:
        model = pickle.load(fmodel)
    print(model)
    print("Done loading", MODEL_NAME)

    print("=======================================")
    print("Testing ...")
    score = model.score(X_test, y_test)
    y_pred = model.predict(X_test)

    sum_accuracy = 0
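    # --- Evaluation sketch (an illustration, not the original continuation,
    # which is cut off above) ---
    # One plausible way to turn y_pred / y_test into a tag-level accuracy.
    n_correct = 0
    n_total = 0
    for pred_sent, true_sent in zip(y_pred, y_test):
        for pred_tag, true_tag in zip(pred_sent, true_sent):
            n_correct += int(pred_tag == true_tag)
            n_total += 1
    print("Tag-level accuracy:", n_correct / n_total)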
Example #3
if __name__ == "__main__":
    args = parse_argument()

    # Model name, duplication check
    MODEL_NAME = checkModelFileExistAndCreateNewModelName(
        args.m.replace(".pkl", "") + ".pkl")

    reader = Reader("/data/train")
    reader_test = Reader("/data/test")

    test_sents = reader_test.read('10000')  # 10000 is the dataset
    train_sents = reader.read('10000')  # 10000 is the dataset

    # Keep only non-empty sentences whose first token is a non-empty string
    X_train = [
        sent2features(sent, "train") for sent in train_sents
        if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]
    ]
    y_train = [
        sent2labels(sent) for sent in train_sents
        if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]
    ]

    X_test = [
        sent2features(sent, "test") for sent in test_sents
        if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]
    ]
    y_test = [
        sent2labels(sent) for sent in test_sents
        if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]
    ]
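    # --- Training sketch (an assumption, not the original continuation) ---
    # The pickled models in the other examples expose predict()/score()/classes_,
    # which matches a scikit-learn-style CRF such as sklearn_crfsuite.CRF;
    # the hyperparameters below are illustrative only.
    import pickle
    import sklearn_crfsuite

    crf = sklearn_crfsuite.CRF(
        algorithm="lbfgs",               # L-BFGS gradient-based training
        c1=0.1,                          # L1 regularisation strength
        c2=0.1,                          # L2 regularisation strength
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(X_train, y_train)
    print("Held-out score:", crf.score(X_test, y_test))

    with open(MODEL_NAME, "wb") as fmodel:
        pickle.dump(crf, fmodel)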
Example #4
    filename_true = "/home/enamoria/Desktop/workspace/evaluation/ws/data/train_with_dict.txt"
    train_sents = read_data(filename_true)

    dev_sents = read_data(
        "/home/enamoria/Desktop/workspace/evaluation/ws/data/dev.txt")
    print(len(train_sents), len(dev_sents))
    train_sents.extend(dev_sents)
    print(len(train_sents))

    # with open(filename_true, "r") as ftrue:
    #     raw_data = ftrue.read().strip("\n").strip(" ").split("\n\n")
    #     for sent in raw_data:
    #         train_sents.append([xxx.replace("B-W", "B_W").replace("I-W", "I_W").replace("o", "O").split("\t") for xxx in sent.split("\n")])

    # Feature extracting from templates
    # (the filter "len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]" is left disabled here)
    X_train = [sent2features(sent, mode="train") for sent in train_sents]
    y_train = [sent2labels(sent) for sent in train_sents]

    X_dev = [sent2features(sent, mode="dev") for sent in dev_sents]
    y_dev = [sent2labels(sent) for sent in dev_sents]

    # X_test = [sent2features(sent, 'test') for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]
    # y_test = [sent2labels(sent) for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]

    # X_test = [sent2features(sent) for sent in test_sents]
    # y_test = [sent2labels(sent) for sent in test_sents]

    # Training ...
Example #5
    return [y[1] for y in sent]


def sent2tokens(sent):
    """Return the token (first column) of each row in the sentence."""
    return [y[0] for y in sent]


if __name__ == "__main__":
    reader = Reader("/data/test")
    test_sents = reader.read('10000')

    print(test_sents[1])
    MODEL_NAME = "pos_8.pkl"

    X_test = [
        sent2features(sent, 'test') for sent in test_sents
        if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]
    ]
    y_test = [
        sent2labels(sent) for sent in test_sents
        if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]
    ]

    # Load the pickled model; the context manager closes the file afterwards
    with open(MODEL_NAME.replace(".pkl", "") + ".pkl", 'rb') as fmodel:
        loaded_model = pickle.load(fmodel)
    print("Loaded", MODEL_NAME, loaded_model)
    print(loaded_model.classes_)

    sum = 0.0

    y_pred = loaded_model.predict(X_test)
Example #6

if __name__ == "__main__":
    filename_true = "train.txt"
    # filename_pred = "pred.iob"
    MODEL_NAME = checkModelFileExistAndCreateNewModelName(MODEL_NAME)

    # Read data
    train_sents = []
    with open(filename_true, "r") as ftrue:
        raw_data = ftrue.read().strip("\n").strip(" ").split("\n\n")
        for sent in raw_data:
            train_sents.append([xxx.split("\t") for xxx in sent.split("\n")])

    # Feature extracting from templates
    # (the filter "len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]" is left disabled here)
    X_train = [sent2features(sent) for sent in train_sents]
    y_train = [sent2labels(sent) for sent in train_sents]

    # print(X_train)
    # print(y_train)

    # X_test = [sent2features(sent, 'test') for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]
    # y_test = [sent2labels(sent) for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]

    # X_test = [sent2features(sent) for sent in test_sents]
    # y_test = [sent2labels(sent) for sent in test_sents]

    # Training ...
    print("=======================================")