コード例 #1
0
def get_train_teem_data(question_len=20):
    """Build word-level training data for tagging the subject span.

    Every question returned by ``readTrainData()`` is lower-cased and split
    on whitespace into tokens. The subject is located by searching the
    space-stripped subject inside the space-stripped question; the character
    offset of the match is then mapped back to a token index. Tokens covered
    by the subject get label 1, all others 0.

    Args:
        question_len: Length of each label vector, and the ``maxlen`` handed
            to ``padding`` for both outputs.

    Returns:
        A 2-tuple ``(questions, labels)``: the token-id sequences (tokens
        missing from ``vocab`` map to 0) passed through
        ``padding(..., value=1)``, and the 0/1 label vectors passed through
        ``padding(..., value=0)``.

    Note:
        A sample is skipped when neither the whole subject nor its first
        token occurs in the question. Previously such a sample raised
        ``ValueError`` and aborted the whole build.
    """
    questions, subjects, _predicates = readTrainData()
    qs = []
    labels = []
    for q, sub in zip(questions, subjects):
        q_lower = q.lower()
        sub_lower = sub.lower()
        question = re.split("\\s+", q_lower)
        subject = re.split("\\s+", sub_lower)

        qtemp = q_lower.replace(" ", "")
        stemp = sub_lower.replace(" ", "")
        if stemp not in qtemp:
            # The full subject is not a substring of the question: report the
            # pair and fall back to locating only the subject's first token.
            print(qtemp, stemp)
            stemp = subject[0]
        i = qtemp.find(stemp)
        if i < 0:
            # Not even the first token is present, so there is no span to
            # label; drop the sample instead of crashing.
            continue

        # Characters that precede the subject in the space-stripped question.
        prefix = qtemp[:i]

        # Walk the tokens, accumulating their text, until the accumulated
        # text covers ``prefix``. If it never does, the last token index is
        # used (same as the loop variable's final value would have been).
        start = len(question) - 1
        pre = ""
        for idx, token in enumerate(question):
            pre += token
            if pre == prefix:
                # Prefix ends exactly on a token boundary: the subject starts
                # at the next token.
                start = idx + 1
                break
            if prefix in pre:
                # Prefix ends inside this token: the subject starts here.
                start = idx
                break

        end = start + len(subject)
        label = np.zeros(question_len)
        label[start:end] = 1

        qs.append([vocab.get(word, 0) for word in question])
        labels.append(label)
    return (padding(qs, maxlen=question_len, value=1),
            padding(labels, maxlen=question_len, value=0))
コード例 #2
0
def get_train_teem_data(question_len=50):
    """Build character-level training data for tagging the subject span.

    Each question and subject from ``readTrainData()`` is lower-cased and
    stripped of spaces. Every character of the question that belongs to an
    occurrence of the subject is labelled 1, all other characters 0.

    Args:
        question_len: ``maxlen`` handed to ``padding`` for both outputs.

    Returns:
        A 2-tuple ``(questions, labels)``: the character-id sequences
        (characters missing from ``vocab`` map to 0) passed through
        ``padding(..., value=1)``, and the 0/1 label sequences passed through
        ``padding(..., value=0)``.
    """
    questions, subjects, _predicates = readTrainData()
    qs = []
    labels = []
    for question, subject in zip(questions, subjects):
        question = question.lower().replace(" ", "")
        subject = subject.lower().replace(" ", "")
        # Mask every occurrence of the subject with an equal number of "X".
        # The question has been lower-cased, so it cannot already contain an
        # upper-case "X"; any "X" in the pattern therefore marks the subject.
        pattern = question.replace(subject, "X" * len(subject))
        qs.append([vocab.get(ch, 0) for ch in question])
        labels.append([int(ch == "X") for ch in pattern])
    return (padding(qs, maxlen=question_len, value=1),
            padding(labels, maxlen=question_len, value=0))
コード例 #3
0
def get_test_teem_data(questions, question_len=20):
    """Encode test questions as padded word-id sequences.

    Each question is lower-cased and split on whitespace; every resulting
    token is looked up in ``vocab``, with 0 used for tokens that are missing.

    Args:
        questions: Iterable of question strings.
        question_len: ``maxlen`` handed to ``padding``.

    Returns:
        The result of ``padding`` applied to the id sequences with
        ``value=1``.
    """
    encoded = [
        [vocab.get(token, 0) for token in re.split("\\s+", text.lower())]
        for text in questions
    ]
    return padding(encoded, maxlen=question_len, value=1)
コード例 #4
0
def get_test_teem_data(questions, question_len=50):
    """Encode test questions as padded character-id sequences.

    Each question is lower-cased and stripped of spaces; every remaining
    character is looked up in ``vocab``, with 0 used for characters that are
    missing.

    Args:
        questions: Iterable of question strings.
        question_len: ``maxlen`` handed to ``padding``.

    Returns:
        The result of ``padding`` applied to the id sequences with
        ``value=1``.
    """
    encoded = []
    for text in questions:
        chars = text.lower().replace(" ", "")
        ids = [vocab.get(ch, 0) for ch in chars]
        encoded.append(ids)
    return padding(encoded, maxlen=question_len, value=1)
コード例 #5
0
ファイル: base_dssm.py プロジェクト: JuneTse/NLPCCKBQAProj
def load_questions(vocab=vocab, path=config.seg_test_question_path, maxlen=50):
    """Load questions from a file as padded character-id sequences.

    Each line is stripped, has its spaces removed, and is split into
    characters; every character is looked up in ``vocab``, with 0 used for
    characters that are missing.

    Args:
        vocab: Mapping from character to integer id.
        path: Path of the UTF-8 text file holding one question per line.
        maxlen: Length handed to ``padding`` (defaults to the previously
            hard-coded 50).

    Returns:
        The result of ``padding`` applied to the id sequences with
        ``value=1``.
    """
    questions = []
    # ``with`` guarantees the file is closed even if a line fails to decode.
    with open(path, encoding="utf-8") as f:
        for line in f:
            chars = line.strip().replace(" ", "")
            questions.append([vocab.get(ch, 0) for ch in chars])
    return padding(questions, maxlen, value=1)
コード例 #6
0
ファイル: base_dssm.py プロジェクト: JuneTse/NLPCCKBQAProj
def load_predicates(vocab=vocab, path=config.all_predicate_path, maxlen=20):
    """Load predicates from a file as padded character-id sequences.

    Each line is stripped, has its spaces removed, and is split into
    characters; every character is looked up in ``vocab``, with 0 used for
    characters that are missing.

    Args:
        vocab: Mapping from character to integer id.
        path: Path of the UTF-8 text file holding one predicate per line.
        maxlen: Length handed to ``padding`` (defaults to the previously
            hard-coded 20).

    Returns:
        The result of ``padding`` applied to the id sequences with
        ``value=1``.
    """
    predicates = []
    # ``with`` guarantees the file is closed even if a line fails to decode.
    with open(path, encoding="utf-8") as f:
        for line in f:
            chars = line.strip().replace(" ", "")
            predicates.append([vocab.get(ch, 0) for ch in chars])
    return padding(predicates, maxlen=maxlen, value=1)