コード例 #1
0
ファイル: preprocess.py プロジェクト: lyan62/qbcards
def match_question_blocks(question_obj_list):
    """Collect clues shared between every pair of questions.

    The ``"text"`` of each question is cleaned with ``clean_question`` and
    compared against every later question using ``SequenceMatcher``.  When
    the longest common block of a pair is more than 10 characters long, the
    block is expanded with ``get_complete_clue`` and recorded, together with
    the cleaned result of ``get_clue_sent`` for the first question of the
    pair.

    Args:
        question_obj_list: sequence of question dicts; every entry must
            provide a ``"text"`` value.

    Returns:
        A ``(clues, clue_sents)`` tuple: the list of stripped clues (passed
        through ``deduplicate_clues`` when it holds more than one entry) and
        a list of the distinct, stripped clue sentences.
    """
    # With fewer than two questions there is no pair to compare.
    if len(question_obj_list) < 2:
        return [], []

    # Clean each question once up front instead of once per compared pair.
    cleaned = [clean_question(question["text"])
               for question in question_obj_list]

    clues = []
    clue_sents = set()

    # Compare the questions pairwise.
    for i, q1 in enumerate(cleaned):
        for q2 in cleaned[i + 1:]:
            blocks = SequenceMatcher(None, q1, q2).get_matching_blocks()
            # get_matching_blocks() always ends with a zero-size dummy block,
            # so max() never receives an empty sequence.  max() returns the
            # first block of maximal size, like a stable descending sort.
            longest = max(blocks, key=lambda block: block.size)
            if longest.size <= 10:
                continue

            start = longest.a
            complete_clue = get_complete_clue(start, start + longest.size, q1)
            clues.append(complete_clue.strip())

            clue_sent = clean_question(
                get_clue_sent(question_obj_list[i], start))
            if len(clue_sent) > 1:
                clue_sents.add(clue_sent.strip())

    # Deduplicate clues.
    if len(clues) > 1:
        clues = deduplicate_clues(clues)
    return clues, list(clue_sents)
コード例 #2
0
def find_len():
    """Print token-length statistics for the training questions.

    Loads ``Training_Body_Title_user.p``, builds each question from the
    record's ``'Title'`` and ``'Body'`` entries, cleans it with
    ``utils.clean_question`` and counts whitespace-separated tokens.

    Prints three lines, in this order: the largest token count, the integer
    mean token count (0 when the file holds no records), and the number of
    questions with more than 300 tokens.

    The string handling concatenates ``.encode('utf-8')`` results with plain
    string literals, i.e. it relies on Python 2 byte-string semantics.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open("Training_Body_Title_user.p", "rb") as openfile:
        x = pickle.load(openfile)

    over_limit = 0   # questions longer than 300 tokens
    longest = 0      # largest token count seen
    token_sum = 0    # total tokens across all questions
    tot = 0          # number of records
    for y in x:
        tot += 1
        try:
            question = y['Title'].encode('utf-8') + ' '
        except Exception:
            # Best effort: a missing or unusable title contributes nothing.
            question = ''
        question = question + y['Body'].encode('utf-8')
        question = utils.clean_question(question)

        t = len(question.split())
        token_sum += t
        if t > 300:
            over_limit += 1
        if t > longest:
            longest = t

    # Single-argument print(...) behaves identically under Python 2 and 3.
    print(longest)
    # Floor division keeps the integer mean; guard against an empty dataset.
    print(token_sum // tot if tot else 0)
    print(over_limit)
コード例 #3
0
def train():
    """Fit the module-level ``model`` on the training pickle in batches.

    Loads ``Training_Body_Title_user.p`` and makes 5 passes over it.  For
    every record the question (``'Title'`` + ``'Body'``, cleaned with
    ``utils.clean_question``) is embedded with ``get_question_embedding``
    and its ``'tags'`` are encoded with ``get_tag_encoding``.  Each time
    ``batch_size`` records have been gathered, ``model.fit`` is run for one
    epoch on that batch and the model and its weights are saved to disk.

    NOTE: records left over at the end of a pass (fewer than ``batch_size``)
    are discarded without being fitted.

    The string handling relies on Python 2 byte-string semantics, and
    ``xrange`` is Python 2 only.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open("Training_Body_Title_user.p", "rb") as openfile:
        x = pickle.load(openfile)

    for loop in xrange(5):
        x_train = []
        user = []
        y_train = []
        cnt = 0
        for o, y in enumerate(x):
            try:
                question = y['Title'].encode('utf-8') + ' '
            except Exception:
                # Best effort: a missing or unusable title contributes nothing.
                question = ''
            question = question + y['Body'].encode('utf-8')
            question = utils.clean_question(question)

            tag_string = y['tags'].encode('utf-8')
            tag_list = utils.get_tag_list(tag_string)

            question_enc = get_question_embedding(question)
            tag_enc = get_tag_encoding(tag_list)

            cnt = cnt + 1
            x_train.append(question_enc)
            y_train.append(tag_enc)
            # User vectors are gathered for a two-input variant of the model
            # (model.fit([x_train, user], ...)); the fit call below does not
            # use them.
            try:
                user.append(meta_model[str(user_num[y['OwnerUserId']])])
            except Exception:
                # Deliberately broad: any failed lookup falls back to a zero
                # vector of length 128.
                user.append(np.zeros(128))

            if cnt == batch_size:
                x_train = np.asarray(x_train)
                y_train = np.asarray(y_train)
                # One formatted string reproduces the spacing of the original
                # comma-separated print and parses under Python 2 and 3.
                print("cnt:  %s  loop:  %s  o:  %s" % (cnt, loop, o))

                model.fit(x_train, y_train, epochs=1)
                # Checkpoint after every batch.
                model.save(
                    'model4_train_DeepTagRecContent_usingAdd_sigmoid.h5')
                model.save_weights(
                    'model4_train_DeepTagRecContent_weights_usingAdd_sigmoid.h5'
                )

                # Start a fresh batch.
                x_train = []
                user = []
                y_train = []
                cnt = 0
コード例 #4
0
ファイル: preprocess.py プロジェクト: lyan62/qbcards
def match_question_wiki(question_obj_list, wiki_paras):
    """Collect clues shared between each question and its wiki paragraphs.

    For every question, the paragraphs stored in ``wiki_paras`` under
    ``str(question["qanta_id"])`` are cleaned with ``clean_question`` and
    matched against the cleaned question text via ``get_pairwise_match``.

    Args:
        question_obj_list: sequence of question dicts providing
            ``"qanta_id"`` and ``"text"``.
        wiki_paras: mapping from ``str(qanta_id)`` to a sequence of
            paragraph texts.

    Returns:
        A ``(clues, clue_sents)`` tuple: the list of clues (passed through
        ``deduplicate_clues`` when it holds more than one entry) and a list
        of the distinct clue sentences.
    """
    clues = []
    sas = []
    clue_sents = set()

    # Match each question against each of its paragraphs.
    for i, question in enumerate(question_obj_list):
        paras = wiki_paras[str(question["qanta_id"])]
        if not paras:
            # Nothing to compare against; skip cleaning the question too.
            continue

        # The question text is the same for every paragraph: clean it once.
        q1 = clean_question(question["text"])
        for para in paras:
            q2 = clean_question(para)
            clues, sas, clue_sents = get_pairwise_match(
                q1, q2, clues, sas, clue_sents, question_obj_list, i)

    # Deduplicate clues.
    if len(clues) > 1:
        clues = deduplicate_clues(clues)
    return clues, list(clue_sents)
コード例 #5
0
ファイル: preprocess.py プロジェクト: lyan62/qbcards
def get_pairwise_match(q1, q2, clues, sas, clue_sents, question_obj_list, i):
    """Record the clue given by the longest common block of ``q1`` and ``q2``.

    When the longest block found by ``SequenceMatcher`` is more than 10
    items long, the clue expanded by ``get_complete_clue`` is appended to
    ``clues``, the block's start offset in ``q1`` is appended to ``sas``,
    and the cleaned result of ``get_clue_sent`` for
    ``question_obj_list[i]`` is split on commas and added piecewise to
    ``clue_sents``.

    The three collections are modified in place and also returned.

    Returns:
        The ``(clues, sas, clue_sents)`` objects that were passed in.
    """
    blocks = SequenceMatcher(None, q1, q2).get_matching_blocks()
    # First block of maximal size, matching a stable descending sort.
    best = max(blocks, key=lambda block: block.size)

    # Matches of 10 items or fewer are ignored.
    if best.size <= 10:
        return clues, sas, clue_sents

    start = best.a
    stop = start + best.size
    clue = q1[start:stop]
    complete_clue = get_complete_clue(start, stop, q1)
    if clue == '':
        return clues, sas, clue_sents

    clues.append(complete_clue.strip())
    sas.append(start)

    clue_sent = clean_question(get_clue_sent(question_obj_list[i], start))
    if len(clue_sent) > 1:
        # Each comma-separated piece becomes its own entry.
        clue_sents.update(clue_sent.strip().split(","))
    return clues, sas, clue_sents
コード例 #6
0
def test():
    """Evaluate the module-level ``model`` on the records in ``score_low.p``.

    Every record's question (``'Title'`` + ``'Body'``, cleaned with
    ``utils.clean_question``) is embedded with ``get_question_embedding``
    and its ``'Tags'`` are encoded with ``get_tag_encoding``.  The model
    predicts on all embeddings at once, and ``evaluate`` is called for the
    cut-offs 3, 5 and 10.  Precision and recall are printed per cut-off.

    NOTE(review): this function reads ``y['Tags']`` while ``train`` reads
    ``y['tags']`` - confirm that the two pickles really use different keys.

    The string handling relies on Python 2 byte-string semantics.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open("score_low.p", "rb") as openfile:
        x = pickle.load(openfile)

    x_test = []
    user = []
    actual = []
    for y in x:
        question = y['Title'].encode('utf-8')
        question = question + ' ' + y['Body'].encode('utf-8')
        question = utils.clean_question(question)

        tag_string = y['Tags'].encode('utf-8')
        tag_list = utils.get_tag_list(tag_string)

        question_enc = get_question_embedding(question)
        tag_enc = get_tag_encoding(tag_list)
        # User vectors are gathered for a two-input variant of the model
        # (model.predict([x_test, user])); the predict call below does not
        # use them.
        try:
            user.append(meta_model[str(user_num[y['OwnerUserId']])])
        except Exception:
            # Deliberately broad: any failed lookup falls back to a zero
            # vector of length 128.
            user.append(np.zeros(128))

        x_test.append(question_enc)
        actual.append(np.asarray(tag_enc))

    user = np.asarray(user)
    x_test = np.asarray(x_test)
    predicted = model.predict(x_test)
    actual = np.asarray(actual)

    for i in [3, 5, 10]:
        precision = 0.0
        recall = 0.0
        total = 0

        p, r, t = evaluate(actual, predicted, i)
        precision += p * t
        recall += r * t
        total += t

        # Avoid dividing by zero when nothing was evaluated.
        if total:
            precision = precision / total
            recall = recall / total

        # One formatted string reproduces the spacing of the original
        # comma-separated print and parses under Python 2 and 3.
        print("Precision @%s:  %s" % (i, precision))
        print("Recall @%s:  %s" % (i, recall))