from difflib import SequenceMatcher

# clean_question, get_complete_clue, get_clue_sent and deduplicate_clues are
# helpers defined elsewhere in this module.


def match_question_blocks(question_obj_list):
    """Find clue spans shared between every pair of questions in the list."""
    clues = []
    sas = []
    clue_sents = set()
    # get clues and clue sentences in a pairwise manner
    for i in range(len(question_obj_list)):
        for j in range(i + 1, len(question_obj_list)):
            q1, q2 = clean_question(question_obj_list[i]["text"]), \
                clean_question(question_obj_list[j]["text"])
            question_matcher = SequenceMatcher(None, q1, q2)
            matching_blocks = question_matcher.get_matching_blocks()
            longest_match = sorted(matching_blocks,
                                   key=lambda k: k.size,
                                   reverse=True)[0]
            if longest_match.size > 10:
                sa, sb, span = longest_match.a, longest_match.b, longest_match.size
                # print(i, j, longest_match, "q1: ", q1[sa:sa + span],
                #       "q2: ", q2[sb:sb + span])
                clue = q1[sa:sa + span]
                complete_clue = get_complete_clue(sa, sa + span, q1)
                if clue != '':
                    clues.append(complete_clue.strip())
                    sas.append(sa)
                    clue_sent = clean_question(
                        get_clue_sent(question_obj_list[i], sa))
                    if len(clue_sent) > 1:
                        clue_sents.add(clue_sent.strip())
    # deduplicate clues
    if len(clues) > 1:
        clues = deduplicate_clues(clues)
    return clues, list(clue_sents)
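# Hypothetical usage sketch: match_question_blocks only reads each question's
# "text" field here, so a QANTA-style list of question dicts is assumed. The
# sample texts below are invented, and the helpers (get_clue_sent,
# clean_question) may expect additional fields on the real question objects.
def _demo_match_question_blocks():
    sample_questions = [
        {"text": "This author of Pride and Prejudice also wrote Emma."},
        {"text": "Name this author of Pride and Prejudice and Persuasion."},
    ]
    clues, clue_sents = match_question_blocks(sample_questions)
    return clues, clue_sents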
def find_len():
    """Print the max length, mean length, and count of questions over 300 tokens."""
    openfile = open("Training_Body_Title_user.p", "rb")
    x = pickle.load(openfile)
    c = 0    # questions longer than 300 tokens
    m = 0    # maximum length seen
    s = 0    # running sum of lengths
    tot = 0  # total number of questions
    for y in x:
        tot += 1
        try:
            question = y['Title'].encode('utf-8') + ' '
        except:
            question = ''
        question = question + y['Body'].encode('utf-8')
        question = utils.clean_question(question)
        t = len(question.split())
        s += t
        # print t
        if t > 300:
            c += 1
        if t > m:
            m = t
    print m
    print s / tot
    print c
def train():
    """Run 5 passes over the pickled training posts, fitting the model one
    batch_size chunk at a time and saving a checkpoint after every fit."""
    openfile = open("Training_Body_Title_user.p", "rb")
    x = pickle.load(openfile)
    # print "x:", len(x)
    for loop in xrange(5):
        # print "loop: ", loop
        x_train = []
        user = []
        y_train = []
        cnt = 0
        trace = 0
        for o, y in enumerate(x):
            try:
                question = y['Title'].encode('utf-8') + ' '
            except:
                question = ''
            question = question + y['Body'].encode('utf-8')
            question = utils.clean_question(question)
            tag_string = y['tags'].encode('utf-8')
            # print tag_string
            tag_list = utils.get_tag_list(tag_string)
            question_enc = get_question_embedding(question)
            tag_enc = get_tag_encoding(tag_list)
            cnt = cnt + 1
            x_train.append(question_enc)
            y_train.append(tag_enc)
            try:
                user.append(meta_model[str(user_num[y['OwnerUserId']])])
            except:
                # trace += 1
                # print trace
                user.append(np.zeros(128))
            if cnt == batch_size:
                x_train = np.asarray(x_train)
                y_train = np.asarray(y_train)
                user = np.asarray(user)
                print "cnt: ", cnt, " loop: ", loop, " o: ", o
                # print (x_train.shape)
                # model.fit([x_train, user], y_train, epochs=1)
                model.fit(x_train, y_train, epochs=1)
                # model.save('model4_train_add_Body_Title_gru_epochs10.h5')
                # model.save_weights('model4_train_add_Body_Title_weights_gru_epochs10.h5')
                model.save(
                    'model4_train_DeepTagRecContent_usingAdd_sigmoid.h5')
                model.save_weights(
                    'model4_train_DeepTagRecContent_weights_usingAdd_sigmoid.h5')
                # reset the accumulators for the next batch
                x_train = []
                user = []
                y_train = []
                cnt = 0
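# Sketch only: get_tag_encoding() is defined elsewhere in the repo. Given the
# sigmoid output suggested by the checkpoint name above, the training target is
# presumably a multi-hot vector over the tag vocabulary; `tag_vocab` below is a
# hypothetical tag -> index mapping, not the project's actual structure.
import numpy as np

def multi_hot_tag_encoding_sketch(tag_list, tag_vocab):
    enc = np.zeros(len(tag_vocab), dtype=np.float32)
    for tag in tag_list:
        if tag in tag_vocab:
            enc[tag_vocab[tag]] = 1.0
    return enc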
def match_question_wiki(question_obj_list, wiki_paras):
    """Find clue spans shared between each question and its Wikipedia paragraphs.

    wiki_paras maps str(qanta_id) to a list of paragraph strings."""
    clues = []
    sas = []
    clue_sents = set()
    # get clues and clue sentences in a pairwise manner
    for i in range(len(question_obj_list)):
        qanta_id = question_obj_list[i]["qanta_id"]
        paras = wiki_paras[str(qanta_id)]
        for j in range(len(paras)):
            q1, q2 = clean_question(question_obj_list[i]["text"]), \
                clean_question(paras[j])
            clues, sas, clue_sents = get_pairwise_match(
                q1, q2, clues, sas, clue_sents, question_obj_list, i)
    # deduplicate clues
    if len(clues) > 1:
        clues = deduplicate_clues(clues)
    return clues, list(clue_sents)
def get_pairwise_match(q1, q2, clues, sas, clue_sents, question_obj_list, i):
    """Record the longest span shared by q1 and q2 as a clue when it is long enough."""
    question_matcher = SequenceMatcher(None, q1, q2)
    matching_blocks = question_matcher.get_matching_blocks()
    longest_match = sorted(matching_blocks,
                           key=lambda k: k.size,
                           reverse=True)[0]
    if longest_match.size > 10:
        sa, sb, span = longest_match.a, longest_match.b, longest_match.size
        # print(i, j, longest_match, "q1: ", q1[sa:sa + span],
        #       "q2: ", q2[sb:sb + span])
        clue = q1[sa:sa + span]
        complete_clue = get_complete_clue(sa, sa + span, q1)
        if clue != '':
            clues.append(complete_clue.strip())
            sas.append(sa)
            clue_sent = clean_question(get_clue_sent(question_obj_list[i], sa))
            if len(clue_sent) > 1:
                css = clue_sent.strip().split(",")
                for s in css:
                    clue_sents.add(s)
    return clues, sas, clue_sents
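# Illustration of the standard-library call driving the matching above:
# SequenceMatcher.get_matching_blocks() returns Match(a, b, size) triples, and
# the largest block is the longest span shared by the two strings; it becomes
# a clue candidate when its size exceeds 10 characters.
from difflib import SequenceMatcher

def _demo_longest_match():
    s1 = "this composer wrote the moonlight sonata in 1801"
    s2 = "the moonlight sonata was dedicated to countess giulietta"
    match = max(SequenceMatcher(None, s1, s2).get_matching_blocks(),
                key=lambda m: m.size)
    # by construction, s1[match.a:match.a + match.size] equals
    # s2[match.b:match.b + match.size]
    return s1[match.a:match.a + match.size]  # the shared "the moonlight sonata " span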
def test():
    # openfile = open("ValidationPosts_15000.pickle", "rb")
    openfile = open("score_low.p", "rb")
    x = pickle.load(openfile)
    x_test = []
    user = []
    actual = []
    cnt = 0
    correct = 0
    precision = 0.0
    recall = 0.0
    total = 0
    count = 0
    for y in x:
        # if total > 5:
        #     break
        # question = y['Body'].encode('utf-8')
        question = y['Title'].encode('utf-8')
        question = question + ' ' + y['Body'].encode('utf-8')
        question = utils.clean_question(question)
        tag_string = y['Tags'].encode('utf-8')
        tag_list = utils.get_tag_list(tag_string)
        question_enc = get_question_embedding(question)
        tag_enc = get_tag_encoding(tag_list)
        try:
            user.append(meta_model[str(user_num[y['OwnerUserId']])])
        except:
            # trace += 1
            # print trace
            user.append(np.zeros(128))
        cnt = cnt + 1
        # print cnt
        x_test.append(question_enc)
        actual.append(np.asarray(tag_enc))
    # print(cnt)
    user = np.asarray(user)
    x_test = np.asarray(x_test)
    # s = model.predict([x_test, user])
    s = model.predict(x_test)
    actual = np.asarray(actual)
    predicted = s
    # run_user(s, user)
    # print predicted
    # break
    # (t_correct, t_total) = calc_precision_new(actual, s)
    # (t_num, t_den_p, t_den_r, t_total) = evaluate(actual, s)
    # num += t_num
    # den_p += t_den_p
    # den_r += t_den_r
    # correct += t_correct
    # total += t_total
    # print "correct = ", correct
    # print "total = ", total
    # print "============="
    # predicted = []
    # dict1 = pickle.load(open("predicted.p", 'rb'))
    # for i in dict1.keys():
    #     predicted.append(dict1[i])
    # predicted = np.asarray(predicted)
    # actual = np.asarray(actual)
    # '''
    # print 'I m done'
    # break
    # break
    for i in [3, 5, 10]:
        # print actual
        precision = 0.0
        recall = 0.0
        total = 0
        p, r, t = evaluate(actual, predicted, i)
        precision += p * t
        recall += r * t
        total += t
        precision = precision / total
        recall = recall / total
        # precision = (float(num)/den_p) / float(total)
        # recall = (float(num)/den_r) / float(total)
        print "Precision @" + str(i) + ": ", precision
        print "Recall @" + str(i) + ": ", recall
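# evaluate() is defined elsewhere in the repo; this is only a sketch of the
# top-k precision/recall computation its usage above implies (an assumption,
# not the project's implementation). `actual` holds multi-hot tag vectors,
# `predicted` holds per-tag scores, and the returned count lets callers weight
# the averages as test() does.
import numpy as np

def evaluate_at_k_sketch(actual, predicted, k):
    precisions, recalls = [], []
    for truth, scores in zip(actual, predicted):
        top_k = set(np.argsort(scores)[::-1][:k])  # k highest-scoring tag indices
        true_tags = set(np.nonzero(truth)[0])
        if not true_tags:
            continue
        hits = len(true_tags & top_k)
        precisions.append(float(hits) / k)
        recalls.append(float(hits) / len(true_tags))
    n = len(precisions)
    return sum(precisions) / n, sum(recalls) / n, n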