def prefloptable():
    # iterate through each hand
    prefloptable = {}
    counter = 0
    for currhand in totalhands:
        counter += 1
        # one loop of this will calculate EHS for 1 hand, 1325 to go
        wintally = 0
        totaltally = 0
        handsdone = []
        handsdone.append(currhand)
        # prune hands that clash with currhand; keep track of hands that have been compared before
        prunedhands = pruner(currhand, totalhands, totalflops, totalturns, totalrivers)[0]
        for opphand in [x for x in prunedhands if x not in handsdone]:
            # remove this hand from future comparisons
            handsdone.append(opphand)
            totaltally += 1
            myrank = rank.ranking(list(currhand), [])
            opprank = rank.ranking(list(opphand), [])
            # how to deal with ties? figure it out
            if myrank > opprank:
                wintally += 1
            else:
                pass  # need to consider ties
        totalwinrate = (wintally / totaltally) * 100
        prefloptable[currhand] = totalwinrate
    # Store data (serialize)
    with open('prefloptable.pickle', 'wb') as handle:
        pickle.dump(prefloptable, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return prefloptable
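# A minimal usage sketch, not from the original source: it assumes prefloptable() above has
# already been run so 'prefloptable.pickle' exists, and that hands are keyed exactly as they
# appear in totalhands.
import pickle

with open('prefloptable.pickle', 'rb') as handle:
    table = pickle.load(handle)

some_hand = next(iter(table))        # whatever key type prefloptable() stored
print(some_hand, table[some_hand])   # pre-flop win rate in percent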
def get_rewards(self):
    player = self.get_active_player_index()
    opponent = 1 - player
    if self.history[-1] == 'f':
        return self.current_bets[opponent]
    # NOTE: 'player.hand' and 'opponent.hand' are string literals here; they look like
    # placeholders for the actual hole cards of each player.
    player_rank = rank.ranking('player.hand', self.board)
    opponent_rank = rank.ranking('opponent.hand', self.board)
    if player_rank < opponent_rank:
        return self.current_bets[opponent]
    elif player_rank > opponent_rank:
        return -self.current_bets[player]
    else:
        return 0
def onSearchBtnClick(self, event):
    if self.selectedHero is None:
        print("Heroi não selecionado")
        return
    features = [
        self.dataset.columns[i + 1]
        for i in range(self.ckboxList.GetCount())
        if self.ckboxList.IsChecked(i)
    ]
    if len(features) == 0:
        print("Features não selecionadas")
        return

    #index = self.list_ctrl.GetFirstSelected()
    self.list_ctrl.DeleteAllItems()
    #hero_name = self.dataset.copy().iloc[index]['hero_names']
    #features = ["Agility", "Accelerated Healing", "Lantern Power Ring", "Dimensional Awareness"]

    print("Buscando por " + "'" + str(self.selectedHero) + "'")
    print("Features: ")
    print(features)

    result = ranking(self.dataset.copy(), features, str(self.selectedHero),
                     metodo=self.combo.GetValue())
    result = result[0:10]
    result = result.iloc[:, ::-1]
    #print result

    self.list_ctrl.DeleteAllColumns()
    self.list_ctrl.InsertColumn(0, 'Score', width=80)
    self.list_ctrl.InsertColumn(1, 'Super-herói', width=80)
    self.__addItems(result[0:10])  # top 10
    self.heroForm.SetValue('')
def search(kwords_lst):
    conn = happybase.Connection(host=settings.HBASE_HOST,
                                port=settings.HBASE_PORT,
                                compat='0.90')
    ksegs = []
    for kwords in kwords_lst:
        segs = jieba.cut(kwords, cut_all=True)
        unicode_segs = []
        for seg in segs:
            unicode_segs.append(seg.encode('utf-8'))
        ksegs += unicode_segs
    ksegs = set(ksegs)

    # 500 is not the correct parameter;
    # should pass the number of html documents in table 'WebData'
    result_urls = ranking.ranking(conn, ksegs, 500)

    url_table = conn.table('WebData')
    results = []
    for url in result_urls:
        row = url_table.row(url)
        title = row['content:title']
        results.append([url, title])
    return results
def averageRankingSingleShot(descr_probe, descr_gallery, maxrank=50, iterations=100):
    ranks = np.zeros(maxrank)
    for i in xrange(iterations):
        descr_probe_i = get_random_elements(descr_probe)
        descr_gallery_i = get_random_elements(descr_gallery)

        descrs_query = []
        query_labels = []
        for p in descr_probe_i.keys():
            query_labels.append(p)
            descrs_query.append(descr_probe_i[p])

        descrs_gallery = []
        gallery_labels = []
        for p in descr_gallery_i.keys():
            gallery_labels.append(p)
            descrs_gallery.append(descr_gallery_i[p])

        r = rank.ranking(descrs_query, query_labels,
                         descrs_gallery, gallery_labels, maxrank=maxrank)
        ranks += r
    return ranks * 1. / iterations
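# Hypothetical call sketch, not from the original project: it assumes descr_probe and
# descr_gallery are dicts mapping an identity label to one descriptor vector, and that
# get_random_elements / rank.ranking behave as used in averageRankingSingleShot above.
import numpy as np

descr_probe = {p: np.random.rand(128) for p in range(100)}
descr_gallery = {p: np.random.rand(128) for p in range(100)}
avg_ranks = averageRankingSingleShot(descr_probe, descr_gallery, maxrank=50, iterations=10)
print("averaged ranking curve, first 5 ranks:", avg_ranks[:5])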
def test_post_ranking(self):
    form = ''
    handler = ranking()
    handler.request = Request({
        'REQUEST_METHOD': 'POST',
        'PATH_INFO': '/ranking',
    })
    handler.response = Response()
    handler.post()
def floptable():
    # iterate through each hand
    floptable = {}
    counter = 0
    for currhand in totalhands:
        counter += 1
        # one loop of this will calculate EHS for 1 hand, 1325 to go
        wintally = 0
        totaltally = 0
        handsdone = []
        handsdone.append(currhand)
        # keep track of hands that have been compared before
        prunedhands = pruner(currhand, totalhands, totalflops, totalturns, totalrivers)[0]
        for opphand in [x for x in prunedhands if x not in handsdone]:
            # remove this hand from future comparisons
            handsdone.append(opphand)
            # removes illegal hands, flops, turns, rivers
            current = pruner(currhand, totalhands, totalflops, totalturns, totalrivers)
            final = pruner(opphand, current[0], current[1], current[2], current[3])
            # checking every flop for each hand
            for flop in final[1]:
                totaltally += 1
                myrank = rank.ranking(list(currhand), list(flop))
                opprank = rank.ranking(list(opphand), list(flop))
                # how to deal with ties? figure it out
                if myrank > opprank:
                    wintally += 1
                else:
                    pass  # need to consider ties
        totalwinrate = (wintally / totaltally) * 100
        floptable[currhand] = totalwinrate
        print(totalwinrate)
        print(counter)
    # Store data (serialize)
    with open('floptable.pickle', 'wb') as handle:
        pickle.dump(floptable, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return floptable
def userdata_receive_cbandit(request, userid):
    global LoudBandit
    global ModeBandit
    global TempoBandit
    global CB_CKPTSTATE
    if request.method == 'GET':
        pulse = request.GET.get('heartrate')
        timevalue = (((datetime.datetime.now().hour) * 60) + datetime.datetime.now().minute)
        upc, created = UserPlayCounter.objects.get_or_create(userid=userid)
        # Only used since rating is a required value in our serializer
        rating = 1.0

        if (CB_CKPTSTATE != TempoBandit.get_checkpoint_state()):
            TempoBandit = CBandit.CBandit(CB_NUMBER_OF_STATES, CB_TEMPO_ACTIONS, CB_TEMPO_CKPT_PATH, cb_outputtempo)
            LoudBandit = CBandit.CBandit(CB_NUMBER_OF_STATES, CB_LOUD_ACTIONS, CB_LOUD_CKPT_PATH, cb_outputloud)
            ModeBandit = CBandit.CBandit(CB_NUMBER_OF_STATES, CB_MODE_ACTIONS, CB_MODE_CKPT_PATH, cb_outputmode)
            CB_CKPTSTATE = TempoBandit.get_checkpoint_state()

        if ((userid in cb_recommendation_cache) and cb_recommendation_cache.get(userid)):
            song = cb_recommendation_cache.get(userid).pop()
            # All songs that have been cached from one recommendation request will use the same ranking id
            rid = cb_rid_cache.get(userid)
        else:
            usernumber = upc.userindex
            bucketedpulse = Bucketizer.bucketize_pulse(int(pulse))
            bucketedtime = Bucketizer.bucketize_time(timevalue)
            state = usernumber*CB_NUMBER_OF_STATES + bucketedpulse*CB_TIME_BUCKETS + bucketedtime
            # We get all ranking ids here but they will all be the same (since they are updated at the same time).
            # Might change this to get just one, since that is all we need.
            temporid, tempo = TempoBandit.predict(state)
            moderid, mode = ModeBandit.predict(state)
            loudrid, loudness = LoudBandit.predict(state)
            # Cache new songs based on bandit suggestions
            cb_recommendation_cache[userid] = ranking.ranking(Bucketizer.bucketize_tempo(tempo),
                                                              Bucketizer.bucketize_loudness(loudness),
                                                              mode, userid)
            # All ranking ids should be identical, so it doesn't matter which one we choose
            cb_rid_cache[userid] = loudrid
            rid = loudrid
            song = cb_recommendation_cache.get(userid).pop()

        sc, created = SongCounter.objects.get_or_create(userid=userid, songid=song)
        delta = upc.playCounter - sc.lastPlayed
        data = Userdata.create(userid, song, pulse, rating, delta)
        data.ratingid = rid
        serializer = UserdataSerializer(data)
        return JsonResponse(serializer.data, status=200)

    # Not used at the moment
    elif request.method == 'PUT':
        data = JSONParser().parse(request)
        serializer = UserdataSerializer(userdata, data=data)
        if serializer.is_valid():
            serializer.save()
            return JsonResponse(serializer.data)
        return JsonResponse(serializer.errors, status=400)

    # Not used at the moment
    elif request.method == 'DELETE':
        return HttpResponse(status=403)
def test_post_ranking(self):
    handler = ranking()
    handler.request = Request({
        'REQUEST_METHOD': 'GET',
        'PATH_INFO': '/ranking',
    })
    handler.response = Response()
    handler.get()
    self.failUnless(len(handler.get()) <= 10)
    self.assertEqual('200 OK', handler.response.status)
def userdata_receive_dnn(request, userid):
    global LoudDNN
    global ModeDNN
    global TempoDNN
    global DNN_CKPTSTATE
    if request.method == 'GET':
        timevalue = (((datetime.datetime.now().hour) * 60) + datetime.datetime.now().minute)
        pulse = request.GET.get('heartrate')
        upc, created = UserPlayCounter.objects.get_or_create(userid=userid)
        # Rating is set to 1 since we want a song with high rating
        rating = 1.0

        if (DNN_CKPTSTATE != TempoDNN.get_checkpoint_state()):
            LoudDNN = DNNModel.DNNModel(DNN_LOUD_CKPT_PATH, dnn_outputloud)
            ModeDNN = DNNModel.DNNModel(DNN_MODE_CKPT_PATH, dnn_outputmode)
            TempoDNN = DNNModel.DNNModel(DNN_TEMPO_CKPT_PATH, dnn_outputtempo)
            DNN_CKPTSTATE = TempoDNN.get_checkpoint_state()

        if ((userid in dnn_recommendation_cache) and dnn_recommendation_cache.get(userid)):
            song = dnn_recommendation_cache.get(userid).pop()
        else:
            data = {'user_id': [userid], 'time': [timevalue], 'heart_rate': [int(pulse)], 'rating': [rating]}
            tempo = Bucketizer.bucketize_tempo(int(TempoDNN.get_predict_class_id(data_matrix=data)))
            mode = Bucketizer.bucketize_mode(int(ModeDNN.get_predict_class_id(data_matrix=data)))
            loudness = Bucketizer.bucketize_loudness(int(LoudDNN.get_predict_class_id(data_matrix=data)))
            # Cache new songs based on DNN suggestions
            dnn_recommendation_cache[userid] = ranking.ranking(tempo, loudness, mode, userid)
            song = dnn_recommendation_cache.get(userid).pop()

        sc, created = SongCounter.objects.get_or_create(userid=userid, songid=song)
        delta = upc.playCounter - sc.lastPlayed
        data = Userdata.create(userid, song, pulse, rating, delta)
        serializer = UserdataSerializer(data)
        return JsonResponse(serializer.data, status=200)

    # Not used at the moment
    elif request.method == 'PUT':
        data = JSONParser().parse(request)
        serializer = UserdataSerializer(userdata, data=data)
        if serializer.is_valid():
            serializer.save()
            return JsonResponse(serializer.data)
        return JsonResponse(serializer.errors, status=400)

    # Not used at the moment
    elif request.method == 'DELETE':
        return HttpResponse(status=403)
def process(self, data, election, user):
    if self.choice:
        c = choice.choice()
        c.ans = data["ans"]
        user.ballot.add_vote(c)
        self.choice = False
    if self.rank:
        r = ranking.ranking()
        r.rankings = data["ans"]
        user.ballot.add_vote(r)
        self.rank = False
    if self.writeIn:
        w = writeIn.writeIn()
        w.ans = data["ans"]
        user.ballot.add_vote(w)
        self.writeIn = False
    if self.voteOrWriteIn:
        self.voteOrWriteIn = False
        c = int(data["ans"])
        index = len(user.ballot.votes)
        if c == len(election.voteActions[index].options):
            self.writeIn = True
            write = {
                "Instructions": "Please write in your choice: ",
                "type": "char25",
            }
            instance.send(write)
            return election, user
        else:
            c = choice.choice()
            c.ans = data["ans"]
            user.ballot.add_vote(c)
    if len(user.ballot.votes) < len(election.voteActions):
        q = self.getVote(election, len(user.ballot.votes))
        instance.send(q)
    else:
        self.ballotComplete = True
    return election, user
def search(query1, i_i):
    logging.basicConfig(level=logging.DEBUG,
                        format='%(filename)s %(levelname)s: %(asctime)s: %(message)s')
    logger = logging.getLogger('main')
    logger.info('Executing indexing module')
    logger.info('Reading file')

    # GIVEN QUERY FROM FRONT-END, FIND RELEVANT RESULTS
    query = query1  # user input
    print('input:', query)
    matcher = match()
    q = preprocessing().the_works(query)
    CR = i_i.lookup_query(q)
    CR = matcher.boolean(CR)

    # added in case not every token matches
    doctoken_matchnums = [len(i) for i in CR.values()]
    if len(doctoken_matchnums) == 0:
        return ''
    scaler = max(doctoken_matchnums)
    CR = matcher.scale(CR, scaler)

    # RANK RELEVANT RESULTS
    r_ranking = ranking()
    resources = list(CR.keys())
    max_freq = r_ranking.get_max_frequencies(index=CR)  # , num_docs=len(i_i.storage.index)
    # Now save this into the persisted memory object within the index
    i_i.storage.max_frequency_terms_per_doc = max_freq
    res = r_ranking.relevance_ranking(query=query,
                                      num_results=5,
                                      index=i_i.index,
                                      resources=resources,
                                      max_freq=i_i.storage.max_frequency_terms_per_doc,
                                      N=len(i_i.storage.index),
                                      term_doc_matrix=i_i.doc_term_matrix_all)

    # GENERATE RANKED JSON_SNIPPETS FOR FRONT-END
    snipper = snip(r_ranking)
    json = snipper.get_snippets(res, resources=resources, query=query, i_i=i_i)
    # print('output:', json)
    return json
def SortPlayers(self, break_ties=0):
    self.player_rankings = []
    ranking_list = []
    for player_1 in self.player_handler_list:
        inserted_flag = 0
        for player_2, index in zip(ranking_list, range(len(ranking_list))):
            if player_1.GetScore() > player_2.GetScore():
                ranking_list.insert(index, player_1)
                inserted_flag = 1
                break
        if not inserted_flag:
            ranking_list.append(player_1)
    for player, index in zip(ranking_list, range(len(ranking_list))):
        inserted_flag = 0
        for rank in self.player_rankings:
            if abs(rank.GetScore() - player.GetScore()) < .2:
                rank.AddPlayer(player)
                inserted_flag = 1
                break
        if not inserted_flag:
            temp = ranking.ranking(index + 1, [player])
            self.player_rankings.append(temp)
def rankUI(root, funcName, databaseHandle: mysql.connector.connect):
    def closeThisWindow():
        top.destroy()

    top = Toplevel(root)
    top.title("统计信息")

    titleFrame = Frame(top)
    titleLabel = Label(titleFrame, text="成 绩 排 名", font=('圆体-简', '30'))
    titleFrame.pack(side='top', fill=X, padx=10)
    titleLabel.pack(side='top', fill=X, padx=10)

    color = ("#ffffff", "#ececec")

    deptInfo = queryDept(databaseHandle)
    for dept in deptInfo:
        deptFrame = Frame(top)
        deptLabelFrame = Frame(deptFrame)
        deptLabel = Label(deptLabelFrame, text=dept[0], font=('圆体-简', '20'))
        leftLabel = Label(deptLabelFrame, text="~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ", font=('圆体-简', '20'))
        rightLabel = Label(deptLabelFrame, text="~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ", font=('圆体-简', '20'))
        deptFrame.pack(side='top', padx=10, pady=20)
        deptLabelFrame.pack(side='top', fill=X, padx=10)
        leftLabel.pack(side='left', padx=20, pady=20)
        deptLabel.pack(side='left', fill=X, padx=20)
        rightLabel.pack(side='left', padx=20, pady=20)

        infoTableFrame = Frame(deptFrame)
        infoTableFrame.pack(side='top', fill=X, padx=10)
        headFrame = Frame(infoTableFrame)
        headFrame.pack(side='top', padx=10)
        SnoHeadLabel = Label(headFrame, text='学号', width=10)
        SnoHeadLabel.pack(side='left', padx=10)
        SnameHeadLabel = Label(headFrame, text='姓名', width=10)
        SnameHeadLabel.pack(side='left', padx=10)
        avgGradeHeadLabel = Label(headFrame, text='平均分', width=10)
        avgGradeHeadLabel.pack(side='left', padx=10)
        rankHeadLabel = Label(headFrame, text='名次', width=10)
        rankHeadLabel.pack(side='left', padx=10)

        courseInfo = queryCourseByDept(databaseHandle, dept)
        for course in courseInfo:
            courseHeadLabel = Label(headFrame, text=course[0], width=10)
            courseHeadLabel.pack(side='left', padx=10)

        infoByDept = ranking(databaseHandle, dept)
        count = 1
        for course in infoByDept:
            infoRowFrame = Frame(deptFrame, bg=color[count % 2])
            infoRowFrame.pack(side='top', fill=X, padx=10)
            SnoLabel = Label(infoRowFrame, text=course[0], width=10, bg=color[count % 2])
            SnoLabel.pack(side='left', padx=10)
            SnameLabel = Label(infoRowFrame, text=course[1], width=10, bg=color[count % 2])
            SnameLabel.pack(side='left', padx=10)
            avgGradeLabel = Label(infoRowFrame, text=course[2], width=10, bg=color[count % 2])
            avgGradeLabel.pack(side='left', padx=10)
            rankLabel = Label(infoRowFrame, text=count, width=10, bg=color[count % 2])
            rankLabel.pack(side='left', padx=10)
            for i in range(len(courseInfo)):
                scInfo = querySCBySnoCno(course[0], courseInfo[i][1], databaseHandle)
                print(scInfo)
                gradeLabel = Label(infoRowFrame, bg=color[count % 2], width=10)
                gradeLabel.pack(side='left', padx=10)
                try:
                    gradeLabel.config(text=scInfo[4])
                except:
                    gradeLabel.config(text='无成绩')
            count += 1

    exitButtonFrame = Frame(top)
    exitButton = Button(exitButtonFrame, text='关闭', command=closeThisWindow)
    exitButtonFrame.pack(side='top', fill=X, padx=10)
    exitButton.pack(side='top', fill=X, padx=10)
    top.mainloop()
def index():
    matches = db.session.query(Match).order_by(Match.created_asof.desc())
    rankings = ranking(matches)
    return render_template('index.html', matches=matches, rankings=rankings)
print "\trecall macro:", recallmacro recallmicro = recall_score(y_test, y_pred, average='micro') print "\trecall micro:", recallmicro f1macro = f1_score(y_test, y_pred, average='macro') print "\tf1 macro:", f1macro f1micro = f1_score(y_test, y_pred, average='micro') print "\tf1 micro:", f1micro rank_options = [False] if test['method'] == 'dummy': rank_options = [True, False] for preshuffle in rank_options: df_with_ranking = rk.ranking(data_test, y_pred, y_prob, preshuffle=preshuffle, target=True) search_ids = df_with_ranking['srch_id'] diff_search_ids = search_ids.drop_duplicates() k = 0 ndcg_list = [] for id in diff_search_ids: mask = (df_with_ranking['srch_id'] == id) result_df = df_with_ranking.loc[mask] ndcg_result = ndcg.ndcg(result_df) ndcg_list.append(ndcg_result)
import menu as menu
import jogo as jogo
import ranking as ranking
import tutorial as tutorial
import gameover as gameover
from PPlay.window import *  # provides Window (assumed standard PPlay layout)
from PPlay.sound import *

janela = Window(1000, 600)
janela.set_title("Mimi")
game_state = 0
dificuldade = 1

tema = Sound("sons/jogo.ogg")
tema.set_volume(50)
tema.set_repeat(True)
tema.play()

while True:
    if game_state == 0:
        game_state = menu.menu(janela)
    if game_state == 1:
        game_state = jogo.jogo(janela)
    if game_state == 2:
        game_state = tutorial.tutorial(janela)
    if game_state == 3:
        game_state = ranking.ranking(janela)
    if game_state == 4:
        game_state = gameover.gameover(janela)
    janela.update()
select_cols = [
    'prop_starrating', 'prop_review_score', 'prop_location_score2',
    'price_usd', 'promotion_flag', 'no_bookings_prop', 'no_found_prop'
]
rank_options = [False]
slices_to_do = range(17, 25)

for i in slices_to_do:
    data_file = "data/test_set_added_variables_%i.csv" % (i)
    data_test_slice = dp.DataAggregator(data_file)
    data_test_slice.read_data()
    data_test_df = data_test_slice.df
    X_test = make_X(data_test_df, select_cols)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    for preshuffle in rank_options:
        df_with_ranking = rk.ranking(data_test_df, y_pred, y_prob,
                                     preshuffle=preshuffle, target=False)
        final_df = df_with_ranking[['srch_id', 'prop_id']]
        final_df.to_csv('prediction_file%d.csv' % (i), index=False)
    print "slice %d done" % (i)
print "start classfieing" model = pkl.load(open( 'Classifiers_final\gradient_boosting_Boosting-False_max_leaf_nodes-4-learning_rate-0.1-n_estimators-100-subsample-0.5-random_state-2-min_samples_split-5-max_depth-None.pkl', 'r')) select_cols = ['prop_starrating', 'prop_review_score', 'prop_location_score2', 'price_usd', 'promotion_flag', 'no_bookings_prop', 'no_found_prop'] rank_options = [False] for i in slices_to_do: data_file = "data/test_set_added_variables_%i.csv" % (i) data_test_slice = dp.DataAggregator(data_file) data_test_slice.read_data() data_test_df = data_test_slice.df X_test = make_X(data_test_df, select_cols) y_pred = model.predict(X_test) y_prob = model.predict_proba(X_test) for preshuffle in rank_options: df_with_ranking = rk.ranking(data_test_df, y_pred, y_prob, preshuffle=preshuffle, target=False) final_df = df_with_ranking[['srch_id', 'prop_id']] final_df.to_csv('prediction_file%d.csv' % (i), index=False) print "slice %d done" % (i)
print "\taccuracy:", accuracy recallmacro = recall_score(y_test, y_pred, average='macro') print "\trecall macro:", recallmacro recallmicro = recall_score(y_test, y_pred, average='micro') print "\trecall micro:", recallmicro f1macro = f1_score(y_test, y_pred, average='macro') print "\tf1 macro:", f1macro f1micro = f1_score(y_test, y_pred, average='micro') print "\tf1 micro:", f1micro rank_options = [False] if test['method'] == 'dummy': rank_options = [True, False] for preshuffle in rank_options: df_with_ranking = rk.ranking(data_test, y_pred, y_prob, preshuffle=preshuffle, target = True) search_ids = df_with_ranking['srch_id'] diff_search_ids = search_ids.drop_duplicates() k = 0 ndcg_list = [] for id in diff_search_ids: mask = (df_with_ranking['srch_id'] == id) result_df = df_with_ranking.loc[mask] ndcg_result = ndcg.ndcg(result_df) ndcg_list.append(ndcg_result) meanndcg = sum(ndcg_list) / float(len(ndcg_list)) f.write('%s; %s; %s; %s; %s; %f; %f; %f; %f; %f; %f\n' % (
clf = ensemble.RandomForestClassifier(**params)
start_time = datetime.now()
clf.fit(X_train, y_train)
print clf.classes_
print "trained in", datetime.now() - start_time

y_pred = clf.predict(X_train)
y_prob = clf.predict_proba(X_train)
print "class probs", y_prob
print "classes found", np.unique(y_pred)
print "accuracy:", clf.score(X_train, y_train)
print "recall macro:", recall_score(y_train, y_pred, average='macro')
print "recall micro:", recall_score(y_train, y_pred, average='micro')
print "f1 macro:", f1_score(y_train, y_pred, average='macro')
print "f1 micro:", f1_score(y_train, y_pred, average='micro')

df_with_ranking = rk.ranking(traindf, y_pred, y_prob)
search_ids = df_with_ranking['srch_id']
diff_search_ids = search_ids.drop_duplicates()
k = 0
ndcg_list = []
for id in diff_search_ids:
    mask = (df_with_ranking['srch_id'] == id)
    result_df = df_with_ranking.loc[mask]
    # use a separate name so the ndcg module is not shadowed by the score value
    ndcg_result = ndcg.ndcg(result_df, k)
    ndcg_list.append([ndcg_result])
import ranking

if __name__ == "__main__":
    a = ranking.ranking([
        ['a', 'b', 5, 5],
        ['a', 'c', 5, 3],
        ['b', 'c', 4, 3],
        ['a', 'c', 5, 3],
        ['a', 'd', 5, 1],
        ['b', 'c', 5, 3],
        ['b', 'd', 5, 1],
        ['c', 'd', 3, 1],
        ['a', 'd', 4, 3],
        ['d', 'a', 4, 1],
    ])
    print(a.massey())
    print(a.colley())
    a.find_dup()
    print(a.massey())
    print(a.colley())
    print(a.borda([
        [['A', 3], ['B', 1], ['D', 2]],
        [['A', 2], ['B', 1], ['D', 4], ['C', 3]],
        [['E', 1]],
    ]))
wh_q_lst.append(binary_form)
processed_q_lst.append(binary_form)
labels.append(wh_word)

# get the best guess sentence
fuzzy_ans = []
predictions = ranking.fuzzyCompare(sentences, fuzzy_lst)
threshold = 89
for (best_sentence, score) in predictions:
    if score < threshold:
        fuzzy_ans.append("No.")
    else:
        fuzzy_ans.append("Yes.")

wh_guess = []
ind = ranking.ranking(sentences, wh_q_lst)
for i in ind:
    wh_guess.append(sentences[i])

######################
# combine all candidates
ind_wh = 0
ind_binary = 0
candidates = []
for lab in labels:
    if lab != "BINARY" and lab != "OTHERS":
        candidates.append(wh_guess[ind_wh])
        ind_wh += 1
    else:
        candidates.append(fuzzy_ans[ind_binary])
        ind_binary += 1
def get_documents(query, score_dict, pos_dict):
    global words_list
    global phrase_list
    global negphrase_list
    global negwords_list
    global index

    common_docid_set = set()
    pre_process_query(query)
    all_negative_phrase_docs = defaultdict()  ## stores the docids of all phrases that are negated, i.e. docids which contain the negated phrase
    all_negative_word_docs = defaultdict()    ## stores the docids of all words that are negated, i.e. docids which contain the negated word

    ## process phrases in the query that are not negated -- put their information in the 2 result dictionaries
    process_phrase(phrase_list, score_dict, pos_dict)

    ## process phrases in the query that are negated -- put their info in a separate dict 'all_negative_phrase_docs'
    ## functionality is the same as the processing for a normal phrase, but stored in a separate dict
    for phrase in negphrase_list:
        phrase_words = phrase.strip().split()       ## split phrase into phrase-words
        for i in range(0, len(phrase_words)):       ## stem the phrase words, as the index is stemmed
            phrase_words[i] = porter.stem(phrase_words[i])
        if len(phrase_words) == 1:                  ## if only one word in the phrase,
            words_list.append(phrase_words[0])      ## add it to the word list to process as a normal word in the query
        else:
            if phrase_words[0] in index.keys():     ## if the first phrase-word is in the index,
                common_docid_set = set(index[phrase_words[0]].keys())  ## add its docids to the set containing the common docids
            for i in range(1, len(phrase_words)):
                if phrase_words[i] in index.keys():  ## if the next phrase-word is in the index,
                    common_docid_set = set(index[phrase_words[i]].keys()) & common_docid_set  ## keep the docids common with the previous words in the phrase
                else:                                ## if one of the phrase words is not in the index, the phrase does not occur in any document
                    common_docid_set.clear()         ## hence empty the common doc set and break
                    break
            for docid in common_docid_set:           ## for each doc containing the phrase-words, check whether they occur together as a phrase
                prev_pos_set = set(index[phrase_words[0]][docid])  ## stores the positions of the first phrase-word
                for j in range(1, len(phrase_words)):
                    curr_pos_set = set()
                    for pos in index[phrase_words[j]][docid]:  ## for each position of the current phrase-word in the current doc
                        curr_pos_set.add(pos - j)    ## decrement the position by j to match the start position of the phrase
                    correct_pos_set = curr_pos_set & prev_pos_set  ## stores only the start positions of those instances where the words appear next to each other
                    if len(correct_pos_set) == 0:    ## if there are no such positions in this doc where the words appear as a phrase,
                        break                        ## break and move to the next doc
                    else:
                        prev_pos_set = correct_pos_set
                if len(correct_pos_set) == 0:        ## if the pos set is empty, there is no phrase match in the current document,
                    continue                         ## hence continue to the next document
                else:                                ## else make the entry in the doc dict for the negated phrase
                    if docid not in all_negative_phrase_docs:  ## if new docid entry in the dict
                        score = len(correct_pos_set) * len(phrase_words)  ## score = no. of occurrences * no. of phrase words
                        all_negative_phrase_docs[docid] = score  ## storing docids with no. of occurrences
                    else:                            ## if existing docid, add occurrences and new positions
                        score = all_negative_phrase_docs[docid] + (len(correct_pos_set) * len(phrase_words))  ## add the scores for the next phrase in the same doc
                        all_negative_phrase_docs[docid] = score  ## scores are stored only for consistency; they are not used for document retrieval

    ## process individual words in the query that are not negated -- put their info in the 2 result dictionaries
    process_words(words_list, score_dict, pos_dict)

    ## process individual words in the query that are negated -- put their info in a separate dict 'all_negative_word_docs'
    ## functionality is the same as processing a normal word, but stored in a separate dict
    for q_negword in negwords_list:
        q_negword = q_negword.strip()
        if q_negword in index:
            for docid in index[q_negword].keys():
                score = 0
                if docid not in all_negative_word_docs:
                    all_negative_word_docs[docid] = len(index[q_negword][docid])  ## stores the docids in a separate dict for negated words
                else:
                    score = all_negative_word_docs[docid] + len(index[q_negword][docid])  ## scores are stored only for consistency; they are not used in document retrieval
                    all_negative_word_docs[docid] = score

    ## get the set of documents that contains negated phrases or negated words
    negative_phrase_set = set()
    negative_word_set = set()
    if len(all_negative_phrase_docs.keys()) != 0:  ## if there is a negated phrase in the query, all_negative_phrase_docs contains the docids containing that phrase,
        negative_phrase_set = set(score_dict.keys()) - set(all_negative_phrase_docs.keys())  ## thus the difference from the whole set of documents gives the set not containing that phrase
    if len(all_negative_word_docs.keys()) != 0:    ## if there is a negated word in the query, all_negative_word_docs contains the docids containing that word,
        negative_word_set = set(score_dict.keys()) - set(all_negative_word_docs.keys())      ## thus the difference from the whole set of documents gives the set not containing that word
    total_negative_set = negative_phrase_set | negative_word_set  ## the union of these 2 sets gives the set of documents not containing the phrases/words negated in the query

    ## to get the final set of documents, we do a fuzzy OR between the normal (score_dict) and negated (total_negative_set) sets of documents.
    ## If a docid is common to both the normal set and the negated set, it is retained and phrase/word instances from the normal set are shown;
    ## otherwise docids from the negated set are identified by assigning them a negative score.
    for docid in total_negative_set:  ## for each docid in the negated set of documents
        if score_dict[docid] == 0:
            score_dict[docid] = -1

    ## pickle the final dictionaries for ranking
    ## pickle.dump(score_dict, open("score_dict", "wb"))
    ## pickle.dump(pos_dict, open("pos_dict", "wb"))

    ## call the ranking function to rank and display snippets
    ranking.ranking(query, score_dict, pos_dict)
def start():
    print("Opening dblp index")
    pubIx = open_dir("index/dblp_index/Pubblication_Index")
    venIx = open_dir("index/dblp_index/Venue_Index")
    query_immesse = 10
    while query_immesse >= 0:
        searcher = pubIx.searcher(weighting=scoring.Frequency)
        searcher2 = venIx.searcher(weighting=scoring.Frequency)
        phrase = input("Insert query:\n>>")
        phrase_no_rank, choice, topk = choice_ranking(phrase)
        queries = divqueries(phrase_no_rank)
        print(queries)
        q1, q2 = setqueries(queries)
        print(q1 + '\t' + q2)
        print('\n')

        schema = create_scheme()[0]
        parser = whoosh.qparser.MultifieldParser(
            ['author', 'title', 'year'], schema=pubIx.schema)  # default is title
        query = parser.parse(q1)
        results = searcher.search(query, limit=None)

        schema = create_scheme()[1]
        parser = whoosh.qparser.MultifieldParser(
            ['author', 'title', 'year'], schema=venIx.schema)  # default is title
        query = parser.parse(q2)
        results2 = searcher2.search(query, limit=None)

        t, g = getquerywords(queries)
        rank = ranking(query=t,
                       result=results,
                       choice=choice,
                       ix=pubIx.doc_count(),
                       searcher=searcher,
                       pub=True)
        sorted_result = rank.rank()
        #print_sorted_result(sorted_result, choice)
        rank = ranking(query=g,
                       result=results2,
                       choice=choice,
                       searcher=searcher2,
                       ix=venIx.doc_count(),
                       pub=False)
        sorted_result2 = rank.rank()
        #print_sorted_result(sorted_result2, choice)

        result = merge_results(pub_result=sorted_result,
                               choice=choice,
                               venue_result=sorted_result2)
        Ta_result = Threshold(result, topk).run()

        f = open('Result.txt', 'a', encoding='utf-8')
        for i in Ta_result[0:topk]:
            if i[0][0][0] is None:
                final = i[0][1][0]
            elif i[0][1][0] is None:
                if return_fuzzy_choice(choice):
                    final = i[0][0][0][0]
                else:
                    final = i[0][0][0]
            else:
                if return_fuzzy_choice(choice):
                    final = list(set(i[0][0][0][0] + i[0][1][0]))  #list(set().union(i[0][0][0], i[0][1][0]))
                else:
                    final = list(set(i[0][0][0] + i[0][1][0]))
            print_result_TA(final, i[1], f)
        f.close()

        import subprocess
        subprocess.run(['more', str(os.path.abspath('Result.txt'))], shell=True)
        status_cmd = subprocess.CompletedProcess(
            ['more', str(os.path.abspath('Result.txt'))], returncode=0).returncode
        if status_cmd == 0:
            os.remove(os.path.abspath('Result.txt'))
        query_immesse -= 1
def contentquery(self):
    ST = time.time()
    urllist = []
    totalword = len(self.wordlist)
    for x in self.wordlist:
        print self.worddic[x]
    print "total word:", totalword

    for dbname in self.wordlist:
        if self.tempdb.has_key(dbname):
            urllist = urllist + self.Stringload(zlib.decompress(self.tempdb[dbname]))
    if totalword > 1:
        ranklist = ranking.ranking(urllist)
        urlcomp = ranklist.dicuniques(totalword)
    else:
        urlcomp = urllist

    totalsize = len(urlcomp)
    ralist = []
    pagestartat = self.page * self.pagesize
    if (pagestartat + self.pagesize) > totalsize:
        pageendat = totalsize
    else:
        pageendat = pagestartat + self.pagesize
    print time.time() - ST

    if totalword != 1:
        rangestsart = 0
        rangeend = totalsize
        if totalsize > 500 or pagestartat >= 500:
            rangestsart = (pagestartat // 500) * 500
            rangeend = rangestsart + 500
            pagestartat = pagestartat - rangestsart
            pageendat = pagestartat + self.pagesize
        #for i in xrange(0, totalsize):
        count = 0
        searchtime = 0.0
        linktime = 0.0
        for i in xrange(rangestsart, rangeend):
            bastscore = 0
            mirs = 0
            spliturl = urlcomp[i]
            if totalword >= 3:
                sword = 3
            else:
                sword = totalword
            if len(spliturl) == 2 and spliturl[0] == totalword:
                at = time.time()
                self.pct, title = self.purei.queryPurecontent(spliturl[1])
                bt = time.time() - at
                matchstart = 0
                scorelist = []
                searchtime = searchtime + bt
                for match in re.finditer(self.uni.decode("utf-8"), self.pct):
                    matchstart = match.start()
                    if matchstart:
                        bastscore = 60
                    if (matchstart + 150) > len(self.pct):
                        mirs = len(self.pct) - matchstart
                    scorelist.append((match.start(), match.end()))
                startat = self.findpunctuation(matchstart)
                scorelist = self.wordmarkup(scorelist, startat - mirs)
                abstract = startat - mirs
                destcontent = self.pct[abstract:abstract + 150]
                url = urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
                ralist.append((destcontent, bastscore, str(spliturl[0]), url, title, scorelist))
            if len(spliturl) == 2 and spliturl[0] >= sword and bastscore == 0:
                at = time.time()
                # self.pct, title = self.purei.queryPurecontent(spliturl[1])
                r = []
                for dbname in self.wordlist:
                    if self.tempdb.has_key(dbname):
                        picklelist = []
                        for match in re.finditer(self.worddic[dbname], self.pct):
                            picklelist.append((match.start(), match.end()))
                        r = r + picklelist
                r = sorted(r, key=operator.itemgetter(0))
                r = ranklist.wordlinker(r)
                bastscore, scorelist = ranklist.counttheimportantpart(r)
                #print scorelist
                if len(scorelist) > 0:
                    startat = scorelist[0][0]
                    startat = self.findpunctuation(startat)
                    if (startat + 150) > len(self.pct):
                        mirs = len(self.pct) - startat
                    scorelist = self.wordmarkup(scorelist, startat - mirs)
                    abstract = startat - mirs
                    destcontent = self.pct[abstract:abstract + 150]
                    url = urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
                    ralist.append((destcontent, bastscore, str(spliturl[0]), url, title, scorelist))
                bt = time.time() - at
                linktime = linktime + bt
        print 'totalword2:',
        print time.time() - ST

    if totalword == 1:
        for i in xrange(pagestartat, pageendat):
            bastscore = 0
            mirs = 0
            spliturl = urlcomp[i]
            self.pct, title = self.purei.queryPurecontent(spliturl)
            matchstart = 0
            scorelist = []
            picklelist = []
            for match in re.finditer(self.uni, self.pct):
                matchstart = match.start()
                picklelist.append((match.start(), match.end()))
            if (matchstart + 100) > len(self.pct):
                mirs = len(self.pct) - matchstart
            scorelist = picklelist
            startat = scorelist[0][0]
            startat = self.findpunctuation(startat)
            scorelist = self.wordmarkup(scorelist, startat - mirs)
            abstract = startat - mirs
            destcontent = self.pct[abstract:abstract + 150]
            url = urllib.quote(self.urldb[self.serialdb[spliturl[0:4]]])
            #print destcontent, str(1), url, title, scorelist
            ralist.append((destcontent, 100, str(1), url, title, scorelist))
        print 'totalword1:',
        print time.time() - ST
        return (totalsize, sorted(ralist, key=operator.itemgetter(1), reverse=True))

    print "search:", str(searchtime)
    print "Link:", str(linktime)
    return (totalsize, sorted(ralist, key=operator.itemgetter(1), reverse=True)[pagestartat:pageendat])