def search(args):
    """Handle a search request for a keyword.

    Records the keyword hit in the SearchItem collection, runs the paper
    query, stores a SearchHistory entry (attached to the current user when
    authenticated), and returns a JSON payload with the history id and the
    number of result pages.

    Args:
        args: mapping of request parameters; only 'keyword' is read.

    Returns:
        Flask JSON response: {'response': <search_history id>,
        'meta_info': {'page_count': <int>}}.
    """
    keyword = args.get('keyword')

    # Record (or bump) the per-keyword hit counter.
    # NOTE(review): the count()/get() pair is race-prone under concurrent
    # requests — an atomic upsert via
    # SearchItem.objects(keyword=keyword).modify(upsert=True,
    # inc__count=1, new=True) would be safer; confirm model defaults first.
    if SearchItem.objects(keyword=keyword).count() == 0:
        search_item = SearchItem(keyword=keyword)
    else:
        search_item = SearchItem.objects(keyword=keyword).get()
    search_item.count += 1
    search_item.save()

    # Run the actual query; papers_array is the ranked result list.
    query_result = PaperProcessor(keyword)
    papers = query_result.papers_array

    # Persist this search in the history; anonymous searches are stored
    # without a user reference.  list(papers) replaces the former
    # element-by-element copy comprehension.
    if flask_login.current_user.is_authenticated:
        search_history = SearchHistory(
            item=search_item,
            user=User.objects(id=flask_login.current_user.id).get(),
            papers=list(papers),
        )
    else:
        search_history = SearchHistory(item=search_item, papers=list(papers))
    search_history.save()

    return jsonify(
        response=str(search_history.id),
        meta_info={
            'page_count': math.ceil(len(papers) / RESULTS_PER_PAGE)
        }
    )
def generate_papers_array(self):
    """Populate self.papers_array for this keyword.

    Adds the fetched papers' DBIDs to the keyword's SearchItem, then uses
    the model-ranked ordering from get_scores() when available, falling
    back to the SearchItem's stored paper list otherwise.
    """
    ids = [entry["DBID"] for entry in self.papers.values()]
    item = SearchItem.objects(keyword=self.keyword).get()
    item.update(add_to_set__papers=ids)
    item.reload()

    ranked = self.get_scores()
    self.papers_array = ranked if ranked else item.papers
def get_scores(self):
    """Rank this keyword's stored papers with its persisted regression model.

    Loads the SearchItem for self.keyword, unpickles its trained model (if
    one is stored), predicts a relevance score for each paper via
    vectorize_paper, and returns the papers sorted best-first.

    Returns:
        list | None: papers ordered by descending predicted score, or None
        when no model exists or any step fails (the caller treats a falsy
        result as "fall back to the unranked stored list").
    """
    try:
        item = SearchItem.objects(keyword=self.keyword).get()
        if item.model:
            # SECURITY: pickle.loads executes arbitrary code embedded in the
            # stored blob — a compromised database becomes code execution.
            # Consider a safer serialization format for the model.
            regressor = pickle.loads(item.model)
            papers = item.papers
            features = [vectorize_paper(paper) for paper in papers]
            scores = regressor.predict(features)
            # Stable index sort by descending score.  The previous
            # itemgetter(*indices)(papers) returned a bare paper (not a
            # sequence) whenever exactly one paper was stored; building the
            # list explicitly always yields a list.
            order = sorted(range(len(papers)), key=lambda i: scores[i],
                           reverse=True)
            return [papers[i] for i in order]
    except Exception as e:
        # Best-effort ranking: any failure degrades to the unranked list.
        logging.debug(e)
    return None