Esempio n. 1
0
def train_model(item):
    """Train a decision-tree regressor predicting click counts for *item*'s papers.

    Each paper in ``item.papers`` is vectorized with ``vectorize_paper``; the
    target is the recorded ``ClickCount`` for that paper (0 if never clicked).

    Returns the fitted ``tree.DecisionTreeRegressor``, or ``None`` when there
    is nothing to train on (no papers / no clicks) or fitting fails.
    """
    if not item.papers:
        return None

    # Evaluate the queryset once instead of running count() and a second query.
    click_counts = ClickCount.objects(search_item=item)
    counts_by_paper = {str(cc.paper.id): cc.count for cc in click_counts}
    if not counts_by_paper:
        return None

    # Papers never clicked get a target of 0 so the feature/target lists align.
    x = [vectorize_paper(paper) for paper in item.papers]
    y = [counts_by_paper.get(str(paper.id), 0) for paper in item.papers]

    try:
        regressor = tree.DecisionTreeRegressor()
        regressor.fit(x, y)
        return regressor
    except Exception:
        # Log the failure (with traceback) instead of print()-debugging and
        # silently falling through; still return None as before.
        logging.exception("train_model: failed to fit regressor (x=%r, y=%r)", x, y)
        return None
Esempio n. 2
0
    def get_scores(self):
        """Return this keyword's papers sorted by predicted click count (descending).

        Loads the pickled regressor stored on the matching ``SearchItem``,
        predicts a score for each paper, and returns the papers as a list in
        best-first order. Returns ``None`` when no item/model exists or
        prediction fails.
        """
        try:
            item = SearchItem.objects(keyword=self.keyword).get()
            if not item.model:
                # Original code fell through here with `regressor` unbound,
                # raising a NameError that the broad except silently ate.
                return None
            # NOTE(review): pickle.loads must only ever see data this app
            # wrote itself — unpickling untrusted bytes executes code.
            regressor = pickle.loads(item.model)
            papers = item.papers
            x = [vectorize_paper(paper) for paper in papers]
            y = regressor.predict(x)
            # Sort indices by score, highest first; always return a list so
            # callers get a consistent type even for a single paper
            # (itemgetter(*one_index) used to return the bare paper).
            order = sorted(range(len(papers)), key=lambda i: y[i], reverse=True)
            return [papers[i] for i in order]
        except Exception as e:
            logging.debug(e)
            return None

    # def add_missing_info(self):
    #     self.add_journal_if_self()

    # # TODO jounal if
    # def add_journal_if_self(self):
    #     for k,v in self.papers.items():
    #         if 'Journal' not in v or not v['Journal']:
    #             v["Journal_IF"] = 0
    #             continue
    #         try:
    #             stripped_journal_name = re.sub('[\W_]+', '', v["Journal"].upper())
    #             v["Journal_IF"] = Journal.get(name==stripped_journal_name).impact_factor
    #         except Exception as e:
    #             try:
    #                 if len(stripped_journal_name) >= 16:
    #                     v["Journal_IF"] = Journal.get(
    #                         name.startswith(stripped_journal_name[:16])).impact_factor
    #                 if len(stripped_journal_name) >= 12:
    #                     v["Journal_IF"] = Journal.get(name.startswith(stripped_journal_name[:12])).impact_factor
    #                 elif len(stripped_journal_name) >= 8:
    #                     v["Journal_IF"] = Journal.get(name.startswith(stripped_journal_name[:8])).impact_factor
    #                 elif len(stripped_journal_name) >= 4:
    #                     v["Journal_IF"] = Journal.get(name.startswith(stripped_journal_name[:4])).impact_factor
    #                 else:
    #                     v["Journal_IF"] = 0
    #             except Exception as e:
    #                 v["Journal_IF"] = 0

    # @staticmethod
    # def add_journal_if(paper_list):
    #     for paper in paper_list:
    #         if 'Journal' not in paper or not paper['Journal']:
    #             paper["Journal_IF"] = 0
    #             continue
    #         try:
    #             stripped_journal_name = re.sub('[\W_]+', '', paper["Journal"].upper())
    #             paper["Journal_IF"] = Journal.get(Journal.title==stripped_journal_name).impact_factor
    #         except DoesNotExist:
    #             try:
    #                 if len(stripped_journal_name) >= 16:
    #                     paper["Journal_IF"] = Journal.get(
    #                         Journal.title.startswith(stripped_journal_name[:16])).impact_factor
    #                 if len(stripped_journal_name) >= 12:
    #                     paper["Journal_IF"] = Journal.get(Journal.title.startswith(stripped_journal_name[:12])).impact_factor
    #                 elif len(stripped_journal_name) >= 8:
    #                     paper["Journal_IF"] = Journal.get(Journal.title.startswith(stripped_journal_name[:8])).impact_factor
    #                 elif len(stripped_journal_name) >= 4:
    #                     paper["Journal_IF"] = Journal.get(Journal.title.startswith(stripped_journal_name[:4])).impact_factor
    #                 else:
    #                     paper["Journal_IF"] = 0
    #             except DoesNotExist:
    #                 paper["Journal_IF"] = 0

    # def ranking(self):
    #     model = self.check_model()
    #     if model:
    #         clf = model[0]
    #         number_clicks = model[1]
    #         maximum_ml_score = -1
    #         for k,v in self.papers.items():
    #             if "Journal_IF" in v and "Year" in v:
    #                 x = [[v["Year"], v["Journal_IF"]]]
    #                 score_ml = clf.predict(x)[0]
    #                 v["Score_ML"] = score_ml
    #                 if score_ml > maximum_ml_score:
    #                     maximum_ml_score = score_ml
    #                 weight = 1 - math.pow(0.5, 0.1*number_clicks)
    #                 v["Weight"] = weight
    #         for k,v in self.papers.items():
    #             if "Score_ML" in v:
    #                 v["Score_ML"] *= 1 / maximum_ml_score
    #                 # logging.debug("{}: {}".format(v["Title"], v["Score_ML"]))
    #                 v["Score"] = v["Score"]*(1-v["Weight"]) + v["Score_ML"]*v["Weight"]
    #     else:
    #         pass

    # TODO: models
    # def check_model(self):
    #     ALWAYS_CREATE_NEW_MODEL_AND_DONT_SAVE = True
    #     if ALWAYS_CREATE_NEW_MODEL_AND_DONT_SAVE:
    #         new_model = self.train_model()
    #         return new_model
    #     try:
    #         search_term = SearchTerm.get(SearchTerm.keyword == self.keyword)
    #         model = Model.get(Model.search_term == search_term)
    #         if datetime.datetime.now() - model.last_modified > datetime.timedelta(days = 1):
    #             new_model = self.train_model()
    #             if new_model:
    #                 model.model = pickle.dumps(new_model)
    #                 model.last_modified = datetime.datetime.now()
    #                 model.save()
    #             return new_model
    #         else:
    #             return pickle.loads(model.model)
    #     except DoesNotExist:
    #         new_model = self.train_model()
    #         if new_model:
    #             Model.create(
    #                 search_term = SearchTerm.get(SearchTerm.keyword == self.keyword),
    #                 model = pickle.dumps(new_model)
    #             )
    #         return new_model

    # def train_model(self):
    #     x, y = [], []
    #     #clicks = SearchTerm.get(SearchTerm.keyword == self.keyword).clicks
    #     clicks = Click.select(Paper, Click).join(Paper).switch(Click).join(SearchTerm).where(SearchTerm.keyword == self.keyword)
    #     if clicks.count() == 0:
    #         return False
    #     for click in clicks:
    #         x.append(
    #             [
    #                 click.paper.year,
    #                 click.paper.journal_if
    #             ]
    #         )
    #         y.append(click.click_count)
    #     #clf = svm.SVR(kernel="rbf")
    #     #clf.fit(x, y)
    #     #return [clf, sum(y)]
    #     gp = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
    #     gp.fit(x, y)
    #     return [gp, sum(y)]