def top_year(year, connection):
    """Return the 25 most-downloaded articles posted in a given year.

    Arguments:
      - year: The calendar year to rank articles for.
      - connection: a database Connection object.
    Returns:
      - A list of SearchResultArticle objects ordered by total PDF
        downloads, descending (empty list if there are no results).
    """
    # FIX: the year was previously interpolated via '%s-01-01' INSIDE quoted
    # SQL string literals; DB-driver parameter substitution quotes each value
    # itself, so a placeholder inside quotes produces malformed SQL. Build the
    # full date-range strings in Python and pass them as normal parameters.
    resp = connection.read("""
        SELECT SUM(t.pdf) as downloads, t.article, a.url, a.title, a.abstract, a.collection, a.posted, a.doi
        FROM article_traffic t
        INNER JOIN articles a ON t.article=a.id
        WHERE t.year = %s
          AND a.posted >= %s
          AND a.posted <= %s
        GROUP BY 2,3,4,5,6,7,8
        ORDER BY 1 DESC
        LIMIT 25
        """, (year, f"{year}-01-01", f"{year}-12-31"))
    if len(resp) == 0:
        return []
    # Wrap each raw result row in the project's search-result model.
    return [models.SearchResultArticle(a, connection) for a in resp]
def paper_query(q, categories, timeframe, metric, page, page_size, connection):
    """Returns the most downloaded papers that meet a given set of constraints.

    Arguments:
      - q: A search string to compare against article abstracts,
          titles and author names. (Title matches are weighted more heavily.)
      - categories: A list of bioRxiv categories the results can be in.
      - timeframe: A description of the range of dates on which to base
          the rankings (i.e. "alltime" or "lastmonth")
      - metric: Which article-level statistic to use when sorting results
          ("downloads" or "twitter")
      - page: Which page of the results to display (0-indexed)
      - page_size: How many entries should be returned
      - connection: a database Connection object.
    Returns:
      - A tuple (results, total): a list of SearchResultArticle objects that
        meet the search criteria, sorted by the specified metric in
        descending order, plus the total number of matches across all pages.
    """
    # We build two queries, 'select' and 'countselect': one to get the
    # current page of results, and one to figure out the total number
    # of results.
    select = "SELECT "
    if metric == "downloads":
        select += "r.downloads"
    elif metric == "twitter":
        select += "SUM(r.count)"
    select += ", a.id, a.url, a.title, a.abstract, a.collection, a.posted, a.doi"
    countselect = "SELECT COUNT(DISTINCT a.id)"

    params = ()
    query = ""
    if q != "":  # if there's a text search specified
        params = (q, )
    # NOTE(review): schema name is interpolated into the SQL; assumed to come
    # from trusted server config, not user input — confirm.
    query += f' FROM {config.db["schema"]}.articles AS a INNER JOIN {config.db["schema"]}.'
    # Pick the stats table to join against: tweet counts come from
    # crossref_daily; download ranks come from a timeframe-specific table.
    if metric == "twitter":
        query += "crossref_daily"
    elif metric == "downloads":
        query_times = {
            "alltime": "alltime_ranks",
            "ytd": "ytd_ranks",
            "lastmonth": "month_ranks",
        }
        query += query_times[timeframe]
    if metric == "twitter":
        query += " AS r ON r.doi=a.doi"
    elif metric == "downloads":
        query += " AS r ON r.article=a.id"
    if q != "":
        # Full-text search: combine title/abstract/author vectors with
        # descending weights so title matches rank highest.
        query += """, plainto_tsquery(%s) query,
            coalesce(setweight(a.title_vector, 'A') ||
                     setweight(a.abstract_vector, 'C') ||
                     setweight(a.author_vector, 'D')) totalvector
        """
    # add a WHERE clause if we need one:
    # (all-time twitter stats don't require it)
    if metric == "downloads" or (metric == "twitter" and timeframe != "alltime") or len(categories) > 0:
        query += " WHERE "
    # Each condition appends its own trailing " AND " only when a later
    # condition is known to follow — keeping the clause list well-formed.
    if metric == "downloads":
        query += "r.downloads > 0"
        if q != "" or len(categories) > 0:
            query += " AND "
    if q != "":
        query += "query @@ totalvector "
        if len(categories) > 0 or (metric == "twitter" and timeframe != "alltime"):
            query += " AND "
    if len(categories) > 0:
        query += "collection=ANY(%s)"
        # The categories parameter slots in after the search string, if any.
        if q != "":
            params = (q, categories)
        else:
            params = (categories, )
        if metric == "twitter" and timeframe != "alltime":
            query += " AND "
    if metric == "twitter" and timeframe != "alltime":
        # Recent-tweets window; interval length comes from a fixed lookup
        # table, so the f-string interpolation is safe here.
        query += "r.source_date > now() - interval "
        query_times = {"day": 2, "week": 7, "month": 30, "year": 365}
        query += f"'{query_times[timeframe]} days' "

    # this is the last piece of the query we need for the one
    # that counts the total number of results
    countselect += query
    resp = connection.read(countselect, params)
    total = resp[0][0]

    # continue building the query to get the full list of results:
    if metric == "twitter":
        query += " GROUP BY a.id"
    query += " ORDER BY "
    if metric == "downloads":
        query += "r.rank ASC"
    elif metric == "twitter":
        query += "SUM(r.count) DESC"
    # NOTE(review): page/page_size are interpolated directly; assumed to be
    # validated integers upstream — confirm against the request handler.
    query += f" LIMIT {page_size}"
    if page > 0:
        query += f" OFFSET {page * page_size}"
    query += ";"
    select += query
    result = connection.read(select, params)
    results = [models.SearchResultArticle(a, connection) for a in result]
    return results, total