def fetch_content(query_text, column, text_column, page, sources=None): session = g.session tsq = build_tsquery(query_text) search = None if column == db.WebPages.title: query = session \ .query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \ .filter( \ func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english') \ ) \ .order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc()) elif column == db.WebPages.tsv_content: query = session \ .query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq))) \ .filter( column.match(tsq) ) if "'" in query_text or '"' in query_text: search = query_text.replace("!", " ").replace("?", " ").replace( "'", " ").replace('"', " ").replace(',', " ").replace('.', " ").strip() while " " in search: search = search.replace(" ", " ") search = search.strip() search = '%{}%'.format(search.lower()) query = query.filter(func.lower(text_column).like(search)) query = query.order_by( func.ts_rank_cd(column, func.to_tsquery(tsq)).desc()) if sources: query = query.filter(db.WebPages.netloc.in_(sources)) else: raise ValueError("Wat?") print(str(query.statement.compile(dialect=postgresql.dialect()))) print("param: '%s', '%s', '%s'" % (tsq, sources, search)) try: entries = paginate(query, page, per_page=50) except sqlalchemy.exc.ProgrammingError: traceback.print_exc() print("ProgrammingError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.InternalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.OperationalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise return entries
def fetch_content(query_text, column, text_column, page, sources=None): session = g.session tsq = build_tsquery(query_text) search = None if column == db.WebPages.title: query = session \ .query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \ .filter( \ func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english') \ ) \ .order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc()) elif column == db.WebPages.tsv_content: query = session \ .query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq))) \ .filter( column.match(tsq) ) if "'" in query_text or '"' in query_text: search = query_text.replace("!", " ").replace("?", " ").replace("'", " ").replace('"', " ").replace(',', " ").replace('.', " ").strip() while " " in search: search = search.replace(" ", " ") search = search.strip() search = '%{}%'.format(search.lower()) query = query.filter( func.lower(text_column).like(search) ) query = query.order_by(func.ts_rank_cd(column, func.to_tsquery(tsq)).desc()) if sources: query = query.filter(db.WebPages.netloc.in_(sources)) else: raise ValueError("Wat?") print(str(query.statement.compile(dialect=postgresql.dialect()))) print("param: '%s', '%s', '%s'" % (tsq, sources, search)) try: entries = paginate(query, page, per_page=50) except sqlalchemy.exc.ProgrammingError: traceback.print_exc() print("ProgrammingError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.InternalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.OperationalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise return entries
def fetch_content(query_text, column, page, sources=None): session = db.get_session() tsq = build_tsquery(query_text) if column == db.WebPages.title: query = session \ .query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \ .filter( \ func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english') \ ) \ .order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc()) elif column == db.WebPages.tsv_content: query = session \ .query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq))) \ .filter( column.match(tsq) ) \ .order_by(func.ts_rank_cd(column, func.to_tsquery(tsq)).desc()) if sources: query = query.filter(db.WebPages.netloc.in_(sources)) else: raise ValueError("Wat?") # print(str(query.statement.compile(dialect=postgresql.dialect()))) # print("param: '%s'" % tsq) try: entries = paginate(query, page, per_page=50) except sqlalchemy.exc.ProgrammingError: traceback.print_exc() print("ProgrammingError - Rolling back!") db.get_session().rollback() raise except sqlalchemy.exc.InternalError: traceback.print_exc() print("InternalError - Rolling back!") db.get_session().rollback() raise except sqlalchemy.exc.OperationalError: traceback.print_exc() print("InternalError - Rolling back!") db.get_session().rollback() raise return entries
def fetch_all_by_tsquery(self, tsquery, *expr, **kwargs): """ Returns all structures whose abstract matches the keywords given in the keyword string. Parameters ---------- tsquery : str A string containing the lexme that will be used for searching. The string will be automatically casted into a tsquery through the to_tsquery() function. weights : list, default=[0.1, 0.2, 0.4, 1.0] The weights specify how heavily to weigh each category of word, in the order [D-weight, C-weight, B-weight, A-weight]. normalization : int, default=32 Option that specifies whether and how a document's length should impact its rank. The integer option controls several behaviors, so it is a bit mask: you can specify one or more behaviors using | (for example, 2|4). - 0 ignores the document length - 1 divides the rank by 1 + the logarithm of the document length - 2 divides the rank by the document length - 4 divides the rank by the mean harmonic distance between extents (this is implemented only by ts_rank_cd) - 8 divides the rank by the number of unique words in document - 16 divides the rank by 1 + the logarithm of the number of unique words in document - 32 divides the rank by itself + 1 min_words : int, default=10 max_words : int, default=25 Returns ------- resultset : list List containing tuples in the form (Structure, snippet, rank). The snippet is the part of the abstract that matches the query. Ordered by rank. Notes ----- - http://www.postgresql.org/docs/current/interactive/datatype-textsearch.html Examples -------- >>> StructureAdaptor().fetch_all_by_tsquery('imatinib') [(<Structure(3FW1)>, u'[Imatinib] represents the first in a class', 0.54545500000000002), (<Structure(2GQG)>, u'[imatinib], an inhibitor of BCR-ABL. Although', 0.375), (<Structure(3CS9)>, u'[imatinib] against [imatinib]-resistant Bcr-Abl. Consistent', 0.33333299999999999), (<Structure(2HYY)>, u'[imatinib] as first-line therapy', 0.230769), (<Structure(3G0F)>, u'[imatinib] mesylate and sunitinib malate', 0.230769), (<Structure(3G0E)>, u'[imatinib] mesylate and sunitinib malate', 0.230769), (<Structure(1T46)>, u'[Imatinib] or Gleevec) demonstrates that', 0.090909100000000007)] """ weights = kwargs.get('weights', [0.1, 0.2, 0.4, 1.0]) normalization = kwargs.get('normalization', 32) min_words, max_words = kwargs.get('min_words', 10), kwargs.get('max_words', 25) if kwargs.get('plain'): tsquery = func.plainto_tsquery('english', tsquery) else: tsquery = func.to_tsquery('english', tsquery) # basic query construct query = self.query.join('XRefs').filter(and_(*expr)) query = query.join( citations, and_(citations.c.pubmed_id == cast(XRef.xref, Integer), XRef.source == 'PubMed')) # GIST index that is used for searching index = func.to_tsvector('english', citations.c.abstract).op('@@')(tsquery) query = query.filter(index).order_by("rank DESC") # calculated rank of the hit rank = func.ts_rank_cd( weights, func.to_tsvector('english', citations.c.abstract), tsquery, normalization).label('rank') query = query.add_column(rank) if kwargs.get('snippet'): # configuration of the preview snippet headline_conf = 'StartSel=[, StopSel=], MaxWords={0}, MinWords={1}'.format( max_words, min_words) # function to create a preview snippet of the matched area snippet = func.ts_headline('english', citations.c.abstract, tsquery, headline_conf).label('snippet') # add snippet column to query query = query.add_column(snippet) return query