Ejemplo n.º 1
0
def _build_tsv(A, B=None, C=None, D=None):
    """
    Build a weighted tsvector expression from lists of text fragments.

    Parameters
    ----------
    A : list
        Fragments indexed with weight 'A' (always present).
    B, C, D : list, optional
        Fragments indexed with weights 'B', 'C', and 'D' respectively.
        Each group is skipped entirely when empty or None.

    Returns
    -------
    A SQLAlchemy expression concatenating one setweight(to_tsvector(...))
    term per non-empty group, in A/B/C/D order.

    Note: the original signature used mutable default arguments
    (``B=[]`` ...); ``None`` defaults are equivalent here and avoid the
    shared-mutable-default pitfall.
    """
    def _weighted(bits, weight):
        # NULL-safe: each fragment is coalesced to '' before joining.
        return func.setweight(
            func.to_tsvector(
                REGCONFIG,
                _join_with_space([func.coalesce(bit, "") for bit in bits])),
            weight)

    tsv = _weighted(A, "A")
    # Append the optional groups in fixed weight order.
    for bits, weight in ((B, "B"), (C, "C"), (D, "D")):
        if bits:
            tsv = tsv.concat(_weighted(bits, weight))
    return tsv
Ejemplo n.º 2
0
 def __table_args__(self):
     """Declare GIN full-text indexes over title and title+description.

     NOTE(review): written as a method — SQLAlchemy normally expects
     ``__table_args__`` as a class attribute or ``@declared_attr``;
     confirm the enclosing class handles this form.
     """
     return (
         # GIN index on the tsvector of the title alone.
         db.Index('idx_tsv_title',
                  func.to_tsvector("english", self.title),
                  postgresql_using="gin"),
         # Combined index over title plus the (NULL-safe) description.
         db.Index('idx_tsv_title_description',
                  func.to_tsvector(
                      "english",
                      self.title + ' ' + coalesce(self.description, '')),
                  postgresql_using="gin"),
     )
Ejemplo n.º 3
0
    def build_query(params, full=False):
        """Build a SELECT over the users table driven by *params*.

        ``params`` may contain 'email' (exact match on email_address) and
        'name' (prefix full-text search, results ranked by relevance).
        When *full* is True all user columns are selected; otherwise only
        the id column is.
        """
        if full:
            columns = [users.c.id, users.c.name, users.c.email_address]
        else:
            columns = [users.c.id]

        query = select(columns).order_by(users.c.id)

        if 'email' in params:
            query = query.where(users.c.email_address == params['email'])

        if 'name' in params:
            # Turn each word into a prefix term and AND them together.
            words = params['name'].split(' ')
            search_string = " & ".join("{}:*".format(w) for w in words)
            name_vector = func.to_tsvector(users.c.name)

            query = query.where(name_vector.match(search_string))
            # Rank order is applied in addition to the id ordering above.
            query = query.order_by(
                desc(func.ts_rank(name_vector, func.to_tsquery(search_string)))
            )

        # if 'page_size' in params:
        #     query = query.limit(params['page_size'])
        return query
Ejemplo n.º 4
0
class Goods(Base):
    """SQLAlchemy model for goods belonging to an invoice.

    Carries physical dimensions, a free-text description (full-text
    indexed), and a restrict-on-update/delete FK to invoices.num.
    """

    __tablename__ = 'goods'

    id = Column(Integer, primary_key=True)
    # Physical attributes; all required.
    height = Column(Integer, nullable=False)
    width = Column(Integer, nullable=False)
    depth = Column(Integer, nullable=False)
    weight = Column(Integer, nullable=False)
    description = Column(Text)
    invoice_num = Column(Integer,
                         ForeignKey('invoices.num',
                                    onupdate='restrict',
                                    ondelete='restrict'),
                         nullable=False)

    # Many-to-one to Invoice; the backref exposes invoice.goods.
    invoice = relationship("Invoice", backref="goods")

    __table_args__ = (
        # GIN full-text index over the English tsvector of description.
        Index('goods_descriptions_index',
              func.to_tsvector('english', description),
              postgresql_using='gin'),
        # Hash indexes support equality lookups only.
        Index('height_index', height, postgresql_using='hash'),
        Index('invoice_num_index', invoice_num, postgresql_using='hash'),
    )

    def __str__(self):
        return f"Goods [id={self.id}, height={self.height}, width={self.width}, depth={self.depth}, " \
               f"weight={self.weight}, description={self.description}, invoice_num={self.invoice_num}]"
Ejemplo n.º 5
0
class FoodDetail(db.Model, ReturnHelper):
    """
    Daily Bites food_detail table - provides details for food recorded
    """

    __tablename__ = FOOD_DETAIL_TABLE

    # PK doubles as FK into nut_per_100_gram.
    food_id = db.Column(db.String(36),
                        db.ForeignKey("nut_per_100_gram.food_id"),
                        primary_key=True)
    food_desc = db.Column(db.String(256))
    barcode = db.Column(db.String(50))
    brand = db.Column(db.String(256))

    __table_args__ = (db.Index(
        # GIN full-text index over the English tsvector of food_desc.
        "ix_food_detail_food_desc",
        func.to_tsvector("english", food_desc),
        postgresql_using="gin",
    ), )

    # Field sets consumed by ReturnHelper views, from minimal to full.
    min_fields = {"food_id"}
    search_result_fields = {*min_fields, "barcode", "brand"}
    all_fields = {*search_result_fields, "food_desc"}

    @classmethod
    def get_food_detail_by_id(cls, food_id):
        """Return the FoodDetail row for *food_id*, or None if absent."""
        return cls.query.filter_by(food_id=food_id).first()
Ejemplo n.º 6
0
def search_foods(search_query):
    """Full-text search food descriptions for *search_query*.

    Returns a list of search-result views for every matching FoodDetail.
    """
    ts_query = func.plainto_tsquery("english", search_query)
    ts_vector = func.to_tsvector("english", FoodDetail.food_desc)
    matches = FoodDetail.query.filter(ts_vector.op("@@")(ts_query))
    return [match.search_result_view() for match in matches.all()]
Ejemplo n.º 7
0
def tokenize():
    """Recreate the database and populate each review's tokens column.

    Stores an English ``to_tsvector`` expression of every review's text
    in its ``tokens`` column and commits once at the end.
    """
    recreate_database()
    s = Session()
    try:
        for review in s.query(Reviews).all():
            # Server-side expression; evaluated by Postgres on commit.
            review.tokens = func.to_tsvector('english', review.review)
        s.commit()
    finally:
        # Original leaked the session on failure; always release it.
        s.close()
Ejemplo n.º 8
0
def add_simple_text_search(query, text_columns, keywords, include_rank=True):
    """Add a full-text filter over *text_columns* to *query*.

    Parameters
    ----------
    query : SQLAlchemy query to extend.
    text_columns : columns searched with the 'simple' FTS configuration.
    keywords : iterable of terms, AND-ed together ('a & b & ...').
    include_rank : when True, a summed ts_rank expression labelled
        'score' is added as an extra column.

    Returns
    -------
    (query, rank) — rank is the combined ts_rank expression, or None
    when include_rank is False.
    """
    rank = None
    keywords_j = ' & '.join(keywords)
    fts_config = 'simple'
    filters = [
        func.to_tsvector(fts_config, column).match(keywords_j)
        for column in text_columns
    ]
    # A row matches when ANY searched column matches. (Renamed from
    # ``filter``, which shadowed the builtin.)
    if len(filters) > 1:
        text_filter = or_(*filters)
    else:
        text_filter = filters[0]
    query = query.filter(text_filter)
    if include_rank:
        # Score is the sum of per-column ts_rank values.
        ranks = [
            func.ts_rank(func.to_tsvector(fts_config, column),
                         func.to_tsquery(fts_config, keywords_j))
            for column in text_columns
        ]
        rank = reduce(lambda a, b: a + b, ranks)
        query = query.add_column(rank.label('score'))
    return query, rank
Ejemplo n.º 9
0
	def upsertReponseContent(self, job, response):
		"""Store fetched page content on *job* and mark it complete.

		Retries the whole update indefinitely on transient SQLAlchemy
		errors (OperationalError / InvalidRequestError), rolling back
		the session between attempts.
		"""
		while 1:
			try:

				# If we have already fetched the page, push what we have back
				# into the history table.
				last = None
				if job.content:
					last = self.pushBackHistory(job, response)

				job.title    = response['title']
				job.content  = response['contents']
				job.mimetype = response['mimeType']

				# Update the tsv_content column if we have data for it.
				if response['contents']:
					job.tsv_content = func.to_tsvector(func.coalesce(response['contents']))

				# Flag text mimetypes (e.g. text/html, text/plain).
				if "text" in job.mimetype:
					job.is_text  = True
				else:
					job.is_text  = False

				job.state    = 'complete'

				# Disabled for space-reasons.
				# if 'rawcontent' in response:
				# 	job.raw_content = response['rawcontent']

				job.fetchtime = datetime.datetime.now()

				self.db.get_session().commit()
				self.log.info("Marked plain job with id %s, url %s as complete!", job.id, job.url)
				break
			except sqlalchemy.exc.OperationalError:
				# Transient DB error: roll back and retry the loop.
				self.db.get_session().rollback()
			except sqlalchemy.exc.InvalidRequestError:
				self.db.get_session().rollback()
Ejemplo n.º 10
0
    def score_result(self, searchterm=None):
        """Return the highest match score for *searchterm*, or 0 if none.

        Technical debt here: the identifiers in the indexed document are hard-coded
        into the scoring loop
        https://github.com/Chemical-Curation/resolver/pull/16#discussion_r536216962
        """
        matches = self.get_matches(searchterm)
        if not matches:
            return 0
        # matches maps keys to scores; the best score is the max value.
        return max(matches.values())


# GIN index over the 'values' key of the identifiers JSONB column,
# using jsonb_path_ops for containment (@>) queries.
ix_identifiers = db.Index(
    "ix_identifiers",
    Substance.identifiers,
    text("(identifiers->'values') jsonb_path_ops"),
    postgresql_using="gin",
)
# GIN full-text index over the English tsvector of the same column.
ix_identifiers_tsv = db.Index(
    "ix_identifiers_tsv",
    Substance.identifiers,
    func.to_tsvector("english", Substance.identifiers),
    postgresql_using="gin",
)

if __name__ == "__main__":
    # Create both indexes directly when run as a script.
    ix_identifiers.create(bind=db.engine)
    ix_identifiers_tsv.create(bind=db.engine)
Ejemplo n.º 11
0
def search_mv_foods(search_query):
    """Full-text search the nutrition materialized view's descriptions.

    Returns search-result views for every matching TotalNutritionView row.
    """
    ts_vector = func.to_tsvector("english", TotalNutritionView.food_desc)
    ts_query = func.plainto_tsquery("english", search_query)
    matching = TotalNutritionView.query.filter(ts_vector.op("@@")(ts_query))
    return [row.search_result_view() for row in matching.all()]
Ejemplo n.º 12
0
async def scrape_post(post_id, feed_id, loop, page_number):
    """Scrape a Hacker News post page and persist its comments.

    Fetches the post's HTML (optionally a specific pagination page),
    schedules follow-up tasks for "More" pages, and inserts any comments
    not already in the database together with per-feed rank rows.
    NOTE(review): the session opened here is never explicitly closed —
    presumably relies on Session lifecycle elsewhere; verify.
    """
    # Connect to database
    session = models.Session()

    # Get current UTC time in seconds
    now = int(datetime.utcnow().strftime('%s'))

    # Get HTML tree from post's webpage, specifying page number if given
    if page_number:
        post_html = requests.get('https://news.ycombinator.com/item?id=' +
                                 str(post_id) + '&p=' + str(page_number))

    else:
        post_html = requests.get('https://news.ycombinator.com/item?id=' +
                                 str(post_id))

    post_content = post_html.content

    post_soup = BeautifulSoup(post_content, 'html.parser')

    # If post page contains a "More" link to more comments, create asynchronous
    # task to scrape that page for its comments
    if (post_soup.find('a', 'morelink')):
        page_number = post_soup.find('a',
                                     'morelink').get('href').split('&p=')[1]

        loop.create_task(scrape_post(post_id, feed_id, loop, page_number))

    # Get all comment rows from HTML tree
    comment_rows = post_soup.select('tr.athing.comtr')

    # Set starting comment feed rank to 0
    comment_feed_rank = 0

    for comment_row in comment_rows:
        # Get comment id
        comment_id = comment_row.get('id')

        # Check if comment exists in database
        comment_exists = session.query(
            models.Comment.id).filter_by(id=comment_id).scalar()

        # Get core comment data if it is not in database already
        if not comment_exists:
            # If comment has content span, get text from span
            if comment_row.find('div', 'comment').find_all('span'):
                comment_content = comment_row.find(
                    'div', 'comment').find_all('span')[0].get_text()

                # Remove the last word ('reply') from the comment content
                # and strip trailing whitespace
                comment_content = comment_content.rsplit(' ', 1)[0].strip()

                total_word_count = len(comment_content.split())

            # Otherwise, comment is flagged, so get flagged message as text
            # and strip trailing whitespace
            else:
                comment_content = comment_row.find(
                    'div', 'comment').get_text().strip()

                total_word_count = 0

            # Get UTC timestamp for comment's posting time by subtracting
            # the number of days/hours/minutes ago given on the webpage from
            # the current UTC timestamp
            comment_time_unit = comment_row.find('span',
                                                 'age').a.get_text().split()[1]

            if 'day' in comment_time_unit:
                comment_created = now - 86400 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])

            elif 'hour' in comment_time_unit:
                comment_created = now - 3600 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])

            else:
                comment_created = now - 60 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])

            comment_created = time.strftime('%Y-%m-%d %H:%M',
                                            time.localtime(comment_created))

            # Get comment's level in tree by getting indentation width
            # value divided by value of one indent (40px)
            level = int(
                comment_row.find('td', 'ind').contents[0].get('width')) / 40

            # Set parent comment as blank if comment is the top-level
            # comment
            if level == 0:
                parent_comment = None

            # Otherwise, get preceding comment in comment tree
            else:
                parent_comment = session.query(models.Comment).with_entities(
                    models.Comment.id).join(models.FeedComment).filter(
                        models.Comment.level == (level - 1)
                    ).filter(models.FeedComment.feed_id == feed_id).filter(
                        models.Comment.post_id == post_id).order_by(
                            models.FeedComment.feed_rank).limit(1).one()[0]

            # Get username of user who posted comment
            try:
                comment_username = comment_row.find('a', 'hnuser').get_text()

            except AttributeError:
                comment_username = ''

            # Add scraped comment data to database
            # NOTE(review): 'simple_english' is a custom text-search
            # configuration — must exist in the database beforehand.
            comment = models.Comment(content=comment_content,
                                     created=comment_created,
                                     id=comment_id,
                                     level=level,
                                     parent_comment=parent_comment,
                                     post_id=post_id,
                                     total_word_count=total_word_count,
                                     username=comment_username,
                                     word_counts=func.to_tsvector(
                                         'simple_english',
                                         comment_content.lower()))

            session.add(comment)

        # Increment comment feed rank to get current comment's rank
        comment_feed_rank += 1

        # Add feed-based comment data to database
        feed_comment = models.FeedComment(comment_id=comment_id,
                                          feed_id=feed_id,
                                          feed_rank=comment_feed_rank)

        session.add(feed_comment)

    session.commit()

    # Print message if there are no more pages of comments to scrape
    if not post_soup.find('a', 'morelink'):
        print('Post ' + str(post_id) + ' and its comments scraped')

    return
Ejemplo n.º 13
0
def casestudies_search():
    """Search case studies, returning Elasticsearch-shaped JSON hits.

    Reads search parameters from the request body (sort_dir, domains,
    seller_types, search_term) and query string (from, size, framework),
    builds a ranked full-text query over CaseStudy/Supplier data, and
    returns {'hits': {'total': N, 'hits': [{'_source': ...}, ...]}}.
    NOTE(review): ``sort_by`` is read but never used below — confirm
    whether that is intentional.
    """
    search_query = get_json_from_request()

    offset = get_nonnegative_int_or_400(request.args, 'from', 0)
    result_count = get_positive_int_or_400(
        request.args, 'size', current_app.config['DM_API_SUPPLIERS_PAGE_SIZE'])

    sort_dir = search_query.get('sort_dir', 'asc')
    sort_by = search_query.get('sort_by', None)
    domains = search_query.get('domains', None)
    seller_types = search_query.get('seller_types', None)
    search_term = search_query.get('search_term', None)
    framework_slug = request.args.get('framework', 'digital-marketplace')

    q = db.session.query(CaseStudy).join(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \
        .outerjoin(SupplierFramework).outerjoin(Framework)
    q = q.filter(
        Supplier.status != 'deleted',
        or_(Framework.slug == framework_slug, ~Supplier.frameworks.any()))
    tsquery = None
    if search_term:
        # Multi-word terms use plain parsing; single words get a prefix
        # (':*') match.
        if ' ' in search_term:
            tsquery = func.plainto_tsquery(search_term)
        else:
            tsquery = func.to_tsquery(search_term + ":*")
        # Second result column: highlighted snippet of approach + role.
        q = q.add_column(
            func.ts_headline(
                'english',
                func.concat(CaseStudy.data['approach'].astext, ' ',
                            CaseStudy.data['role'].astext), tsquery,
                'MaxWords=150, MinWords=75, ShortWord=3, HighlightAll=FALSE, FragmentDelimiter=" ... " '
            ))
    else:
        # Keep column positions stable when there is no search term.
        q = q.add_column("''")
    q = q.add_column(Supplier.name)
    q = q.add_column(postgres.array_agg(Supplier.data))
    q = q.group_by(CaseStudy.id, Supplier.name)

    if domains:
        d_agg = postgres.array_agg(cast(Domain.name, TEXT))
        q = q.having(d_agg.contains(array(domains)))

    if seller_types:
        # Keys of Supplier.data['seller_type'] whose value is truthy.
        selected_seller_types = select(
            [postgres.array_agg(column('key'))],
            from_obj=func.json_each_text(Supplier.data[('seller_type', )]),
            whereclause=cast(column('value'), Boolean)).as_scalar()

        q = q.filter(selected_seller_types.contains(array(seller_types)))

    if sort_dir in ('desc', 'z-a'):
        ob = [desc(CaseStudy.data['title'].astext)]
    else:
        ob = [asc(CaseStudy.data['title'].astext)]

    if search_term:
        # Relevance ordering takes precedence over the title ordering.
        ob = [
            desc(
                func.ts_rank_cd(
                    func.to_tsvector(
                        func.concat(
                            Supplier.name, CaseStudy.data['title'].astext,
                            CaseStudy.data['approach'].astext)), tsquery))
        ] + ob

        condition = func.to_tsvector(
            func.concat(Supplier.name, CaseStudy.data['title'].astext,
                        CaseStudy.data['approach'].astext)).op('@@')(tsquery)

        q = q.filter(condition)
    q = q.order_by(*ob)

    raw_results = list(q)
    results = []

    # Row layout: [0]=CaseStudy, [1]=headline, [2]=supplier name,
    # [3]=array_agg of supplier data.
    for x in range(len(raw_results)):
        result = raw_results[x][0].serialize()
        if raw_results[x][1] is not None and raw_results[x][1] != '':
            result['approach'] = raw_results[x][1]
        if raw_results[x][2] is not None:
            result['supplierName'] = raw_results[x][2]
        if raw_results[x][3] is not None and raw_results[x][3][0] is not None:
            result['seller_type'] = raw_results[x][3][0].get('seller_type')
        results.append(result)

    total_results = len(results)

    # Pagination happens in Python, after the full result set is built.
    sliced_results = results[offset:(offset + result_count)]

    result = {
        'hits': {
            'total': total_results,
            'hits': [{
                '_source': r
            } for r in sliced_results]
        }
    }

    try:
        response = jsonify(result), 200
    except Exception as e:
        response = jsonify(message=str(e)), 500

    return response
Ejemplo n.º 14
0
def create_tsvector(*args):
    """Concatenate *args* with single spaces and wrap the result in an
    English ``to_tsvector`` expression.

    Requires at least one argument; arguments may be strings or SQL
    column expressions (concatenation uses ``+``, not ``str.join``).
    """
    parts = iter(args)
    expression = next(parts)
    for part in parts:
        expression = expression + (' ' + part)
    return func.to_tsvector('english', expression)
Ejemplo n.º 15
0
def to_tsvector(*args):
    """Join the given fragments with spaces and build an English tsvector.

    At least one argument is required; fragments may be SQL expressions,
    so they are combined with ``+`` rather than ``str.join``.
    """
    combined = args[0]
    for fragment in args[1:]:
        combined = combined + (' ' + fragment)
    return func.to_tsvector('english', combined)
Ejemplo n.º 16
0
def initialize_test_database(postgresql, mock_get):
    """Set up a throwaway Postgres database seeded with sample data.

    Points the app's engine at the pytest-postgresql instance, runs the
    schema/migration setup, performs a mocked scrape for the past hour,
    then inserts sample feeds, posts, comments, and link rows for the
    'day', 'week', and 'all' time periods.
    """
    db_port = postgresql.dsn()['port']
    db_host = postgresql.dsn()['host']
    db_user = postgresql.dsn()['user']
    database = postgresql.dsn()['database']
    os.environ['DB_NAME'] = database

    os.environ['DB_CONNECTION'] = ('postgresql://' + db_user + '@' + db_host +
                                   ':' + str(db_port) + '/' + database)

    models.engine = create_engine(os.environ['DB_CONNECTION'])

    models.Session = sessionmaker(bind=models.engine)

    # Create custom text dictionary and database tables
    management.initialize_database()

    alembicArgs = [
        '--raiseerr',
        'upgrade',
        'head',
    ]

    alembic.config.main(argv=alembicArgs)

    # Run fake Hacker News scrape to get sample feed, post, and comment from
    # past hour
    hacker_news.scrape_loop()

    # Connect to database
    session = models.Session()

    # Add sample feed to database for remaining time periods ('day', 'week',
    # 'all')
    past_day_feed = models.Feed(id=2,
                                created=(datetime.combine(
                                    date.today(),
                                    datetime.min.time())).isoformat())
    session.add(past_day_feed)

    past_week_feed = models.Feed(id=3,
                                 created=(datetime.combine(
                                     date.today() - timedelta(days=5),
                                     datetime.min.time())).isoformat())
    session.add(past_week_feed)

    all_feed = models.Feed(id=4,
                           created=(datetime.combine(
                               date.today() - timedelta(weeks=2),
                               datetime.min.time())).isoformat())
    session.add(all_feed)

    # Add sample post to database for remaining time periods ('day', 'week',
    # 'all')
    posts_file = 'fixtures/test-post.json'
    with open(posts_file, 'r') as post_data:
        post = json.load(post_data)

    past_day_post = models.Post(created=(datetime.combine(
        date.today() - timedelta(hours=20), datetime.min.time())).isoformat(),
                                id=4,
                                link=post['link'],
                                title=post['title'],
                                type='job',
                                username=post['username'],
                                website=post['website'])

    session.add(past_day_post)

    past_week_post = models.Post(created=(datetime.combine(
        date.today() - timedelta(days=5), datetime.min.time())).isoformat(),
                                 id=5,
                                 link=post['link'],
                                 title=post['title'],
                                 type='ask',
                                 username=post['username'],
                                 website=post['website'])

    session.add(past_week_post)

    all_post = models.Post(created=(datetime.combine(
        date.today() - timedelta(weeks=2), datetime.min.time())).isoformat(),
                           id=6,
                           link=post['link'],
                           title=post['title'],
                           type='show',
                           username=post['username'],
                           website=post['website'])

    session.add(all_post)

    # Add sample feed_post data to database for each feed and post
    past_day_feed_post = models.FeedPost(feed_id=past_day_feed.id,
                                         feed_rank=3,
                                         point_count=3,
                                         post_id=past_day_post.id,
                                         comment_count=3)

    session.add(past_day_feed_post)

    past_week_feed_post = models.FeedPost(feed_id=past_week_feed.id,
                                          feed_rank=5,
                                          point_count=5,
                                          post_id=past_week_post.id,
                                          comment_count=5)

    session.add(past_week_feed_post)

    all_feed_post = models.FeedPost(feed_id=all_feed.id,
                                    feed_rank=7,
                                    point_count=7,
                                    post_id=all_post.id,
                                    comment_count=7)

    session.add(all_feed_post)

    # Add sample comment to database for remaining time periods ('day', 'week',
    # 'all')
    comment_file = 'fixtures/test-comment.json'
    with open(comment_file, 'r') as comment_data:
        comment = json.load(comment_data)

    # 'simple_english' is the custom text-search dictionary created by
    # management.initialize_database() above.
    past_day_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(hours=20),
                                  datetime.min.time())).isoformat(),
        id=6,
        level=0,
        parent_comment=None,
        post_id=past_day_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))

    session.add(past_day_comment)

    past_week_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(days=5),
                                  datetime.min.time())).isoformat(),
        id=7,
        level=0,
        parent_comment=None,
        post_id=past_week_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))

    session.add(past_week_comment)

    all_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(weeks=2),
                                  datetime.min.time())).isoformat(),
        id=8,
        level=0,
        parent_comment=None,
        post_id=all_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))

    session.add(all_comment)

    # Add sample feed_comment data to database for each feed and comment
    past_day_feed_comment = models.FeedComment(comment_id=past_day_comment.id,
                                               feed_id=past_day_feed.id,
                                               feed_rank=3)

    session.add(past_day_feed_comment)

    past_week_feed_comment = models.FeedComment(
        comment_id=past_week_comment.id,
        feed_id=past_week_feed.id,
        feed_rank=5)

    session.add(past_week_feed_comment)

    all_feed_comment = models.FeedComment(comment_id=all_comment.id,
                                          feed_id=all_feed.id,
                                          feed_rank=7)

    session.add(all_feed_comment)

    session.commit()

    session.close()
Ejemplo n.º 17
0
class ItemModel(db.Model):
    """Abstract base model for catalogue items stored as rows + GeoJSON.

    Concrete subclasses inherit a wide set of metadata columns; the full
    record is also cached as a GeoJSON document in ``item_geojson``.
    """

    __abstract__ = True

    id = db.Column('id', db.Text, nullable=False, primary_key=True)
    title = db.Column('title', db.Text, nullable=False, index=True)
    abstract = db.Column('abstract', db.Text, index=True)
    type = db.Column('type', db.Text, index=True)

    spatial_data_service_type = db.Column(
        'spatial_data_service_type', db.Text, index=True)
    spatial_data_service_version = db.Column(
        'spatial_data_service_version', db.Text, index=True)
    spatial_data_service_operations = db.Column('spatial_data_service_operations', ARRAY(db.Text), index=True)
    spatial_data_service_queryables = db.Column('spatial_data_service_queryables', ARRAY(db.Text), index=True)


    format = db.Column('format', db.Text, index=True)
    keywords = db.Column('keywords', ARRAY(JSONB))
    publisher_name = db.Column('publisher_name', db.Text, index=True)
    publisher_email = db.Column('publisher_email', db.Text, index=True)
    publisher_id = db.Column('publisher_id', db.Text, index=True)
    language = db.Column('language', db.Text, index=True)
    date_start = db.Column('date_start', db.Date, index=True)
    date_end = db.Column('date_end', db.Date, index=True)
    creation_date = db.Column('creation_date', db.Date, index=True)
    publication_date = db.Column('publication_date', db.Date, index=True)
    revision_date = db.Column('revision_date', db.Date, index=True)

    # Bounding polygon of the item's coverage area.
    geographic_location = db.Column('geographic_location', Geometry(
        geometry_type='POLYGON'), index=True)

    resource_locator = db.Column('resource_locator', db.Text, index=True)
    license = db.Column('license', db.Text, index=True)
    open_dataset = db.Column('open_dataset', db.Boolean, default=False)
    topic_category = db.Column('topic_category', ARRAY(db.Text), index=True)

    reference_system = db.Column('reference_system', db.Text, index=True)
    spatial_resolution = db.Column('spatial_resolution', db.Integer, index=True)
    # NOTE: attribute 'scales' maps to DB column 'scale'.
    scales = db.Column('scale', ARRAY(JSONB))
    version = db.Column('version', db.Text, index=True)
    conformity = db.Column('conformity', db.Text, index=True)
    additional_resources = db.Column(
        'additional_resources', ARRAY(JSONB))
    public_access_limitations = db.Column(
        'public_access_limitations', db.Text, index=True)

    metadata_language = db.Column('metadata_language', db.Text, index=True)
    metadata_point_of_contact_name = db.Column(
        'metadata_point_of_contact_name', db.Text, index=True)
    metadata_point_of_contact_email = db.Column(
        'metadata_point_of_contact_email', db.Text, index=True)
    metadata_date = db.Column('metadata_date', db.Date, index=True)
    metadata_version = db.Column('metadata_version', db.Text, index=True)
    resources = db.Column('resources', ARRAY(JSONB))
    lineage = db.Column('lineage', db.Text, index=True)
    parent_id = db.Column('parent_id', db.Text, index=True)
    parent_data_source_id = db.Column('parent_data_source_id', db.Text, index=True)

    # Full serialized record; rebuilt by update()/serialize().
    item_geojson = db.Column('item_geojson', JSONB)

    contract_template_id = db.Column('contract_template_id', db.Integer, index=True)
    contract_template_version = db.Column('contract_template_version', db.Text, index=True)

    pricing_models = db.Column('pricing_models', JSONB)
    statistics = db.Column('statistics', JSONB)
    delivery_method = db.Column('delivery_method', db.Text, index=True)

    responsible_party = db.Column('responsible_party', JSONB)

    automated_metadata = db.Column('automated_metadata', ARRAY(JSONB))

    harvested_from = db.Column('harvested_from', db.Text, index=True)
    harvest_json = db.Column('harvest_json', JSONB)

    use_only_for_vas = db.Column('use_only_for_vas', db.Boolean, default=False)
    vetting_required = db.Column('vetting_required', db.Boolean, default=False)
    ingestion_info = db.Column('ingestion_info', ARRAY(JSONB))

    created_at = db.Column('created_at', db.Date, index=True)
    submitted_at = db.Column('submitted_at', db.Date, index=True)
    accepted_at = db.Column('accepted_at', db.Date, index=True)

    visibility = db.Column('visibility', ARRAY(db.Text), index=True)

    suitable_for = db.Column('suitable_for', ARRAY(db.Text), index=True)

    # NOTE(review): class-level SQL expression, not a Column — built once
    # at class definition time over item_geojson; confirm how subclasses
    # consume it (e.g. in an index or query).
    ts_vector = func.to_tsvector('english', item_geojson)

    def update(self, id, data):
        """Apply a GeoJSON feature *data* onto this instance.

        Copies non-None properties onto matching attributes, converts an
        optional 'geometry' member to WKT for geographic_location, sets
        the id, and refreshes the cached item_geojson document.
        """
        properties = data.get('properties')
        if data.get('geometry') is not None:
            geom = data['geometry']
            self.geographic_location = shape(geom).wkt
        for key in properties:
            if properties[key] is not None:
                setattr(self, key, properties[key])
        self.id = id
        item_geojson = self.serialize()
        self.item_geojson = item_geojson
        return

    # convert to geojson format
    def serialize(self):
        """Return this row as a GeoJSON Feature dict.

        geographic_location becomes the feature geometry; dates are
        ISO-formatted; 'id' and 'item_geojson' are excluded from the
        properties object.
        """
        # build properties object
        p = {}
        geom = None
        for c in inspect(self).attrs.keys():
            attr = getattr(self, c)
            if c == 'geographic_location' and attr is not None:
                # Convert to a shapely Polygon object to get mapping
                if isinstance(attr, str):
                    g = shapely.wkt.loads(attr)
                    geom = mapping(g)
                else:
                    g = to_shape(attr)
                    geom = mapping(g)

            elif isinstance(attr, datetime.date):
                p[c] = attr.isoformat()
            elif c == 'id' or c == 'item_geojson':
                continue
            else:
                p[c] = attr
        # build geojson
        item_geojson = \
            {
                "id": self.id,
                "type": "Feature",
                "geometry": geom,
                "properties": p
            }
        return item_geojson
Ejemplo n.º 18
0
def casestudies_search():
    """Search case studies by keyword, domain and seller type.

    Reads the search body from the request JSON and pagination from the
    query string ('from'/'size'), and returns an Elasticsearch-shaped
    envelope: ``{'hits': {'total': N, 'hits': [{'_source': ...}]}}`` as a
    Flask ``(json, status)`` tuple.
    """
    search_query = get_json_from_request()

    offset = get_nonnegative_int_or_400(request.args, 'from', 0)
    result_count = get_positive_int_or_400(request.args, 'size', current_app.config['DM_API_SUPPLIERS_PAGE_SIZE'])

    sort_dir = search_query.get('sort_dir', 'asc')
    sort_by = search_query.get('sort_by', None)  # accepted but currently unused
    domains = search_query.get('domains', None)
    seller_types = search_query.get('seller_types', None)
    search_term = search_query.get('search_term', None)
    framework_slug = request.args.get('framework', 'digital-marketplace')

    q = db.session.query(CaseStudy).join(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \
        .outerjoin(SupplierFramework).outerjoin(Framework)
    # Exclude deleted suppliers; match the requested framework, or suppliers
    # with no framework association at all.
    q = q.filter(Supplier.status != 'deleted', or_(Framework.slug == framework_slug, ~Supplier.frameworks.any()))
    tsquery = None
    if search_term:
        # Multi-word input goes through plainto_tsquery (tolerant of
        # punctuation); a single word gets a prefix match.
        # NOTE(review): raw input in to_tsquery can raise a Postgres syntax
        # error on characters like '!' or '(' — consider sanitising.
        if ' ' in search_term:
            tsquery = func.plainto_tsquery(search_term)
        else:
            tsquery = func.to_tsquery(search_term + ":*")
        # Highlighted snippet of the matching text, for display.
        q = q.add_column(func.ts_headline(
            'english',
            func.concat(
                CaseStudy.data['approach'].astext,
                ' ',
                CaseStudy.data['role'].astext),
            tsquery,
            'MaxWords=150, MinWords=75, ShortWord=3, HighlightAll=FALSE, FragmentDelimiter=" ... " '
        ))
    else:
        q = q.add_column("''")
    q = q.add_column(Supplier.name)
    q = q.add_column(postgres.array_agg(Supplier.data))
    q = q.group_by(CaseStudy.id, Supplier.name)

    if domains:
        # Require the supplier to cover every requested domain.
        d_agg = postgres.array_agg(cast(Domain.name, TEXT))
        q = q.having(d_agg.contains(array(domains)))

    if seller_types:
        # Collect seller_type keys with truthy values from the supplier's
        # JSON data, then require all requested types to be present.
        selected_seller_types = select(
            [postgres.array_agg(column('key'))],
            from_obj=func.json_each_text(Supplier.data[('seller_type',)]),
            whereclause=cast(column('value'), Boolean)
        ).as_scalar()

        q = q.filter(selected_seller_types.contains(array(seller_types)))

    if sort_dir in ('desc', 'z-a'):
        ob = [desc(CaseStudy.data['title'].astext)]
    else:
        ob = [asc(CaseStudy.data['title'].astext)]

    if search_term:
        # Build the searchable tsvector once and reuse it for both ranking
        # and filtering.  Bug fix: separate the concatenated fields with
        # spaces so words at field boundaries are not fused into a single
        # token (the original concat produced e.g. "AcmeBuilding").
        searchable = func.to_tsvector(
            func.concat(Supplier.name, ' ',
                        CaseStudy.data['title'].astext, ' ',
                        CaseStudy.data['approach'].astext))
        # Rank matches first, then fall back to the title ordering.
        ob = [desc(func.ts_rank_cd(searchable, tsquery))] + ob

        q = q.filter(searchable.op('@@')(tsquery))
    q = q.order_by(*ob)

    raw_results = list(q)
    results = []

    for case_study, headline, supplier_name, supplier_datas in raw_results:
        result = case_study.serialize()
        if headline is not None and headline != '':
            result['approach'] = headline
        if supplier_name is not None:
            result['supplierName'] = supplier_name
        if supplier_datas is not None and supplier_datas[0] is not None:
            result['seller_type'] = supplier_datas[0].get('seller_type')
        results.append(result)

    # Pagination is applied in Python after materialising every row; the
    # envelope reports the unpaginated total, Elasticsearch-style.
    total_results = len(results)

    sliced_results = results[offset:(offset + result_count)]

    result = {
        'hits': {
            'total': total_results,
            'hits': [{'_source': r} for r in sliced_results]
        }
    }

    try:
        response = jsonify(result), 200
    except Exception as e:
        response = jsonify(message=str(e)), 500

    return response
def to_tsvector_ix(*columns):
    """Return an 'english' to_tsvector expression over the named columns,
    concatenated in SQL with single-space separators."""
    separator = " || ' ' || "
    concatenated = separator.join(columns)
    return func.to_tsvector('english', text(concatenated))
Ejemplo n.º 20
0
def add_text_search(query,
                    join_columns,
                    keywords,
                    locales,
                    include_rank=True,
                    lse=None):
    """Add a Postgres full-text-search filter (and optionally a rank
    column) to *query* over LangStringEntry values.

    :param query: SQLAlchemy query to extend.
    :param join_columns: langstring_id columns to join against; any match
        is accepted when several are given.
    :param keywords: search words, AND-ed together ('a & b & ...').
    :param locales: locale codes to search; the special value 'any'
        searches all locales through the 'simple' FTS configuration.
    :param include_rank: when True, append a ts_rank column labelled
        'score' to the query.
    :param lse: optional pre-built LangStringEntry alias to reuse.
    :return: (query, rank) — rank is the 'score' column or None.
    """
    from assembl.models.langstrings import LangStringEntry
    rank = None
    # AND all keywords for to_tsquery / match().
    keywords_j = ' & '.join(keywords)
    lse = lse or aliased(LangStringEntry)
    join_conds = [
        lse.langstring_id == join_column for join_column in join_columns
    ]
    if len(join_conds) > 1:
        join_cond = or_(*join_conds)
    else:
        join_cond = join_conds[0]
    query = query.join(lse, join_cond)
    if locales:
        active_text_indices = get('active_text_indices', 'en')
        locales_by_config = defaultdict(list)
        any_locale = 'any' in locales
        # Group the requested locales by the Postgres FTS configuration
        # serving them, falling back to 'simple' for unknown/inactive ones.
        for locale in locales:
            fts_config = postgres_language_configurations.get(locale, 'simple')
            if fts_config not in active_text_indices:
                fts_config = 'simple'
            locales_by_config[fts_config].append(locale)
        # conds maps fts_config -> (locale-match condition, or None when the
        # group contains 'any'; tsvector expression for that config).
        conds = {}
        # TODO: to_tsquery vs plainto_tsquery vs phraseto_tsquery
        # NOTE(review): this loop rebinds the *locales* parameter; later
        # code only relies on the any_locale flag captured above, but the
        # shadowing is fragile — consider renaming the loop variable.
        for fts_config, locales in locales_by_config.items():
            conds[fts_config] = (or_(
                *[((lse.locale == locale) | lse.locale.like(locale + "_%"))
                  for locale in locales]) if 'any' not in locales else None,
                                 func.to_tsvector(fts_config, lse.value))
        # NOTE(review): 'filter' shadows the builtin within this scope.
        filter = [
            cond & v.match(keywords_j, postgresql_regconfig=conf)
            for (conf, (cond, v)) in conds.items() if cond is not None
        ]
        if any_locale:
            # 'any' matches every locale through the 'simple' config.
            (_, v) = conds['simple']
            filter.append(v.match(keywords_j, postgresql_regconfig='simple'))
        query = query.filter(or_(*filter))
        if include_rank:
            if len(conds) > 1:
                # Rank each row with the configuration matching its locale;
                # rows caught only by 'any' rank via 'simple' (else 0).
                if any_locale:
                    (_, v) = conds['simple']
                    else_case = func.ts_rank(
                        v, func.to_tsquery('simple', keywords_j))
                else:
                    else_case = 0
                rank = case(
                    [(cond, func.ts_rank(v, func.to_tsquery(conf, keywords_j)))
                     for (conf,
                          (cond, v)) in conds.items() if cond is not None],
                    else_=else_case).label('score')
            else:
                # Single configuration: rank directly with it.
                (conf, (cond, v)) = next(iter(conds.items()))
                rank = func.ts_rank(v,
                                    func.to_tsquery(conf,
                                                    keywords_j)).label('score')
            query = query.add_column(rank)
    else:
        # No locale restriction: search everything with 'simple'.
        fts_config = 'simple'
        query = query.filter(
            func.to_tsvector(fts_config,
                             lse.value).match(keywords_j,
                                              postgresql_regconfig=fts_config))
        if include_rank:
            rank = func.ts_rank(func.to_tsvector(fts_config, lse.value),
                                func.to_tsquery(fts_config,
                                                keywords_j)).label('score')
            query = query.add_column(rank)
    return query, rank