Ejemplo n.º 1
0
def clicked_urls_query():
    """Build the SELECT of clicked results for high-volume queries.

    The statement joins ``search_table`` to ``query_table`` and selects the
    search id, the SQL ``unnest`` of ``clicked_urls`` (labelled ``result``)
    and the lower-cased search term, restricted to rows whose query is
    flagged ``high_volume``.

    Returns the un-executed selectable; nothing is sent to the database here.
    """
    columns = [
        search_table.c.id,
        func.unnest(search_table.c.clicked_urls).label('result'),
        query_table.c.search_term_lowercase,
    ]
    joined = search_table.join(query_table)
    # ``== True`` is deliberate: it builds a SQL comparison, not a Python bool.
    only_high_volume = query_table.c.high_volume == True
    return select(columns).select_from(joined).where(only_high_volume)
Ejemplo n.º 2
0
def suggest_sequence(attribute):
    """Suggest distinct values taken from an array attribute of ``Sequence``.

    ``attribute`` names the column on ``Sequence`` that is passed to the SQL
    ``unnest`` function. Only rows whose ``user_login`` equals
    ``session['user']`` are considered. When the request carries values,
    the query is joined to ``TopicDomainClass`` and every request key/value
    pair becomes an equality filter on the attribute of that name.

    Returns whatever ``storify`` builds from the rows, using ``'label'`` for
    both of its field arguments.
    """
    entity = Sequence
    unnested_label = func.unnest(getattr(entity, attribute)).label('label')

    query = db.session.query(unnested_label).distinct()
    query = query.filter(entity.user_login == session['user'])

    if request.values:
        query = query.join(TopicDomainClass)
        for key, value in request.values.items():
            # Each request parameter is matched against the attribute of the
            # same name on TopicDomainClass.
            column = getattr(TopicDomainClass, key)
            query = query.filter(column == value)

    rows = query.all()
    return storify(rows, 'label', 'label')
Ejemplo n.º 3
0
def make_comment(post_type):
    """Generate a batch of synthetic comments from trained text models.

    Two models are trained through ``train_from_query`` with ``CommentSim``:
    one from up to 30000 randomly ordered dead comments with non-null text,
    and one from up to 60000 randomly ordered live comments. When
    ``post_type`` is not ``"normal"`` the live comments are restricted to
    ids found in ``unnest(Story.all_kids)`` for stories matching
    ``queries[post_type]["query"]``.

    Between 0 and 50 random author names are drawn; for each one a comment
    of a random target length (0-200 characters) is assembled from generated
    sentences. A comment is marked dead when ``random.randint(2, 100) < 5``
    and is then produced by the dead-comment model.

    Args:
        post_type: ``"normal"`` or a key of the module-level ``queries``
            mapping.

    Returns:
        A list of dicts with the keys ``"text"``, ``"by"`` and ``"dead"``.
    """
    Session = sessionmaker(bind=engine)
    db = Session()

    # The session was previously never closed; release it even on errors.
    try:
        dead_comments = (
            db.query(Comment.text)
            .filter(Comment.dead == True, Comment.text != None)
            .order_by(func.random())
            .limit(30000)
        )
        dead_comment_sim = train_from_query(dead_comments, CommentSim)

        comment_query = db.query(Comment.text).filter(Comment.dead == False)
        if post_type != "normal":
            story_kids = db.query(func.unnest(Story.all_kids)).filter_by(
                **queries[post_type]["query"])
            comment_query = comment_query.filter(Comment.id.in_(story_kids))
        comment_query = comment_query.order_by(func.random()).limit(60000)
        random_user_query = db.query(Comment.by).order_by(func.random())

        comment_sim = train_from_query(comment_query, CommentSim)

        # Rows are single-column tuples; keep only the author name.
        user_names = (by[0] for by in
                      random_user_query.limit(random.randint(0, 50)))
        comments = []

        for user_name in user_names:
            is_dead = random.randint(2, 100) < 5
            sim = dead_comment_sim if is_dead else comment_sim

            comment_length, comment = random.randint(0, 200), ""

            # Keep adding sentences while at least 25 characters are missing;
            # a smaller gap is simply left unfilled.
            while comment_length - len(comment) >= 25:
                remaining = comment_length - len(comment)
                max_chars = 50 if remaining < 50 else 100
                sentence = sim.make_short_sentence(max_chars,
                                                   tries=10000,
                                                   max_overlap_total=10,
                                                   max_overlap_ratio=0.5)
                if sentence is None:
                    # markovify-style models return None when every try
                    # fails (presumably CommentSim is one -- confirm).
                    # Concatenating None used to raise TypeError; stop
                    # extending this comment instead.
                    break
                comment += sentence + "\n"

            comment = comment.replace(".", ". ")
            comments.append(
                {"text": comment, "by": user_name, "dead": is_dead})

        return comments
    finally:
        db.close()
Ejemplo n.º 4
0
def get_skipped_urls(conn):
    """Load every (search id, result, query) row whose result was skipped.

    A "passed over" SELECT is built from ``search_table`` joined to
    ``query_table``: search id, SQL ``unnest`` of ``passed_over_urls``
    (labelled ``result``) and the lower-cased search term, limited to
    high-volume queries. The rows produced by ``clicked_urls_query()`` are
    then removed from it with a SQL ``EXCEPT``, so a result that was also
    clicked for the same search and query is not reported as skipped.

    Args:
        conn: connection handed to ``pd.read_sql`` to run the statement.

    Returns:
        The result of ``pd.read_sql`` with ``id`` used as the index column.
    """
    passed_over_columns = [
        search_table.c.id,
        func.unnest(search_table.c.passed_over_urls).label('result'),
        query_table.c.search_term_lowercase,
    ]
    joined = search_table.join(query_table)
    # ``== True`` builds the SQL comparison; it is not a Python truth test.
    only_high_volume = query_table.c.high_volume == True
    passed_over = (
        select(passed_over_columns)
        .select_from(joined)
        .where(only_high_volume)
    )

    # passed-over rows minus clicked rows
    stmt = except_(passed_over, clicked_urls_query())

    return pd.read_sql(stmt, conn, index_col='id')
Ejemplo n.º 5
0
    def get(self):
        """Return release statistics grouped by month and activity.

        The query selects, per (month, activity) group, the number of
        releases (``count``) and the sum of their values (``total_value``).
        ``month`` is ``substring(cast(Release.date as string), 0, 8)``
        (presumably a ``YYYY-MM`` prefix -- confirm against the date format)
        and ``activity`` comes from ``unnest(Release.activities)``.
        Filtering, sorting and pagination are delegated to
        ``self.filter_request``, ``self.sort_request`` and
        ``self.offset_limit``.

        Request arguments read here (from ``self.parse_arg()``):
            activity: optional ``;``-separated list; activities outside the
                list are removed from the output.
            aggregate: ``"value"`` or ``"count"``; activities outside the
                top ``app.config["AGG_ACTIVITIES"]`` (ranked by total value
                or by count) are merged into a single ``"Autres"`` entry.

        Returns:
            A dict with ``"meta"`` (total row count before pagination plus
            the applied offset/limit) and ``"releases"``, a list of
            ``{"month": ..., "activities": [...]}`` entries.
        """
        # Local import so the block does not depend on the module's imports.
        from collections import OrderedDict

        args = self.parse_arg()

        releases = db.session.query(
            func.substring(cast(Release.date, db.String), 0, 8).label('month'),
            func.unnest(Release.activities).label('activity'),
            func.count(Release.value).label('count'),
            func.sum(Release.value).label('total_value'))

        releases = self.filter_request(releases, args)
        releases = releases.group_by('month', 'activity')
        releases = self.sort_request(releases, args)

        # Counted before offset/limit so it reflects the whole result set.
        release_count = releases.count()

        (releases, offset, limit) = self.offset_limit(releases, args)

        # Generate output structure
        output = dict()

        output["meta"] = {
            "count": release_count,
            "pagination": {"offset": offset, "limit": limit}
        }

        # OrderedDict keeps months in the order the rows come back from the
        # query; a plain dict on older interpreters would shuffle them and
        # discard whatever ordering sort_request applied.
        months_dict = OrderedDict()

        for release in releases:
            r = release._asdict()
            current_month = r.pop("month")
            months_dict.setdefault(current_month, []).append(r)

        # .items() instead of the Python-2-only .iteritems().
        output["releases"] = [
            {"month": month, "activities": activities}
            for month, activities in months_dict.items()
        ]

        # Hack: drop activities that were not requested but were returned
        # anyway because some services belong to two different activities.
        if 'activity' in args and args['activity'] is not None:
            requested = set(args['activity'].split(';'))
            for m in output["releases"]:
                m["activities"] = [a for a in m["activities"]
                                   if a["activity"] in requested]
            # Months left without any activity are removed entirely.
            output["releases"] = [m for m in output["releases"]
                                  if m["activities"]]

        # Another hack to aggregate activities not in the top N
        if 'aggregate' in args and args['aggregate'] in ("value", "count"):
            activities = db.session.query(
                Release.activities[1].label('activity'),
                func.sum(Release.value).label('total_value'),
                func.count(Release.value).label('count'))

            activities = self.filter_request(activities, args)
            activities = activities.filter(Release.activities[1] != "Autres")
            activities = activities.group_by('activity')

            # NOTE(review): textual "<column> desc" ORDER BY strings are
            # rejected by newer SQLAlchemy releases -- confirm the pinned
            # version or switch to desc().
            if args['aggregate'] == "value":
                activities = activities.order_by("total_value desc")
            else:
                activities = activities.order_by("count desc")

            activities = activities[0:app.config["AGG_ACTIVITIES"]]

            # A set makes the per-activity membership test below O(1).
            top = set(a._asdict()['activity'] for a in activities)

            for m in output["releases"]:

                count = 0
                total_value = 0
                # Iterate backwards so deleting by index stays valid.
                for i, a in reversed(list(enumerate(m["activities"]))):

                    if a["activity"] not in top:
                        count += a["count"]
                        total_value += a["total_value"]
                        del m["activities"][i]

                # NOTE(review): entries removed above vanish without an
                # "Autres" bucket when their summed value is not positive --
                # confirm this is intended.
                if count > 0 and total_value > 0:

                    autres = {"activity": "Autres", "count": count,
                              "total_value": total_value}
                    m["activities"].append(autres)

        return output