def main():
    cutme = read_in_args_to_cut()
    db = connect_db()
    args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg")
    args = {}
    done = 0
    for (command_id, arg) in args_cursor.fetchall():
        if arg in cutme: continue
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        if arg == '"':
            continue
        cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id])
        cmd = cmds_cursor.fetchall()[0][0]
        if not arg in args:
            args[arg] = {}
        if not cmd in args[arg]:
            args[arg][cmd] = 0
        args[arg][cmd] += 1
    db.close()
    cmds = list(set(chain.from_iterable([v.keys() for v in args.values()])))
    cmds_idxs = zip(cmds, range(len(cmds)))
    for (cmd, idx) in cmds_idxs:
        print json.dumps((cmd, idx))
def main():
    db = connect_db()
    #args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg")
    args_cursor = db.execute("select command_id, arg from (select command_id, arg from commands, args, templates, queries where queries.id = commands.query_id and queries.id = args.query_id and templates.query_id = queries.id group by arg, command, template) order by arg")
    args = {}
    done = 0
    for (command_id, arg) in args_cursor.fetchall():
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        if arg == '"':
            continue
        arg = transform_argument(arg)
        cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id])
        cmd = cmds_cursor.fetchall()[0][0]
        cmd = transform_command(cmd)
        if not arg in args:
            args[arg] = {}
        if not cmd in args[arg]:
            args[arg][cmd] = 0
        args[arg][cmd] += 1
    db.close()
    max_term_freq = {}
    for (arg, cmd_cnts) in args.iteritems():
        for (cmd, cnt) in cmd_cnts.iteritems():
            if not cmd in max_term_freq:
                max_term_freq[cmd] = 0
            #max_term_freq[cmd] = max(max_term_freq[cmd], cnt)
            max_term_freq[cmd] += cnt
    #cmds = list(set(chain.from_iterable([v.keys() for v in args.values()])))
    cmds_idxs = zip(max_term_freq.items(), range(len(max_term_freq)))
    for ((cmd, f), idx) in cmds_idxs:
        print json.dumps((cmd, f, idx))
def main():
    cutme = read_in_args_to_cut()
    db = connect_db()
    args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg")
    args = {}
    done = 0
    for (command_id, arg) in args_cursor.fetchall():
        if arg in cutme: continue
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        if arg == '"':
            continue
        cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id])
        cmd = cmds_cursor.fetchall()[0][0]
        if not arg in args:
            args[arg] = {}
        if not cmd in args[arg]:
            args[arg][cmd] = 0
        args[arg][cmd] += 1
    db.commit()
    for (arg, cmd_counts) in args.iteritems():
        print json.dumps([arg, cmd_counts.items()])
    db.close()
def main():
    db = connect_db()
    # args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg")
    args_cursor = db.execute(
        "select command_id, arg from (select command_id, arg from commands, args, templates, queries where queries.id = commands.query_id and queries.id = args.query_id and templates.query_id = queries.id group by arg, command, template) order by arg"
    )
    args = {}
    done = 0
    for (command_id, arg) in args_cursor.fetchall():
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        if arg == '"':
            continue
        cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id])
        cmd = cmds_cursor.fetchall()[0][0]
        if not arg in args:
            args[arg] = {}
        if not cmd in args[arg]:
            args[arg][cmd] = 0
        args[arg][cmd] += 1
    db.commit()
    cmds = list(set(chain.from_iterable([v.keys() for v in args.values()])))
    cmds_idxs = zip(cmds, range(len(cmds)))
    for (cmd, idx) in cmds_idxs:
        print json.dumps((cmd, idx))
def main():
    cut = read_in_args_to_cut()
    db = connect_db()
    args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg")
    args = {}
    done = 0
    for (command_id, arg) in args_cursor.fetchall():
        arg = arg.strip()
        done = progress(done)
        if arg == '"':
            continue
        if arg in cut:
            continue
        new_arg = transform_arg(arg)
        if new_arg != arg:
            continue
        cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id])
        cmd = cmds_cursor.fetchall()[0][0]
        cmd = transform_cmd(cmd)
        if cmd == "search":
            continue
        if not arg in args:
            args[arg] = {}
        if not cmd in args[arg]:
            args[arg][cmd] = 0
        args[arg][cmd] += 1
    db.commit()
    cmds = list(set(chain.from_iterable([v.keys() for v in args.values()])))
    cmds_idxs = zip(cmds, range(len(cmds)))
    for (cmd, idx) in cmds_idxs:
        print json.dumps((cmd, idx))
Ejemplo n.º 6
0
def main():
    db = connect_db()
    select_cursor = db.execute("SELECT id, name FROM users")
    for (id, name) in select_cursor.fetchall():
        matches = re.findall('[\d]{9}', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
        matches = re.findall('[a-z]{2}[\d]{5}', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
        matches = re.findall('[a-z]{1}[\d]{6}', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
        matches = re.findall('\d[a-z]{5}\d', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
        matches = re.findall('[a-z]{2}[\d]{3}[a-z]', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
        matches = re.findall('[\d][a-z]{5}\d', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
        matches = re.findall('savedsearch', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
        matches = re.findall('[\d]{8}', name)
        if len(matches) > 0:
            update_user_type_as_machine(db, id)
    db.close()
def main():
    cut = read_in_args_to_cut()
    db = connect_db()
    args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg")
    args = {}
    done = 0
    for (command_id, arg) in args_cursor.fetchall():
        arg = arg.strip()
        done = progress(done)
        if arg == '"':
            continue
        if arg in cut:
            continue
        new_arg = transform_arg(arg)
        if new_arg != arg:
            continue
        cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id])
        cmd = cmds_cursor.fetchall()[0][0]
        cmd = transform_cmd(cmd)
        if cmd == "search":
            continue
        if not arg in args:
            args[arg] = {}
        if not cmd in args[arg]:
            args[arg][cmd] = 0
        args[arg][cmd] += 1
    db.commit()
    for (arg, cmd_counts) in args.iteritems():
        print json.dumps([arg, cmd_counts.items()])
Ejemplo n.º 8
0
def main():
    db = connect_db()
    select_cursor = db.execute("SELECT id, text FROM queries WHERE id<?", [31651])
    for (id, query) in select_cursor.fetchall():
        if query.find(' rex ') > -1:
            delete_template_commands_args(db, id)
    db.close()
Ejemplo n.º 9
0
def main():
    counts = []
    #labels = []
    db = connect_db()
    #cursor = db.execute("SELECT count(*) AS cnt, arg FROM args GROUP BY arg ORDER BY cnt DESC")
    cursor = db.execute("select count(distinct queries.user_id) as cnt, arg from args, queries where args.query_id = queries.id group by arg order by cnt desc")
    for (cnt, arg) in cursor.fetchall():
        counts.append(cnt)
        #labels.append(arg)
    db.close()    

    #counts = np.log(counts)
    ind = range(max(counts)+1)
    #ind = np.logspace(0, np.log10(max(counts)), num=20)
    #ind = [0] + list(ind)
    #ind = np.linspace(0, max(counts), num=20)
    #print ind
    total = len(counts) 
    cdf = [float(len(filter(lambda x: x <= i, counts)))/float(total) for i in ind]
    #print zip(ind, cdf)
    plt.plot(ind, cdf)
    plt.ylabel("Percent of arguments with counts less than or equal to C")
    plt.xlabel("C")
    plt.xscale("log")
    plt.grid(True)
    plt.show()
Ejemplo n.º 10
0
def main():
    db = connect_db()
    cursor = db.execute("SELECT command, text FROM commands, queries \
                            WHERE commands.query_id = queries.id \
                            AND operation_type=?", ["FilterSelection"])
    types = defaultdict(int)
    total = 0
    for (command, query) in cursor.fetchall():
        total += 1
        added_to_something = False
        if command.find("DEDUP") == 0:
            types["filter out duplicates"] += 1
            added_to_something = True
        if command.find("HEAD") == 0:
            types["filter on index"] += 1
            added_to_something = True
        if command.find("REGEX") == 0:
            c = command.lstrip("REGEX")
            types["filter on string match"] += 1
            added_to_something = True
        if command == "SEARCH" or command == "SEARCH(SOURCE)":
            types["filter on string match"] += 1
            added_to_something = True
        else:
            c = command.strip("SEARCH")
            c = c.strip("WHERE")
            s = get_filter_stage(query)
            if c.find("(AND") == 0 or c.find("(EQ") == 0:
                types["filter on multiple conditions"] += 1
                added_to_something = True
            if c.find("(OR") == 0:
                types["filter on multiple conditions"] += 1
                added_to_something = True
            if c.find("(NOT") == 0 or c.find("(NE") == 0:
                types["filter on multiple conditions"] += 1
                added_to_something = True
            if c.find("(GT") > -1 or c.find("GE") > -1 or c.find("(LT") > -1 or c.find("(LE") > -1:
                types["filter with numerical bound"] += 1
                added_to_something = True
            if c.find("(ISNULL") > -1 or c.find("(ISNOTNULL") > -1:
                types["filter on nullness"] += 1
                added_to_something = True
            if c.find("SOURCETYPE") > -1 or c.find("SOURCE") > -1 or c.find("HOST") > -1 or c.find("EVENTTYPE") > -1 or query.find("index") > -1:
                types["filter on source"] += 1
                added_to_something = True
            if c.find("EARLIEST") > -1 or c.find("LATEST") > -1:
                types["filter on time"] += 1
                added_to_something = True
            if re.match(time_pattern,s):
                types["filter on time"] += 1
                added_to_something = True
        if not added_to_something:
            print "Missed a case!", command
    db.close()    
    print_results(types, total)
def main():
    db = connect_db()
    select_cursor = db.cursor()
    select_cursor.execute("SELECT command, id FROM commands")
    for (command, id) in select_cursor.fetchall():
        category = lookup_category(command) 
        print "Put command", command, "in category", category 
        update_cursor = db.cursor()
        update_cursor.execute("UPDATE commands SET operation_type=? WHERE id=?", [category, id])
        db.commit()
    db.close()
def read_in_sessions():
    sessions = defaultdict(list)
    db = connect_db()
    select_cursor = db.execute("SELECT session_id, query_id, template FROM templated_sessions ORDER BY session_id")
    for (session_id, query_id, template) in select_cursor.fetchall():
        time_cursor = db.execute("SELECT time FROM queries WHERE id=?", [query_id])
        time = float(time_cursor.fetchall()[0][0])
        template = ParseTreeNode.from_dict(json.loads(template))
        sessions[session_id].append((time, query_id, template))
    db.close()
    return sessions
Ejemplo n.º 13
0
def main():

    db = connect_db()
    cursor = db.execute("select count(distinct users.name) as cnt, template from templates, users, queries where templates.query_id = queries.id and queries.user_id = users.id group by template order by cnt desc")
    data = cursor.fetchall() 
    for (cnt, template) in data:
        d = json.loads(template)
        template = ParseTreeNode.from_dict(d)
        template.print_tree()
        print "count: ", cnt
        print "" 
        print ""
    db.close()
def main():
    
    db = connect_db()
    big_sessions_cursor = db.execute("SELECT session_id, COUNT(session_id) AS cnt FROM queries GROUP BY session_id ORDER BY cnt DESC")
    for (session_id, queries_per_session) in big_sessions_cursor.fetchall():
        if queries_per_session > 30:
            session_owner_cursor = db.execute("SELECT user_id FROM sessions WHERE id=? LIMIT 1", [session_id])
            user_id = session_owner_cursor.fetchall()[0][0]
            owner_name_cursor = db.execute("SELECT name, user_type FROM users WHERE id=? LIMIT 1", [user_id])
            for (name, user_type) in owner_name_cursor.fetchall():
                if user_type == "person":
                    print ','.join([str(name), str(session_id), str(queries_per_session)])
    db.close()
Ejemplo n.º 15
0
def load_and_print_redundant_edges():
    db = connect_db()
    cursor = db.execute("SELECT arg, query_id FROM args")
    curr = prev = ""
    query_set = set()
    for (arg, query_id) in cursor.fetchall():
        curr = query_id
        if not curr == prev:
            print_edges(list(query_set))
            query_set.clear()
        query_set.add(arg)
        prev = curr
    db.close()
def main():
    db = connect_db()
    arg_clusters = read_in_arg_clusters()
    cluster_labels = read_in_cluster_labels()
    arg_labels = make_arg_labels(arg_clusters, cluster_labels)
    for (arg, labels) in arg_labels.iteritems():
        if len(labels) > 1:
            print "adding to", arg, "id -2 label multiple"
            multiple_cursor = db.cursor()
            multiple_cursor.execute("UPDATE args SET clique_id=?, clique_label=? WHERE arg=?", [-1, "multiple", arg])
            db.commit()
        if len(labels) == 1:
            cluster_id = min(arg_clusters[arg])
            label = labels.pop()
            print "adding to", arg, "id", cluster_id, "label", label
            single_cursor = db.cursor()
            single_cursor.execute("UPDATE args SET clique_id=?, clique_label=? WHERE arg=?", [cluster_id, label, arg])
            db.commit()
    db.close()
def main():
    db = connect_db()
    args_cursor = db.execute("SELECT count(id) as cnt, clique_label, command_id FROM args GROUP BY clique_label, command_id")
    argrps = {}
    done = 0
    for (cnt, label, cmdid) in args_cursor.fetchall():
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        cmds_cursor = db.execute("SELECT category FROM commands WHERE id=?", [cmdid])
        category = cmds_cursor.fetchall()[0][0]
        if not label in argrps:
            argrps[label] = {}
        if not category in argrps[label]:
            argrps[label][category] = 0
        argrps[label][category] += 1
    db.commit()
    for (arglabel, cmdgrps) in argrps.iteritems():
        print json.dumps([arglabel, cmdgrps.items()])
Ejemplo n.º 18
0
def main():
    db = connect_db()
    done = 0
    select_cursor = db.execute("SELECT min(id), text FROM queries WHERE id>? GROUP BY text", [0])
    schemas = {}
    for (id, query) in select_cursor.fetchall():
        done += 1
        if done % 10 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        schema = queryutils.extract_schema(query) 
        if not schema is None:
            schemas[id] = (text, schema)
            print "query:", id
            print "schema:"
            print schema
            print
    
    users_by_query_text = {}
    for (id, (text, schema)) for schemas.iteritems():
def main():
    done = 0
    db = connect_db()
    select_cursor = db.execute("SELECT min(id), text FROM queries WHERE id>? GROUP BY text", [0])
    for (id, query) in select_cursor.fetchall():
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        check_cursor = db.execute("SELECT * FROM commands \
                                    WHERE query_id=?", [str(id)])
        if len(check_cursor.fetchall()) == 0:
            parsetree = queryutils.parse_query(query)
            if parsetree is None:
                last_query = query
                last_failed = True
                continue
            commands_and_args = parsetree.command_arg_tuple_list()
            insert_commands_and_args(db, id, commands_and_args)
    db.close()
def read_in_templatized_sessions():
    session_queries = defaultdict(list)
    session_templates = defaultdict(list)
    
    db = connect_db()
   
    cursor = db.execute("SELECT queries.session_id, templates.query_id, templates.template \
                            FROM queries, templates, users \
                            WHERE queries.id = templates.query_id \
                                AND users.id = queries.user_id \
                                AND users.user_type = ?", ["person"])
    num_queries = 0
    for (session_id, query_id, template) in cursor.fetchall():
        session_templates[session_id].append((query_id, template))
        num_queries += 1

    print "Formed session_templates dict with length", len(session_templates)
    print "\t and with number of queries", num_queries

    complete_sessions = {}
    for (session_id, query_templates) in session_templates.iteritems():
        cursor = db.execute("SELECT session_id, id FROM queries \
                                WHERE session_id = ?", [session_id])
        queries = []
        for (sid, qid) in cursor.fetchall():
            queries.append(qid)
        if len(queries) == len(query_templates):
            print "Have a complete session of length", len(query_templates)
            complete_sessions[session_id] = query_templates
            for (query_id, template) in query_templates:
                print "This is happening."
                insert_cursor = db.cursor()
                insert_cursor.execute("INSERT INTO templated_sessions \
                                        (session_id, query_id, template) \
                                        VALUES (?,?,?)", 
                                        [session_id, query_id, template])
                db.commit()
     
    db.close()
    return complete_sessions
Ejemplo n.º 21
0
def main():
    done = 0
    commands_plus_counts = defaultdict(int)
    last_query = ""
    db = connect_db()
    select_cursor = db.execute("SELECT template FROM templates where template not null")
    for (template) in select_cursor.fetchall():
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        d = json.loads(template) # convert to parse tree node here
        t = ParseTreeNode.from_dict(d)
        commands_plus = t.get_commands_plus_list()
        for c in commands_plus:
            commands_plus_counts[c] += 1
    
    top_commands_plus = sorted(commands_plus.items(), key=lambda x: x[1], reverse=True)[:20]
    for (commands_plus, counts) in top_commands_plus:
        print counts, commands_plus

    db.close()
Ejemplo n.º 22
0
def main():
    counts = []
    #labels = []
    db = connect_db()
    #cursor = db.execute("SELECT count(*) AS cnt, arg FROM args GROUP BY arg ORDER BY cnt DESC")
    cursor = db.execute("select count(distinct queries.user_id) as cnt, arg from args, queries where args.query_id = queries.id group by arg order by cnt desc")
    for (cnt, arg) in cursor.fetchall():
        counts.append(cnt)
        #labels.append(arg)
    db.close()    

    ind = range(len(counts))
    counts = np.log(counts)
    print counts[0:20]
    #width = 1.0
    #bars = plt.bar(ind, counts, width, color="green", alpha=0.5)
    plt.plot(ind, counts)
    plt.ylabel("log( Number of times argument appears )")
    plt.xlabel("Argument rank")
    #plt.xticks(rotation=90)
    #plt.xticks([i+width/2 for i in ind], labels)
    plt.grid(True)
    plt.show()
def main():
    db = connect_db()
    #args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg")
    args_cursor = db.execute("select command_id, arg from (select command_id, arg from commands, args, templates, queries where queries.id = commands.query_id and queries.id = args.query_id and templates.query_id = queries.id group by arg, command, template) order by arg")
    args = {}
    done = 0
    for (command_id, arg) in args_cursor.fetchall():
        done += 1
        if done % 100 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        if arg == '"':
            continue
        cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id])
        cmd = cmds_cursor.fetchall()[0][0]
        if not arg in args:
            args[arg] = {}
        if not cmd in args[arg]:
            args[arg][cmd] = 0
        args[arg][cmd] += 1
    db.commit()
    for (arg, cmd_counts) in args.iteritems():
        print json.dumps([arg, cmd_counts.items()])
    db.close()
Ejemplo n.º 24
0
def main():
    done = 0
    db = connect_db()
    #select_cursor = db.execute("SELECT id, text FROM queries WHERE id>?", [273347])
    select_cursor = db.execute("SELECT min(id), text FROM queries WHERE id>? GROUP BY text", [0])
    for (id, query) in select_cursor.fetchall():
        done += 1
        if done % 10 == 0:
            sys.stderr.write(str(done) + " done\n")
            sys.stderr.flush()
        check_cursor = db.execute("SELECT * FROM templates \
                                    WHERE query_id=?", [str(id)])
        if len(check_cursor.fetchall()) == 0:
            template = queryutils.extract_template(query) 
            if template is None:
                last_query = query
                last_failed = True
                continue
            s = template.dumps()
            insert_cursor = db.cursor()
            insert_cursor.execute("INSERT INTO templates (query_id, template) \
                            VALUES (?,?)", [id, s])
            db.commit()
    db.close()
def read_data_from_db():
    counts = []
    optypes = []
    db = connect_db()
    cursor = db.execute("select count(*) as count, operation_type from commands group by operation_type order by count")
    for (cnt, operation_type) in cursor.fetchall():
        if operation_type == "FilterSelection":
            operation_type = "Filter"
        if operation_type == "InputtingSelection":
            operation_type = "Input"
        if operation_type == "TransformingProjection":
            operation_type = "Transform"
        if operation_type == "Projection":
            operation_type = "Project"
        if operation_type == "ExtendedProjection":
            operation_type = "Extend"
        if operation_type == "WindowingProjection":
            operation_type = "Window"
        if operation_type == "Aggregation":
            operation_type = "Aggregate"
        counts.append(cnt)
        optypes.append(operation_type)
    db.close()    
    return counts, optypes
Ejemplo n.º 26
0
def load_db():
    global db
    db = connect_db()
    load_main()
    db.close()
Ejemplo n.º 27
0
def execute_db_script(script):
    with closing(connect_db()) as db:
        with app.open_resource(script) as f:
            db.cursor().executescript(f.read())
        db.commit()
def extract_context(cleanarg, rawlist):
    c = Context(cleanarg)
    db = connect_db()
    number_commands = 0
    #print
    for arg in rawlist:
        #print "arg: ", arg
        cursor = db.execute("SELECT command, text FROM queries, commands, args \
                                WHERE queries.id = args.query_id \
                                AND args.command_id = commands.id \
                                AND arg=?", [arg])
        for (command, query) in cursor.fetchall():
            #print "\tcmd:", command

            number_commands += 1
            set = False
            set_field = False
            if command.find("SEARCH") == 0 or command.find("WHERE") == 0:
                for comparator in ["=", ">", "<", "!=", "=="]:
                    if query.find(arg + comparator) > -1 or query.find(arg + " " + comparator) > -1:
                        c.grp_filtered_on_as_field += 1
                        set = True
                        set_field = True
                if not set_field:        
                    c.grp_filtered_on_as_value += 1
                    set = True

            if command.find("DEDUP") == 0:
                c.grp_filtered_on_as_field += 1
                set = True

            if command.find("TOP") == 0:
                try:
                    int(c.arg)
                    c.grp_argument_to_option += 1
                    set = True
                except:
                    c.grp_argument_to_top += 1
                    set = True

            if command.find("SORT") == 0:
                try:
                    int(arg)
                    c.grp_argument_to_option += 1
                    set = True
                except:
                    if query.find("=" + arg) == -1:
                        c.grp_sorted_by += 1
                        set = True

            if command.find("FIELDS(PLUS)") == 0 or command.find("TABLE") == 0 or command.find("EXPORT") == 0:
                c.grp_projected += 1
                set = True

            if command.find("FIELDS(MINUS)") == 0:
                c.grp_unprojected += 1
                set = True

            if command.find("INPUTLOOKUP") == 0 or command.find("ABSTRACT") == 0:
                if query.find("=" + arg) > -1:
                    c.grp_argument_to_option += 1
                    set = True

            if command.find("HEAD") == 0:
                c.grp_argument_to_option += 1
                set = True

            if command.find("STATS") == 0 or command.find("TIMECHART") == 0 or command.find("CHART") == 0:
                for function in ["count", "min", "avg", "max", "sum", "c", "values", "range", "last", "distinct_count", "dc", "mode", "var"]:
                    if query.find(function + " " + arg) > -1 or query.find(function + "(" + arg + ")") > -1 or query.find(function + " (" + arg + ")") > -1:
                        c.grp_argument_to_aggregation += 1
                        set = True
            
            if (command.find("STATS") == 0 or command.find("EVAL") == 0 or command.find("CHART") == 0 or command.find("TIMECHART") == 0 or command.find("RENAME") == 0) and command.find("AS") > -1:
                if query.find("as " + arg) > -1 or query.find("AS " + arg) > -1:
                    c.grp_argument_renamed_as += 1
                    set = True
            
            if (command.find("STATS") == 0 or command.find("EVAL") == 0 or command.find("CHART") == 0 or command.find("TIMECHART") == 0) and command.find("OVER") > -1:
                if query.find("over " + arg) > -1 or query.find("OVER " + arg) > -1:
                    c.grp_field_used_as_function_domain += 1
                    set = True

            if (command.find("STATS") == 0 or command.find("EVAL") == 0 or command.find("CHART") == 0 or command.find("TIMECHART") == 0) and command.find("BY") > -1:
                if query.find("by " + arg) > -1 or query.find("BY " + arg) > -1 or query.find("by") < query.rfind(arg):
                    c.grp_grouped_by += 1
                    set = True

            if command.find("EVAL") > -1 and (command.find("DIVIDES") > -1 or command.find("TIMES") > -1 or command.find("PLUS") > -1):
                c.grp_argument_to_arithmetic_transformation += 1
                set = True

            if command.find("EVAL") > -1:
                if query.find(arg + "=") > -1:
                    c.grp_field_used_in_conditional += 1
                    set = True
                else:
                    c.grp_value_used_in_other_transformation += 1
                    #print command
                    set = True

            if command.find("MULTIKV") == 0 or command.find("CONVERT") == 0 or command.find("BUCKET") == 0 or command.find("REX") == 0 or command.find("REPLACE") == 0 or command.find("REGEX") == 0 or command.find("MAKEMV") == 0 or command.find("MVEXPAND") == 0:
                c.grp_field_used_in_other_transformation += 1
                #print command
                set = True
            
            if command.find("TIMECHART") == 0:
                if query.find("=" + arg) > -1 or query.find("= " + arg) > -1:
                    try:
                        int(arg)
                        set = True
                        c.grp_argument_to_option += 1
                    except:
                        pass

            if arg == "false" and not set:
                c.grp_argument_to_option += 1
                set = True

            if not set:
                print "\t\t\tNo case for this one!"
    
    db.close()
    if c.number_set() < number_commands: 
        print "Missed a case!", arg
        exit()
    return c
def main():
    len_one_sessions = 0
    generalizing_parsetrees = 0
    specializing_parsetrees = 0
    identical_parsetrees = 0
    total = 0
    inds = []
    gens = []
    specs = []
    others = []
    sessions = read_in_sessions()
    for (session_id, queries) in sessions.iteritems():
        queries = sorted(queries, key=lambda x: x[0])
        if len(queries) == 1:
            len_one_sessions += 1
        elif is_identical_parsetrees(queries):
            identical_parsetrees += 1
            inds.append((session_id, queries))
        elif is_generalizing_parsetrees(queries):
            generalizing_parsetrees += 1
            gens.append((session_id, queries))
        elif is_specializing_parsetrees(queries):
            specializing_parsetrees += 1 
            specs.append((session_id, queries))
        else:
            others.append((session_id, queries))
        total += 1
    other = total - len_one_sessions - generalizing_parsetrees - specializing_parsetrees - identical_parsetrees
    print "Length one sessions: ", len_one_sessions
    print "Generalizing parsetrees sessions: ", generalizing_parsetrees
    print "Specializing parsetree sessions: ", specializing_parsetrees
    print "Identical parsetree sessions: ", identical_parsetrees
    print "Other (unknown): ", other
    print "Total: ", total
    print
    
    # Print inds info.
    inds_avg_time_length = 0.
    inds_min_time_length = 1e12
    inds_max_time_length = 0.
    inds_avg_item_length = 0.
    inds_min_item_length = 1e12
    inds_max_item_length = 0.
    for (sid, templates) in inds:
        first = True
        for (time, qid, template) in templates:
            if first:
                first = False
                firsttime = time
        time_length = time - firsttime
        item_length = len(templates)
        inds_max_time_length = max(time_length, inds_max_time_length)
        inds_min_time_length = min(time_length, inds_min_time_length)
        inds_avg_time_length += time_length
        inds_max_item_length = max(item_length, inds_max_item_length)
        inds_min_item_length = min(item_length, inds_min_item_length)
        inds_avg_item_length += item_length
    inds_avg_time_length = inds_avg_time_length / len(inds)
    inds_avg_item_length = inds_avg_item_length / len(inds)
    print "Identical sessions max time length:", inds_max_time_length
    print "Identical sessions min time length:", inds_min_time_length
    print "Identical sessions avg time length:", inds_avg_time_length
    print
    print "Identical sessions max item length:", inds_max_item_length
    print "Identical sessions min item length:", inds_min_item_length
    print "Identical sessions avg item length:", inds_avg_item_length
    print

    # Print gens info.
    gens_avg_time_length = 0.
    gens_min_time_length = 1e12
    gens_max_time_length = 0.
    gens_avg_item_length = 0.
    gens_min_item_length = 1e12
    gens_max_item_length = 0.
    for (sid, templates) in gens:
        first = True
        for (time, qid, template) in templates:
            if first:
                first = False
                firsttime = time
        time_length = time - firsttime
        item_length = len(templates)
        gens_max_time_length = max(time_length, gens_max_time_length)
        gens_min_time_length = min(time_length, gens_min_time_length)
        gens_avg_time_length += time_length
        gens_max_item_length = max(item_length, gens_max_item_length)
        gens_min_item_length = min(item_length, gens_min_item_length)
        gens_avg_item_length += item_length
    gens_avg_time_length = gens_avg_time_length / len(gens)
    gens_avg_item_length = gens_avg_item_length / len(gens)
    print "Generalizing sessions max time length:", gens_max_time_length
    print "Generalizing sessions min time length:", gens_min_time_length
    print "Generalizing sessions avg time length:", gens_avg_time_length
    print
    print "Generalizing sessions max item length:", gens_max_item_length
    print "Generalizing sessions min item length:", gens_min_item_length
    print "Generalizing sessions avg item length:", gens_avg_item_length
    print
                 
    # Print specs info.
    specs_avg_time_length = 0.
    specs_min_time_length = 1e12
    specs_max_time_length = 0.
    specs_avg_item_length = 0.
    specs_min_item_length = 1e12
    specs_max_item_length = 0.
    for (sid, templates) in specs:
        first = True
        for (time, qid, template) in templates:
            if first:
                first = False
                firsttime = time
        length = time - firsttime
        item_length = len(templates)
        specs_max_time_length = max(length, specs_max_time_length)
        specs_min_time_length = min(length, specs_min_time_length)
        specs_avg_time_length += specs_avg_time_length
        specs_max_item_length = max(item_length, specs_max_item_length)
        specs_min_item_length = min(item_length, specs_min_item_length)
        specs_avg_item_length += item_length
    specs_avg_time_length = length / len(specs)
    specs_avg_item_length = specs_avg_item_length / len(specs)
    print "Specializing sessions max time length:", specs_max_time_length
    print "Specializing sessions min time length:", specs_min_time_length
    print "Specializing sessions avg time length:", specs_avg_time_length
    print
    print "Specializing sessions max item length:", specs_max_item_length
    print "Specializing sessions min item length:", specs_min_item_length
    print "Specializing sessions avg item length:", specs_avg_item_length
    print
                 
    # Print others.
    db = connect_db()
    for (sid, templates) in others:
        print
        for (time, qid, template) in templates:
            qcursor = db.execute("SELECT text FROM queries WHERE id=?", [qid])
            query = qcursor.fetchall()[0][0]
            print query.strip()
            #template.print_tree()
            print
        print "__________________________________________________________________________"
    db.close()
Ejemplo n.º 30
0
def main():
    db = connect_db()
    cursor = db.execute("SELECT command, text FROM commands, queries \
                            WHERE commands.query_id = queries.id \
                            AND operation_type=?", ["ExtendedProjection"])
    types = defaultdict(int)
    total = 0
    for (command, query) in cursor.fetchall():
        if command == "EVAL" or command == "EVAL(EQ)":
            continue
        total += 1
        added_to_something = False
        if command.find("REX") == 0:
            types["string extraction"] += 1
            added_to_something = True
        if command.find("EVAL") == 0:
            if command.find("CASE") > -1:
                types["conditional"] += 1
                added_to_something = True
            if command.find("COALESCE") > -1:
                types["conditional"] += 1
                added_to_something = True
            if command.find("CONCAT") > -1:
                types["string manipulation"] += 1
                added_to_something = True
            if command.find("DIVIDES") > -1:
                types["arithmetic"] += 1
                added_to_something = True
            if command.find("FLOOR") > -1:
                types["arithmetic"] += 1
                added_to_something = True
            if command.find("IF") > -1:
                types["conditional"] += 1
                added_to_something = True
            if command.find("LIKE") > -1:
                types["string match"] += 1
                added_to_something = True
            if command.find("MATCH") > -1:
                types["string match"] += 1
                added_to_something = True
            if command.find("MINUS") > -1:
                types["arithmetic"] += 1
                added_to_something = True
            if command.find("MVCOUNT") > -1:
                types["multivalue"] += 1
                added_to_something = True
            if command.find("MVINDEX") > -1:
                types["multivalue"] += 1
                added_to_something = True
            if command.find("PLUS") > -1:
                types["arithmetic"] += 1
                added_to_something = True
            if command.find("REPLACE") > -1:
                types["string concatenation"] += 1
                added_to_something = True
            if command.find("ROUND") > -1:
                types["arithmetic"] += 1
                added_to_something = True
            if command.find("SPLIT") > -1:
                types["string concatenation"] += 1
                added_to_something = True
            if command.find("STRPTIME") > -1:
                types["time manipulation"] += 1
                added_to_something = True
            if command.find("SUBSTR") > -1:
                types["string manipulation"] += 1
                added_to_something = True
            if command.find("TIMES") > -1:
                types["arithmetic"] += 1
                added_to_something = True
            if command.find("TONUMBER") > -1:
                types["conversion"] += 1
                added_to_something = True
            if command.find("TRIM") > -1:
                types["string manipulation"] += 1
                added_to_something = True
            if command.find("UPPER") > -1:
                types["string manipulation"] += 1
                added_to_something = True

        if not added_to_something:
            print "Missed a case!", command


    cursor = db.execute("SELECT command, text FROM commands, queries \
                            WHERE commands.query_id = queries.id \
                            AND operation_type=?", ["Projection"])
    for (command, query) in cursor.fetchall():
        total += 1
        added_to_something = False
        if command.find("MINUS") > -1:
            types["unproject"] += 1
            added_to_something = True
        else:
            types["project"] += 1
            added_to_something = True

        if not added_to_something:
            print "Missed a case!", command


    cursor = db.execute("SELECT command, text FROM commands, queries \
                            WHERE commands.query_id = queries.id \
                            AND operation_type=?", ["TransformingProjection"])
    for (command, query) in cursor.fetchall():
        total += 1
        added_to_something = False
        if command.find("BUCKET") == 0:
            types["bucket"] += 1
            added_to_something = True
        if command.find("CONVERT") == 0:
            types["conversion"] += 1
            added_to_something = True
        if command.find("FILLNULL") == 0:
            types["fill null"] += 1
            added_to_something = True
        if command.find("MAKEMV") == 0:
            types["multivalue"] += 1
            added_to_something = True
        if command.find("MULTIKV") == 0:
            types["string extraction"] += 1
            added_to_something = True
        if command.find("REPLACE") == 0:
            types["string manipulation"] += 1
            added_to_something = True

        if not added_to_something:
            print "Missed a case!", command

    db.close()    
    print_results(types, total)