def main():
    """Count how often each query template occurs and print a report.

    Iterates over the batches yielded by ``queryutils.get_queries`` (called
    with ``limit=800*BYTES_IN_MB``), extracts a template from every query and
    tallies templates keyed by their ``str_tree()`` rendering.  Queries for
    which ``extract_template`` returns ``None`` are skipped and not counted.

    Progress is written to stderr after every 100 tallied templates.  The
    report goes to stdout: the number of distinct templates, followed by each
    template and its count, most frequent first.
    """
    done = 0
    seen = defaultdict(int)
    for queries in queryutils.get_queries(limit=800*BYTES_IN_MB):
        for query in queries:
            template = queryutils.extract_template(query)
            if template is None:
                continue
            seen[template.str_tree()] += 1
            done += 1
            if done % 100 == 0:
                sys.stderr.write(str(done) + " done\n")
                sys.stderr.flush()

    # The report is emitted with sys.stdout.write rather than the print
    # statement so this function parses on both Python 2 and Python 3.  The
    # format strings reproduce the text the original print statements
    # produced, including the double space after each colon.
    write = sys.stdout.write
    write("Number of templates:  %d\n" % len(seen))

    # sorted() is stable, so templates with equal counts keep the order in
    # which the dictionary yields them.
    templates = sorted(seen.items(), key=lambda item: item[1], reverse=True)
    for (template, count) in templates:
        write("Template: \n%s\n" % (template,))
        write("Count:  %s\n" % (count,))
        write("\n")
def main():
    """Extract and store a template for every distinct query text.

    Selects one row per distinct ``text`` from the ``queries`` table (the
    smallest ``id`` for each text, restricted to ``id > 0``).  For each such
    query that has no row in ``templates`` yet, a template is extracted with
    ``queryutils.extract_template`` and its ``dumps()`` output is inserted
    into ``templates`` and committed.  Queries for which no template can be
    extracted are skipped.

    Progress is written to stderr after every 10 queries examined.  The
    database connection is closed on exit, including when an error is raised.
    """
    done = 0
    db = connect_db()
    try:
        # GROUP BY text collapses duplicate query texts into a single row.
        select_cursor = db.execute(
            "SELECT min(id), text FROM queries WHERE id>? GROUP BY text", [0])

        # All rows are fetched before the loop starts, so the inserts and
        # commits below never run while this result set is still being read.
        for (query_id, query) in select_cursor.fetchall():
            done += 1
            if done % 10 == 0:
                sys.stderr.write(str(done) + " done\n")
                sys.stderr.flush()

            # Bind query_id exactly as the INSERT below does, so the lookup
            # compares like with like regardless of the column's declared
            # type.  NOTE(review): assumes a DB-API connection with "?"
            # placeholders (e.g. sqlite3) -- confirm against connect_db.
            check_cursor = db.execute(
                "SELECT * FROM templates WHERE query_id=?", [query_id])
            if check_cursor.fetchall():
                # A template is already stored for this query.
                continue

            template = queryutils.extract_template(query)
            if template is None:
                continue

            insert_cursor = db.cursor()
            insert_cursor.execute(
                "INSERT INTO templates (query_id, template) VALUES (?,?)",
                [query_id, template.dumps()])
            # Commit per row so completed work survives a later failure.
            db.commit()
    finally:
        db.close()