def optimizedConstraints(constraints):
    """Return a copy of ``constraints`` with groups of POS columns collapsed
    into precomputed hash columns.

    For every entry of the module-level ``pos_config_sets``, the keys of
    ``constraints`` that appear in that entry's ``"constraints"`` collection
    are gathered.  When at least two such keys are present they are removed
    from the result and replaced by a single key, taken from the entry's
    ``"names"`` sequence, whose value is
    ``dbhash.columnsDictToSHA`` of the removed key/value pairs.

    Args:
        constraints: mapping of column name -> required value.  It is not
            modified.

    Returns:
        A new dict holding the rewritten constraints.
    """
    mod_constraints = dict(constraints)
    for pos_config in pos_config_sets:
        pos_constraints = pos_config["constraints"]
        column_names = pos_config["names"]

        # Columns to fold into one precomputed hash.  Built as a real list
        # (not a filter object) because it is measured with len() and then
        # iterated twice below.
        pos_columns = [col for col in constraints if col in pos_constraints]

        # A hash column only exists for combinations of two or more columns.
        if len(pos_columns) < 2:
            continue

        dict_to_hash = {col: constraints[col] for col in pos_columns}
        dict_as_sha = dbhash.columnsDictToSHA(dict_to_hash)

        # Index 0 of the names corresponds to two matched columns, index 1
        # to three, and so on.
        # NOTE(review): presumably "names" is ordered 2-gram, 3-gram, ... --
        # confirm against the definition of pos_config_sets.
        hashed_column_name = column_names[len(pos_columns) - 2]

        # Swap the individual columns for the single hashed column.
        for col in pos_columns:
            mod_constraints.pop(col)
        mod_constraints[hashed_column_name] = dict_as_sha
    return mod_constraints
def posCountsForLine(dbconn, line, position='leading'):
    """Look up a line's POS hash in the leading or lagging counts table.

    The entries of ``line`` whose keys are in ``pos_next_constraints``
    (``position='leading'``) or ``pos_prev_constraints``
    (``position='lagging'``) are hashed with ``dbhash.columnsDictToSHA`` and
    the hash is matched against ``leading_pos_counts.leading_4gram`` or
    ``lagging_pos_counts.lagging_4gram`` respectively.

    Args:
        dbconn: object exposing a DB-API connection as ``dbconn.connection``.
        line: mapping of column name -> value for one line.
        position: ``'leading'`` or ``'lagging'``.

    Returns:
        The first column of the first result row, or 0 when the query
        yields no rows.

    Raises:
        ValueError: if ``position`` is neither ``'leading'`` nor
            ``'lagging'``.  This is checked before any cursor is opened.
    """
    if position == 'leading':
        constraint_keys = pos_next_constraints
        query = """SELECT COUNT(*) FROM leading_pos_counts WHERE leading_4gram=%s"""
    elif position == 'lagging':
        constraint_keys = pos_prev_constraints
        query = """SELECT COUNT(*) FROM lagging_pos_counts WHERE lagging_4gram=%s"""
    else:
        raise ValueError("position must be one of ['leading', 'lagging']")

    # NOTE(review): countPOS fills these tables with
    # "ON DUPLICATE KEY UPDATE count=count+1", which suggests one row per
    # hash with the tally kept in the `count` column.  If so, COUNT(*) can
    # only ever be 0 or 1 here and `SELECT count` may have been intended --
    # confirm against the table schema before changing the query.
    dict_to_hash = {k: line[k] for k in line if k in constraint_keys}
    values = (dbhash.columnsDictToSHA(dict_to_hash), )

    cursor = dbconn.connection.cursor()
    try:
        cursor.execute(query, values)
        res = cursor.fetchall()
    finally:
        # Release the cursor even when execute/fetch raises.
        cursor.close()

    if len(res) > 0:
        return res[0][0]
    return 0
def countPOS(commit_interval=1000, print_interval=1000):
    """Tally leading and lagging POS hashes for every row of ``iambic_lines``.

    Each row is read with the columns named in ``leading_pos_keys`` and
    ``lagging_pos_keys``; both column groups are hashed with
    ``columnsDictToSHA`` and the hashes are upserted into
    ``leading_pos_counts`` / ``lagging_pos_counts``, incrementing ``count``
    when the hash already exists.

    Two connections are used: one streams the SELECT while the other
    performs the writes.

    Args:
        commit_interval: number of processed rows between commits on the
            write connection.
        print_interval: number of processed rows between progress messages.
    """
    select_query = """SELECT """ + ", ".join(leading_pos_keys + lagging_pos_keys) + """ FROM iambic_lines"""
    # The upsert statements never change, so build them once.
    leading_upsert = """INSERT INTO leading_pos_counts (leading_4gram, count) VALUES (%s, 1) ON DUPLICATE KEY UPDATE count=count+1;"""
    lagging_upsert = """INSERT INTO lagging_pos_counts (lagging_4gram, count) VALUES (%s, 1) ON DUPLICATE KEY UPDATE count=count+1;"""

    read_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
    try:
        write_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
        try:
            read_cursor = read_conn.cursor(dictionary=True)
            write_cursor = write_conn.cursor()

            # rowcount of a non-buffered cursor is -1 until rows have been
            # fetched, so ask the server for the total up front instead.
            read_cursor.execute("""SELECT COUNT(*) AS total FROM iambic_lines""")
            toWrite = read_cursor.fetchone()["total"]

            read_cursor.execute(select_query)
            written = 0
            commit_timer = commit_interval
            print_timer = print_interval
            for row in read_cursor:
                leading_dict = {k: row[k] for k in row if k in leading_pos_keys}
                lagging_dict = {k: row[k] for k in row if k in lagging_pos_keys}
                write_cursor.execute(leading_upsert, (columnsDictToSHA(leading_dict), ))
                write_cursor.execute(lagging_upsert, (columnsDictToSHA(lagging_dict), ))

                written = written + 1
                print_timer = print_timer - 1
                if print_timer == 0:
                    # Single-argument call form prints the same text under
                    # both the print statement and the print function.
                    print("Updated {} of {}".format(written, toWrite))
                    print_timer = print_interval
                commit_timer = commit_timer - 1
                if commit_timer == 0:
                    write_conn.commit()
                    commit_timer = commit_interval

            read_cursor.close()
            # Flush whatever is left since the last periodic commit.
            write_conn.commit()
        finally:
            write_conn.close()
    finally:
        read_conn.close()
def populatePOSHashes(commit_interval=1000, print_interval=1000):
    """Fill ``pos_hashes`` with six precomputed POS hashes per ``iambic_lines`` row.

    For every row, ``columnsDictToSHA`` is applied to each column group
    listed in ``tasks`` below and one row is inserted into ``pos_hashes``:
    the line ``id`` followed by the leading 4/3/2-gram hashes and then the
    lagging 4/3/2-gram hashes.

    Two connections are used: one streams the SELECT while the other
    performs the writes.

    Args:
        commit_interval: number of processed rows between commits on the
            write connection.
        print_interval: number of processed rows between progress messages.
    """
    select_query = """SELECT id, pos_m2, pos_m1, pos_0, pos_1, pos_len_m2, pos_len_m1, pos_len, pos_len_p1 FROM iambic_lines"""
    # NOTE(review): no column list is given, so the VALUES order below must
    # match the column order of pos_hashes -- confirm against the schema.
    insert_query = """INSERT INTO pos_hashes VALUES (%s, %s, %s, %s, %s, %s, %s)"""

    # (hash name, source columns), listed in the order the hashes are
    # passed to the INSERT after the id.
    tasks = (
        ("leading_4gram", ["pos_m2", "pos_m1", "pos_0", "pos_1"]),
        ("leading_3gram", ["pos_m2", "pos_m1", "pos_0"]),
        ("leading_2gram", ["pos_m1", "pos_0"]),
        ("lagging_4gram", ["pos_len_m2", "pos_len_m1", "pos_len", "pos_len_p1"]),
        ("lagging_3gram", ["pos_len_m2", "pos_len_m1", "pos_len"]),
        ("lagging_2gram", ["pos_len_m1", "pos_len"]),
    )

    read_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
    try:
        write_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
        try:
            read_cursor = read_conn.cursor(dictionary=True)
            write_cursor = write_conn.cursor()
            read_cursor.execute(select_query)

            written = 0
            commit_timer = commit_interval
            print_timer = print_interval
            for row in read_cursor:
                shas = tuple(
                    columnsDictToSHA({key: row[key] for key in columns})
                    for _task_title, columns in tasks
                )
                write_cursor.execute(insert_query, (row["id"], ) + shas)

                written += 1
                print_timer = print_timer - 1
                if print_timer == 0:
                    # Single-argument call form prints the same text under
                    # both the print statement and the print function.
                    print("Updated {}".format(written))
                    print_timer = print_interval
                commit_timer = commit_timer - 1
                if commit_timer == 0:
                    write_conn.commit()
                    commit_timer = commit_interval

            # Flush whatever is left since the last periodic commit.
            write_conn.commit()
            read_cursor.close()
        finally:
            write_conn.close()
    finally:
        read_conn.close()