Esempio n. 1
0
def optimizedConstraints(constraints):
    """Replace groups of part-of-speech constraints with precomputed hash columns.

    For every entry of the module-level ``pos_config_sets``, the keys of
    ``constraints`` that also appear in that entry's ``"constraints"`` are
    collected. When at least two are present they are removed from the result
    and replaced by a single key whose value is ``dbhash.columnsDictToSHA``
    of the matched columns.

    Args:
        constraints: mapping of column name -> constraint value. It is not
            modified; a shallow copy is edited and returned.

    Returns:
        A new dict in which each matched group of columns has been swapped
        for its hash column.
    """
    mod_constraints = dict(constraints)

    for pos_config in pos_config_sets:
        pos_constraints = pos_config["constraints"]
        column_names = pos_config["names"]

        # Build a real list: the result is measured with len() and iterated
        # more than once, which a lazy ``filter`` object (Python 3) cannot do.
        pos_columns = [col for col in constraints if col in pos_constraints]

        # Hashing only pays off when at least two columns are combined.
        if len(pos_columns) < 2:
            continue

        dict_to_hash = {col: constraints[col] for col in pos_columns}
        dict_as_sha = dbhash.columnsDictToSHA(dict_to_hash)

        # ``names`` is indexed by group size: 2 columns -> names[0],
        # 3 columns -> names[1], and so on.
        hashed_column_name = column_names[len(pos_columns) - 2]
        for col in pos_columns:
            mod_constraints.pop(col)
        mod_constraints[hashed_column_name] = dict_as_sha

    return mod_constraints
Esempio n. 2
0
def posCountsForLine(dbconn, line, position='leading'):
    """Look up the part-of-speech 4-gram count for one line.

    The entries of ``line`` whose keys are in ``pos_next_constraints``
    (``position='leading'``) or ``pos_prev_constraints``
    (``position='lagging'``) are hashed with ``dbhash.columnsDictToSHA`` and
    the hash is looked up in ``leading_pos_counts`` / ``lagging_pos_counts``.

    Args:
        dbconn: object exposing a DB-API connection as ``dbconn.connection``.
        line: mapping of column name -> value for the line.
        position: ``'leading'`` or ``'lagging'``.

    Returns:
        The first column of the first result row, or 0 when the query
        returns no rows.

    Raises:
        ValueError: if ``position`` is neither ``'leading'`` nor ``'lagging'``.
    """
    # Validate before touching the database so a bad argument never opens
    # (and leaks) a cursor.
    # NOTE(review): COUNT(*) counts matching rows rather than reading a stored
    # tally column; if these tables hold one row per hash with its own count
    # column, this yields only 0 or 1 -- confirm which is intended.
    if position == 'leading':
        pos_keys = pos_next_constraints
        query = """SELECT COUNT(*) FROM leading_pos_counts WHERE leading_4gram=%s"""
    elif position == 'lagging':
        pos_keys = pos_prev_constraints
        query = """SELECT COUNT(*) FROM lagging_pos_counts WHERE lagging_4gram=%s"""
    else:
        raise ValueError("position must be one of ['leading', 'lagging']")

    dict_to_hash = {k: line[k] for k in line if k in pos_keys}
    values = (dbhash.columnsDictToSHA(dict_to_hash), )

    cursor = dbconn.connection.cursor()
    try:
        cursor.execute(query, values)
        res = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    if len(res) > 0:
        return res[0][0]
    return 0
Esempio n. 3
0
def countPOS(commit_interval=1000, print_interval=1000):
    """Tally leading and lagging part-of-speech hashes over all iambic lines.

    Streams the ``leading_pos_keys`` and ``lagging_pos_keys`` columns of
    ``iambic_lines`` over one connection. For each row, the hash of each
    column group is upserted into ``leading_pos_counts`` /
    ``lagging_pos_counts`` over a second connection, incrementing ``count``
    when the hash already exists.

    Args:
        commit_interval: commit the write connection every this many rows.
            A non-positive value disables periodic commits; the final commit
            still happens.
        print_interval: print a progress line every this many rows. A
            non-positive value disables progress output.
    """
    select_query = """SELECT """ + ", ".join(leading_pos_keys + lagging_pos_keys) + """ FROM iambic_lines"""
    leading_insert = """INSERT INTO leading_pos_counts (leading_4gram, count) VALUES (%s, 1) ON DUPLICATE KEY UPDATE count=count+1;"""
    lagging_insert = """INSERT INTO lagging_pos_counts (lagging_4gram, count) VALUES (%s, 1) ON DUPLICATE KEY UPDATE count=count+1;"""

    read_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
    try:
        write_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
        try:
            # A non-buffered cursor reports rowcount == -1 until rows have been
            # fetched, so the total is queried explicitly. This must happen
            # before the streaming SELECT starts on the same connection.
            count_cursor = read_conn.cursor()
            count_cursor.execute("""SELECT COUNT(*) FROM iambic_lines""")
            toWrite = count_cursor.fetchone()[0]
            count_cursor.close()

            read_cursor = read_conn.cursor(dictionary=True)
            write_cursor = write_conn.cursor()
            read_cursor.execute(select_query)

            written = 0
            for row in read_cursor:
                leading_dict = {k: row[k] for k in row if k in leading_pos_keys}
                lagging_dict = {k: row[k] for k in row if k in lagging_pos_keys}

                write_cursor.execute(leading_insert, (columnsDictToSHA(leading_dict), ))
                write_cursor.execute(lagging_insert, (columnsDictToSHA(lagging_dict), ))
                written += 1

                if print_interval > 0 and written % print_interval == 0:
                    # Single-argument call form prints identically on Python 2 and 3.
                    print("Updated {} of {}".format(written, toWrite))
                if commit_interval > 0 and written % commit_interval == 0:
                    write_conn.commit()

            read_cursor.close()
            # Flush whatever is left of the last partial batch.
            write_conn.commit()
        finally:
            # Closing without a commit discards the uncommitted batch on error.
            write_conn.close()
    finally:
        read_conn.close()
Esempio n. 4
0
def populatePOSHashes(commit_interval=1000, print_interval=1000):
    """Fill ``pos_hashes`` with part-of-speech n-gram hashes for every iambic line.

    Streams ``id`` and the eight part-of-speech columns of ``iambic_lines``
    over one connection. For each row it computes six hashes with
    ``columnsDictToSHA`` (leading and lagging 4-, 3- and 2-grams) and inserts
    ``(id, <six hashes>)`` into ``pos_hashes`` over a second connection.

    Args:
        commit_interval: commit the write connection every this many rows.
            A non-positive value disables periodic commits; the final commit
            still happens.
        print_interval: print a progress line every this many rows. A
            non-positive value disables progress output.
    """
    # Ordered to match the positional VALUES list of the INSERT below.
    # NOTE(review): the INSERT names no columns, so this order presumably
    # mirrors the pos_hashes table definition -- confirm against the schema.
    hash_columns = [
        ("leading_4gram", ["pos_m2", "pos_m1", "pos_0", "pos_1"]),
        ("leading_3gram", ["pos_m2", "pos_m1", "pos_0"]),
        ("leading_2gram", ["pos_m1", "pos_0"]),
        ("lagging_4gram", ["pos_len_m2", "pos_len_m1", "pos_len", "pos_len_p1"]),
        ("lagging_3gram", ["pos_len_m2", "pos_len_m1", "pos_len"]),
        ("lagging_2gram", ["pos_len_m1", "pos_len"]),
    ]
    select_query = """SELECT id, pos_m2, pos_m1, pos_0, pos_1, pos_len_m2, pos_len_m1, pos_len, pos_len_p1 FROM iambic_lines"""
    insert_query = """INSERT INTO pos_hashes VALUES (%s, %s, %s, %s, %s, %s, %s)"""

    read_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
    try:
        write_conn = mysql.connector.connect(user="******", password="******", host="localhost", database="wikisonnet")
        try:
            read_cursor = read_conn.cursor(dictionary=True)
            write_cursor = write_conn.cursor()
            read_cursor.execute(select_query)

            written = 0
            for row in read_cursor:
                shas = [
                    columnsDictToSHA({key: row[key] for key in columns})
                    for _title, columns in hash_columns
                ]
                write_cursor.execute(insert_query, (row["id"], ) + tuple(shas))
                written += 1

                if print_interval > 0 and written % print_interval == 0:
                    # Single-argument call form prints identically on Python 2 and 3.
                    print("Updated {}".format(written))
                if commit_interval > 0 and written % commit_interval == 0:
                    write_conn.commit()

            # Flush whatever is left of the last partial batch.
            write_conn.commit()
            read_cursor.close()
        finally:
            # Closing without a commit discards the uncommitted batch on error.
            write_conn.close()
    finally:
        read_conn.close()