def calculate_TF_IDF():
    """Compute term frequency (tf) and document frequency (df) for every
    keyword and persist them in the database.

    Reads the distinct words from ``clean_keywords``, tallies per-document
    occurrences from the ``keywords`` table, then writes ``df`` back to
    ``clean_keywords`` and ``tf`` back to ``keywords``.

    Fix: ``df`` now counts the number of *distinct* documents containing a
    word (the standard document-frequency definition) instead of counting
    every occurrence row, which previously duplicated the tf information.
    """
    conn = util.getDBConnection()
    sql = "select word from clean_keywords"
    print(sql)
    rows = util.executeSQL(conn, sql)
    word_tf = {}    # "word:doc_id" -> occurrence count within that document
    word_docs = {}  # word -> set of distinct doc_ids containing it
    for row in rows:
        word = row[0]
        # NOTE(review): SQL built by string concatenation — a word containing
        # a quote breaks the query (injection risk). Switch to parameterized
        # queries if util.executeSQL supports placeholders.
        sql1 = "select doc_id from keywords where name='" + word + "'"
        print(sql1)
        res = util.executeSQL(conn, sql1)
        for row1 in res:
            pkg_id = row1[0]
            key = word + ':' + str(pkg_id)
            word_tf[key] = word_tf.get(key, 0) + 1
            # df must count distinct documents, so collect doc ids in a set
            # rather than incrementing once per occurrence row.
            word_docs.setdefault(word, set()).add(pkg_id)
    for word, docs in word_docs.items():
        sql = 'update clean_keywords set df=' + str(
            len(docs)) + " where word='" + word + "'"
        print(sql)
        util.executeSQL(conn, sql)
    for word_pkgid, tf in word_tf.items():
        # rsplit on the last ':' so a word that itself contains ':' still
        # round-trips correctly (doc_id is always the numeric suffix).
        word, pkg_id = word_pkgid.rsplit(":", 1)
        sql = 'update keywords set tf=' + str(
            tf) + " where name='" + word + "' and doc_id=" + str(pkg_id)
        print(sql)
        util.executeSQL(conn, sql)
def calculate_TF_IDF():
    """Tally term frequency and document frequency counts for all keywords
    and store them in the ``keywords`` and ``clean_keywords`` tables.
    """
    conn = util.getDBConnection()
    query = "select word from clean_keywords"
    print(query)
    word_rows = util.executeSQL(conn, query)
    tf_by_key = {}    # "word:doc_id" -> occurrence count
    df_by_word = {}   # word -> count of matching keyword rows seen
    for word_row in word_rows:
        term = word_row[0]
        doc_query = "select doc_id from keywords where name='" + term + "'"
        print(doc_query)
        for doc_row in util.executeSQL(conn, doc_query):
            doc_id = doc_row[0]
            tf_key = term + ':' + str(doc_id)
            tf_by_key[tf_key] = tf_by_key.get(tf_key, 0) + 1
            df_by_word[term] = df_by_word.get(term, 0) + 1
    for term, df in df_by_word.items():
        stmt = 'update clean_keywords set df=' + str(
            df) + " where word='" + term + "'"
        print(stmt)
        util.executeSQL(conn, stmt)
    for tf_key, tf in tf_by_key.items():
        term, doc_id = tf_key.split(":")
        stmt = 'update keywords set tf=' + str(
            tf) + " where name='" + term + "' and doc_id=" + str(doc_id)
        print(stmt)
        util.executeSQL(conn, stmt)
print sql util.executeSQL(conn, sql) for word_pkgid, tf in word_tf.iteritems(): word, pkg_id = word_pkgid.split(":") sql = 'update keywords set tf=' + str( tf) + " where name='" + word + "' and doc_id=" + str(pkg_id) print sql util.executeSQL(conn, sql) if __name__ == '__main__': try: kwd_index = 1 pkg_id = 1 conn = util.getDBConnection() delete_table_data() # delete the existing data # @todo(Argparse) for _dir in os.listdir(constants.PATH): insert_package(_dir, pkg_id) # @todo(Logging) print _dir _files = get_package_files(os.path.join(constants.PATH, _dir)) for root, _file in _files: process_file(root, _file, pkg_id) if has_enough_keywords(pkg_id): pkg_id += 1 populate_clean_keywords() calculate_TF_IDF() except Exception as e:
sql = 'update clean_keywords set df=' + str(df) + " where word='" + word + "'" print sql util.executeSQL(conn, sql) for word_pkgid, tf in word_tf.iteritems(): word, pkg_id = word_pkgid.split(":") sql = 'update keywords set tf=' + str(tf) + " where name='" + word + "' and doc_id=" + str(pkg_id) print sql util.executeSQL(conn, sql) if __name__ == '__main__': try: kwd_index = 1 pkg_id = 1 conn = util.getDBConnection() delete_table_data() # delete the existing data # @todo(Argparse) for _dir in os.listdir(constants.PATH): insert_package(_dir, pkg_id) # @todo(Logging) print _dir _files = get_package_files(os.path.join(constants.PATH, _dir)) for root, _file in _files: process_file(root, _file, pkg_id) if has_enough_keywords(pkg_id): pkg_id += 1 populate_clean_keywords() calculate_TF_IDF() except Exception as e: