def clustering_txt(app, tag):
    """Cluster the difference-sentence candidates of one group and store them.

    The candidate records for ``(app, tag)`` are fetched with
    ``get_all_sentence_records``. Their sentences (element 2 of each record)
    are clustered with single-linkage hierarchical clustering, cut at
    ``T_THRESHOLD`` on the distance criterion. One row per candidate is then
    inserted into the ``cluster_txt`` table.

    Args:
        app: Application identifier, passed through to the record lookup.
        tag: Duplicate-group tag, passed through to the record lookup.

    Returns:
        None. Nothing is written when there are no candidate records.

    Notes:
        Inserts are best-effort: a failing row prints its traceback and the
        loop continues with the next row. Each row is committed on its own.
    """
    all_records = get_all_sentence_records(app, tag)
    record_count = len(all_records)

    if record_count == 0:
        # There is no difference item candidate.
        return

    if record_count == 1:
        # Only one candidate: it forms its own cluster, no linkage needed.
        clusters = [1]
    else:
        all_sentences = [x[2] for x in all_records]
        distance_matrix = calculate_distance_matrix(all_sentences)
        # linkage() takes the condensed (upper-triangle) form of the matrix.
        distArray = ssd.squareform(distance_matrix)
        Z = sch.linkage(distArray, method='single')
        clusters = sch.fcluster(Z, T_THRESHOLD, criterion='distance')

    # The statement is loop-invariant, so build it once.
    sql = (
        "INSERT INTO cluster_txt "
        "(app, duplicate_tag, diff_sentence, diff_sentence_index, report_id, cluster_id) "
        "VALUES (%s, %s, %s, %s, %s, %s)"
    )

    # Save result to database; always release the connection, even if
    # something outside the per-row try block raises.
    db = connect_db()
    try:
        cur = db.cursor()
        for i, cluster_id in enumerate(clusters):
            # Text cluster ids are prefixed with "100" (txt: 100xxx,
            # img: 200xxx) so the two kinds of id cannot collide.
            record = list(all_records[i]) + [int('100' + str(cluster_id))]
            try:
                cur.execute(sql, record)
                db.commit()
            except Exception:
                traceback.print_exc()
    finally:
        close_db(db)
def clustering_txt(app, tag):
    """Cluster the difference-sentence candidates of one group and store them.

    Args:
        app: Application identifier, passed through to the record lookup.
        tag: Group id of the duplicate group; stored as ``duplicate_tag``.

    Returns:
        None. Nothing is written when the group has no candidate records.

    Notes:
        Inserts are best-effort: a failing row prints its traceback and the
        loop continues with the next row. Each row is committed on its own.
    """
    # Retrieve all difference-sentence candidates of a certain group.
    # ``tag`` here is the group id.
    all_records = get_all_sentence_records(app, tag)
    record_count = len(all_records)

    if record_count == 0:
        # There is no difference item candidate.
        return

    if record_count == 1:
        # There is only one difference item candidate.
        clusters = [1]
    else:
        # More than one difference item candidate.
        all_sentences = [x[2] for x in all_records]
        distance_matrix = calculate_distance_matrix(all_sentences)
        # numpy.ndarray: the array formed from the upper triangle of the
        # square distance matrix (the condensed form linkage() expects).
        distArray = ssd.squareform(distance_matrix)
        # The linkage result can also be plotted. See
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
        Z = sch.linkage(distArray, method='single')
        # The cluster result for the diff sentence candidates of a group.
        clusters = sch.fcluster(Z, T_THRESHOLD, criterion='distance')

    # The statement is loop-invariant, so build it once.
    sql = (
        "INSERT INTO cluster_txt "
        "(app, duplicate_tag, diff_sentence, diff_sentence_index, report_id, cluster_id) "
        "VALUES (?,?, ?, ?, ?, ?)"
    )

    # Save result to database; always release the connection, even if
    # something outside the per-row try block raises.
    db = connect_db()
    try:
        cur = db.cursor()
        for i, cluster_id in enumerate(clusters):
            # The row is the fetched record plus the stored cluster id.
            # The id is the text "100" followed by the fcluster label
            # (txt: 100xxx, img: 200xxx), which keeps text and image
            # cluster ids distinct. Note: duplicate_tag is the group id.
            record = list(all_records[i]) + [int('100' + str(cluster_id))]
            try:
                cur.execute(sql, record)
                db.commit()
            except Exception:
                traceback.print_exc()
    finally:
        close_db(db)
def join(app, tag, all_txt_clusters, all_img_clusters, link_matrix):
    """Store the text/image cluster combinations of one group.

    Three kinds of rows are inserted into ``cluster_combine``, each with its
    own ``cluster_tag`` taken from a counter that starts at 0:

    * a text cluster with no linked image cluster (text id only),
    * one row per (text cluster, linked image cluster) pair,
    * every image cluster that was not linked to any text cluster
      (image id only).

    Args:
        app: Application identifier stored in every row.
        tag: Duplicate-group tag stored in every row.
        all_txt_clusters: Text cluster objects exposing ``get_cluster_id()``.
        all_img_clusters: Image cluster objects exposing ``get_cluster_id()``.
        link_matrix: Link information handed to
            ``get_all_linked_img_clusters`` together with the index of the
            text cluster.

    Returns:
        None.

    Raises:
        Whatever the database driver raises; the connection is closed
        before the exception propagates.
    """
    txt_only_sql = (
        "INSERT INTO cluster_combine (app, duplicate_tag, cluster_tag, "
        "cluster_id_txt) VALUES (%s, %s, %s, %s)"
    )
    txt_img_sql = (
        "INSERT INTO cluster_combine (app, duplicate_tag, cluster_tag, "
        "cluster_id_txt, cluster_id_img) VALUES (%s, %s, %s, %s, %s)"
    )
    img_only_sql = (
        "INSERT INTO cluster_combine (app, duplicate_tag, cluster_tag, "
        "cluster_id_img) VALUES (%s, %s, %s, %s)"
    )

    db = connect_db()
    try:
        cur = db.cursor()
        next_cluster_tag = 0
        linked_img_cluster_ids = set()  # ids only, not cluster objects

        for i, txt_cluster in enumerate(all_txt_clusters):
            txt_cluster_id = txt_cluster.get_cluster_id()
            linked_img_clusters_i = get_all_linked_img_clusters(
                link_matrix, i, all_img_clusters)

            if len(linked_img_clusters_i) == 0:
                # No image candidate cluster is linked with this cluster.
                cur.execute(
                    txt_only_sql,
                    (app, tag, next_cluster_tag, txt_cluster_id))
                db.commit()
                next_cluster_tag += 1

            for img_cluster in linked_img_clusters_i:
                # Save the relationship to database.
                img_cluster_id = img_cluster.get_cluster_id()
                cur.execute(
                    txt_img_sql,
                    (app, tag, next_cluster_tag, txt_cluster_id,
                     img_cluster_id))
                db.commit()
                next_cluster_tag += 1
                linked_img_cluster_ids.add(img_cluster_id)

        # Save the remaining (unlinked) image candidate clusters.
        unlinked_img_cluster_ids = (
            {x.get_cluster_id() for x in all_img_clusters}
            - linked_img_cluster_ids
        )
        for img_cluster_id in unlinked_img_cluster_ids:
            cur.execute(
                img_only_sql,
                (app, tag, next_cluster_tag, img_cluster_id))
            db.commit()
            next_cluster_tag += 1
    finally:
        # Release the connection even when an insert or commit fails.
        close_db(db)