def clusterSameFloor(self, childList):
    """Split a sequence of sibling DOM nodes into clusters of link hrefs.

    For each candidate gap threshold (largest first), cut the sequence
    wherever a node's average link word-count deviates from the running
    mean of the current segment by at least that gap.  The first
    partition whose every segment is "calm" enough (chaotic score below
    self.ct) wins.

    :param childList: ordered list of DOM nodes, each containing links
        annotated with a 'wordNum' attribute (set elsewhere — confirm).
    :return: list of clusters, each a list of 'href' strings, or None
        when no threshold yields an acceptable partition.
    """
    # Average link word-count per child node.
    avgNumList = []
    for child in childList:
        numList = [link['wordNum'] for link in getLink(child)]
        # NOTE(review): raises ZeroDivisionError if a child has no links —
        # presumably callers pass only link-bearing nodes; confirm.
        avgNumList.append(sum(numList)/len(numList))
    # Prefix sums of the averages: sumList[i] = sum(avgNumList[:i]).
    sumList = [0]
    for i in range(1, len(avgNumList)+1):
        sumList.append(sumList[i-1]+avgNumList[i-1])
    # Candidate gaps: |value at j - mean of segment (i, j)| for every i < j.
    # The leading 0 guarantees a fallback threshold that splits everywhere.
    gapList = [0]
    for i in range(0, len(avgNumList)-1):
        for j in range(i+1, len(avgNumList)):
            gapList.append(abs(avgNumList[j]-(sumList[j]-sumList[i]+0.0)/(j-i)))
    gapList.sort(reverse = True)
    # Try the loosest (largest) gap first; smaller gaps give finer partitions.
    for gap in gapList:
        nodeClusterList = []
        start = 0
        for i in range(1, len(avgNumList)):
            # Cut before i when it deviates from the current segment mean by >= gap.
            if abs(avgNumList[i]-(sumList[i]-sumList[start]+0.0)/(i-start)) >= gap:
                nodeClusterList.append(childList[start:i])
                start = i
        nodeClusterList.append(childList[start:])
        # Accept the partition only if every segment's word-count variance
        # (getChaotic) is below the chaos threshold self.ct.
        if all([self.getChaotic(cluster) < self.ct for cluster in nodeClusterList]):
            clusterList = []
            for cluster in nodeClusterList:
                oneCluster = []
                for node in cluster:
                    oneCluster += [link['href'] for link in getLink(node)]
                clusterList.append(oneCluster)
            return clusterList
    return None
def clusterLink(domTree, clusterList, md1, md2):
    """Recursively cluster the links under domTree.

    Appends finished clusters to clusterList in place.  Returns True when
    the whole subtree still forms (at most) one cluster and should be
    merged by the caller, False when its links were already emitted.
    """
    # Children that actually carry links; nodes that make getLink blow up
    # (e.g. bare text nodes) are skipped on purpose.
    linkBearing = []
    for node in domTree.children:
        try:
            if getLink(node):
                linkBearing.append(node)
        except:
            continue
    # Leaf-ish or link-free subtree: nothing to split here.
    if not linkBearing:
        return True
    if not getLink(domTree):
        return True
    # Recurse first; each flag says whether that child is still one cluster.
    childFlags = [clusterLink(node, clusterList, md1, md2) for node in linkBearing]
    if all(childFlags):
        # Every child is cohesive — check whether the whole level is too.
        if md1.isOneCluster(domTree) and md2.isOneCluster([domTree]):
            return True
        clusterList += md1.clusterSameFloor(linkBearing, md2)
        return False
    # Mixed case: group maximal runs of still-cohesive children, flushing a
    # run whenever an already-emitted child interrupts it.
    pending = []
    for node, cohesive in zip(linkBearing, childFlags):
        if cohesive:
            pending.append(node)
        else:
            clusterList += md1.clusterSameFloor(pending, md2)
            pending = []
    clusterList += md1.clusterSameFloor(pending, md2)
    return False
def getMinGap(self, dom1, dom2):
    """Smallest pairwise distance between any link of dom1 and any link
    of dom2, or None when either subtree has no links."""
    links1 = getLink(dom1)
    links2 = getLink(dom2)
    if not (links1 and links2):
        return None
    return min(self.getDis(a, b) for a in links1 for b in links2)
def findSubTree(dom, linkSet):
    """Return the last descendant (document order) whose link hrefs cover
    linkSet, i.e. the deepest subtree still containing every link; falls
    back to dom itself when no descendant qualifies."""
    best = dom
    for candidate in dom.descendants:
        hrefs = set([link['href'] for link in getLink(candidate)])
        if linkSet <= hrefs:
            best = candidate
    return best
def run(dataDir):
    """Tally navigation vs. non-navigation links over every *.html file
    in dataDir.

    Links whose 'cluster' attribute is 'nav' or 'list' count as
    navigation; links missing those attributes count as non-navigation
    under the sentinel index -1.

    :param dataDir: directory containing annotated .html files.
    """
    htmlList = listdir(dataDir)
    htmlList = [html for html in htmlList if html.split('.')[-1] == 'html']
    naviNum = 0
    nNaviNum = 0
    naviDic = {}
    naviDic['navi'] = set()
    naviDic['nNavi'] = set()
    for html in htmlList:
        try:
            dom = bs(open(path.join(dataDir, html)))
        except Exception as e:  # was Python-2-only "except Exception, e"
            print(e)
            # NOTE(review): break aborts the whole run on the first bad
            # file — confirm this is intended (continue seems more natural).
            break
        for link in getLink(dom):
            # bs4 attribute access raises KeyError when the attribute is
            # absent — narrowed from a bare except.
            try:
                if link['cluster'] == 'nav' or link['cluster'] == 'list':
                    naviNum += 1
                    naviDic['navi'].add(link['clusterindex'])
                else:
                    nNaviNum += 1
                    naviDic['nNavi'].add(link['clusterindex'])
            except KeyError:
                naviDic['nNavi'].add(-1)
                nNaviNum += 1
def clusterLinkAgg(dom, para=1):
    """Single-link agglomerative clustering of the links in dom.

    Repeatedly merges the closest pair of clusters until the smallest
    inter-cluster distance exceeds the threshold md.dt (or one cluster
    remains).  Returns the list of clusters, or None on distance failure.
    """
    md = ldi.linkDistance(dom, para)
    clusters = [[link] for link in getLink(dom)]
    while len(clusters) > 1:
        try:
            # Seed the search with the first pair; an exception here aborts
            # the whole run (preserving the original best-effort behavior).
            best = singleLink(clusters[0], clusters[1], md)
        except:
            print(len(clusters))
            return
        keep, absorb = 0, 1
        # Exhaustive scan for the closest pair of clusters.
        for i in range(0, len(clusters)):
            for j in range(i + 1, len(clusters)):
                d = singleLink(clusters[i], clusters[j], md)
                if d < best:
                    best = d
                    keep, absorb = i, j
        if best > md.dt:
            break
        clusters[keep] += clusters[absorb]
        clusters.pop(absorb)
    return clusters
def runClusterLink(dom, md1, md2):
    """Drive clusterLink over the whole DOM and return non-empty clusters.

    If clusterLink reports the entire tree is one cluster, all of the
    document's links are emitted as a single cluster.
    """
    clusterList = []
    if clusterLink(dom, clusterList, md1, md2):
        wholePage = getLink(dom)
        if wholePage:
            clusterList.append(wholePage)
    return [cluster for cluster in clusterList if cluster]
def main():
    """Print per-link word counts before and after linkChaotic annotation
    for the page named by sys.argv[1]."""
    dom = bs(open('../../data/clean_eval/' + sys.argv[1] + '.html'))
    links = getLink(dom)
    # Raw word counts from the link text itself.
    print([len(link.text.split()) for link in links])
    # linkChaotic presumably annotates links with 'wordNum' — kept for its
    # side effect; confirm.
    lc = linkChaotic(dom)
    print([link['wordNum'] for link in links])
def clusterSameFloor(self, childList, md2):
    """Group a run of sibling nodes into link clusters.

    Consecutive siblings stay in the same cluster unless their minimum
    link distance exceeds self.dt or the sibling span between them is not
    one cluster according to md2.  Returns a list of link lists.
    """
    if not childList:
        return []
    groups = []
    current = getLink(childList[0])
    for prev, node in zip(childList, childList[1:]):
        # All siblings from prev up to and including node, in document order.
        siblings = list(prev.next_siblings)
        span = [prev] + siblings[:siblings.index(node) + 1]
        tooFar = self.getMinGap(node, prev) > self.dt
        if tooFar or not md2.isOneCluster(span):
            groups.append(current)
            current = getLink(node)
        else:
            current += getLink(node)
    groups.append(current)
    return groups
def getChaotic(self, domTreeList):
    """Variance of the 'wordNum' attribute over all links found under the
    given DOM subtrees; None when no links are found.  Subtrees on which
    getLink fails are silently skipped."""
    links = []
    for tree in domTreeList:
        try:
            links += getLink(tree)
        except:
            continue
    if not links:
        return None
    return np.var(np.array([float(link['wordNum']) for link in links]))
def isOneCluster(self, domTree):
    """True when every pair of consecutive link-bearing children of
    domTree lies within the distance threshold self.dt."""
    linkBearing = []
    for child in domTree.children:
        try:
            if getLink(child):
                linkBearing.append(child)
        except:
            continue
    maxGap = 0
    for prev, node in zip(linkBearing, linkBearing[1:]):
        maxGap = max(self.getMinGap(prev, node), maxGap)
    return maxGap <= self.dt
def clusterLinkKM(dom, k=2):
    """KMeans clustering of the page's links by their document index.

    :param dom: parsed page; linkDistance is constructed first (kept for
        its side effects — it presumably annotates link indices; confirm).
    :param k: number of clusters.
    :return: list of k link lists (some possibly empty before filtering),
        or [] when the page has no links.
    """
    md = ldi.linkDistance(dom, 1)
    links = getLink(dom)
    features = [[link['index']] for link in links]
    if not features:
        return []
    labels = KMeans(k).fit_predict(array(features))
    buckets = [[] for _ in range(k)]
    for link, label in zip(links, labels):
        buckets[label].append(link)
    return buckets
def denThres(self, domTree):
    """Compute the link-density threshold for domTree.

    Collects a word pool from every link: the link text, the alt text of
    each contained image (or a placeholder 'string'), and a placeholder
    'string' for image-free links.  The threshold is
    alpha * (L + beta) / (T + L + beta), where L is the pooled link word
    count and T the subtree's total word count (beta smooths both).

    :return: float threshold in (0, alpha].
    """
    linkList = getLink(domTree)
    linkStrList = []
    for link in linkList:
        linkStrList.append(link.text)
        imgList = link.find_all('img')
        if not imgList:
            # Placeholder so image-free links still contribute one word.
            linkStrList.append('string')
        # FIX: iterate the already-computed imgList instead of calling
        # link.find_all('img') a second time.
        for img in imgList:
            if 'alt' in img.attrs:
                linkStrList.append(img['alt'])
            else:
                linkStrList.append('string')
    linkStrLen = sum([len(linkStr.split()) for linkStr in linkStrList])
    dt = self.alpha * (linkStrLen + self.beta) / (
        float(len(domTree.text.split())) + linkStrLen + self.beta)
    return dt
def getLinkDen(self, subDomList):
    """Link density of the given DOM subtrees.

    Pools link words (link text, image alt text, placeholder 'string'
    tokens) across all subtrees and returns
    (L + beta) / (T + L + beta) with L the pooled link word count and
    T the text word count of... see NOTE below.
    """
    linkList = []
    for subDom in subDomList:
        linkList += getLink(subDom)
    linkStrList = []
    for link in linkList:
        linkStrList.append(link.text)
        imgList = link.find_all('img')
        if not imgList:
            # Placeholder so image-free links still contribute one word.
            linkStrList.append('string')
        for img in link.find_all('img'):
            if 'alt' in img.attrs:
                linkStrList.append(img['alt'])
            else:
                linkStrList.append('string')
    linkStrLen = sum([len(linkStr.split()) for linkStr in linkStrList])
    # NOTE(review): subDom here is the leftover loop variable from the
    # first loop, so only the LAST subtree's text length enters the
    # denominator (and this raises NameError on an empty subDomList).
    # A sum over all subtrees' text looks intended — confirm with callers.
    ld = (linkStrLen + self.beta) / (float(len(subDom.text.split())) +
                                    linkStrLen + self.beta)
    return ld
def disThres(self, domTree, alpha=1):
    """Pick a distance threshold from the consecutive-link gaps in domTree.

    Sorts the non-zero gaps descending (with a sentinel 0 appended) and
    scores position i as gap[i]*n*alpha + i*gap[0]; the gap at the
    best-scoring (knee) position becomes the threshold.
    """
    linkList = getLink(domTree)
    gaps = [0]
    for prev, cur in zip(linkList, linkList[1:]):
        gap = self.getDis(cur, prev)
        if gap:
            gaps.append(gap)
    gaps.sort(reverse=True)
    n = len(gaps)
    largest = gaps[0]
    scores = [gaps[i] * n * alpha + i * largest for i in range(n)]
    # index of the first minimum, matching the original strict-< scan.
    best = scores.index(min(scores))
    return gaps[best]
def clusterLink1D(dom, para=1):
    """1-D link clustering: walk the links in document order and cut the
    sequence wherever the gap between consecutive links exceeds the
    distance threshold md.dt.

    :param dom: parsed page.
    :param para: parameter forwarded to ldi.linkDistance.
    :return: list of link clusters (non-empty lists); [] for a link-free page.
    """
    md = ldi.linkDistance(dom, para)
    linkList = getLink(dom)
    if not linkList:
        return []
    linkClusterList = []
    tmp = [linkList[0]]
    for i in range(1, len(linkList)):
        if md.getDis(linkList[i], linkList[i - 1]) > md.dt:
            linkClusterList.append(tmp)
            # BUG FIX: start the new cluster WITH the boundary link.  The
            # original reset tmp to [], silently dropping linkList[i] from
            # every cluster (compare clusterSameFloor, which keeps it).
            tmp = [linkList[i]]
        else:
            tmp.append(linkList[i])
    if tmp:
        linkClusterList.append(tmp)
    return [cluster for cluster in linkClusterList if cluster]
def clusterLinkSC(dom, k=2, affinity='nearest_neighbors', n_neighbors=1,
                  gamma=0.021):
    """Spectral clustering of the page's links by their document index.

    NOTE: affinity and n_neighbors are accepted for interface
    compatibility but are not forwarded to SpectralClustering — only k
    and gamma are used (as in the original).
    """
    md = ldi.linkDistance(dom, 1)
    links = getLink(dom)
    features = [[link['index']] for link in links]
    if not features:
        return []
    labels = SpectralClustering(k, gamma=gamma).fit_predict(array(features))
    buckets = [[] for _ in range(k)]
    for link, label in zip(links, labels):
        buckets[label].append(link)
    return buckets
def runK(dataDir, clmd):
    """Evaluate clustering method clmd on every *.html file in dataDir.

    For each page, k is taken as the number of distinct 'clusterindex'
    ground-truth labels; pages where clmd raises are skipped (traceback
    printed).

    :param clmd: callable (dom, k) -> link clusters.
    :return: [ariList, amiList] — adjusted Rand / mutual-info scores per page.
    """
    # (removed unused kMin/kMax locals from the original)
    htmlList = listdir(dataDir)
    htmlList = [html for html in htmlList if html.split('.')[-1] == 'html']
    ariList = []
    amiList = []
    for html in htmlList:
        dom = bs(open(path.join(dataDir, html)))
        linkList = getLink(dom)
        groundLabels = set()
        for link in linkList:
            if 'clusterindex' in link.attrs:
                groundLabels.add(link['clusterindex'])
        try:
            linkClusterList = clmd(dom, len(groundLabels))
        except:
            print(tb.format_exc())
            continue
        ariList.append(cm.ari(linkClusterList))
        amiList.append(cm.ami(linkClusterList))
    return [ariList, amiList]
def clusterLinkDB(dom, para=0.5):
    """DBSCAN clustering of the page's links by their document index,
    using the distance threshold md.dt as eps.

    :param dom: parsed page.
    :param para: parameter forwarded to ldi.linkDistance.
    :return: list of non-empty link clusters; [] for a link-free page.
    """
    md = ldi.linkDistance(dom, para)
    linkList = getLink(dom)
    # 1-D feature: the link's document-order index.
    a = [[link['index']] for link in linkList]
    #a = [[link['index'], link.parent['index']] for link in linkList]
    #a = [[link['index'], link['link_color'], link['link_size']] for link in linkList]
    #a = [[link['index'], link['link_color'], link['link_size'], link.parent['index']] for link in linkList]
    if not a:
        return []
    if not md.dt:
        # NOTE(review): this returns the FLAT link list, while every other
        # path returns a list of clusters — callers likely expect
        # [linkList]; confirm before changing.
        return linkList
    #cluster = DBSCAN(md.dt*1.414, min_samples=1)
    # min_samples=1 means no link is ever labeled noise.
    cluster = DBSCAN(md.dt, min_samples=1)
    labelList = cluster.fit_predict(array(a))
    tmpSet = set()
    for label in labelList:
        tmpSet.add(label)
    linkClusterList = [[] for i in range(0, len(tmpSet))]
    for i in range(0, len(labelList)):
        linkClusterList[labelList[i]].append(linkList[i])
    linkClusterList = [
        linkCluster for linkCluster in linkClusterList if linkCluster
    ]
    return linkClusterList
def main():
    """SVM link-classification experiment (Python 2 script).

    Loads up to 100 pages, builds a link feature matrix with ground-truth
    labels, then for a range of training-set ratios repeatedly shuffles,
    trains an RBF SVC, and prints averaged precision/recall/F1/accuracy.
    """
    htmlFileDir = '../../data/cleanEval'
    # The reassignment below overrides the path above — last one wins.
    htmlFileDir = '../../data/SSD/Big5/techweb.com'
    #htmlFileDir = '../../data/SSD/myriad40'
    '''
    for num in range(0, 10):
        htmlFilePath = path.join(htmlFileDir, str(num+1)+'.html')
        try:
            (oLinkMatrix, oGroundList, oClusterIndex) = genMatrix([bs(open(htmlFilePath))])
            scaler = MinMaxScaler()
            linkMatrix = scaler.fit_transform(oLinkMatrix)
            est = KMeans(n_clusters=2)
            y = est.fit_predict(linkMatrix)
            print metric(y, oGroundList)
        except:
            print(tb.format_exc())
            continue
    '''
    domList = []
    total = 0
    testRatio = 0.5
    search = False  # True enables the grid search over C/gamma below.
    # Load pages 1.html .. 100.html; unreadable/unparsable files are skipped.
    for num in range(0, 100):
        htmlFilePath = path.join(htmlFileDir, str(num + 1) + '.html')
        try:
            domList.append(bs(open(htmlFilePath)))
            total += len(getLink(domList[-1]))
        except:
            continue
    print(total)
    (oLinkMatrix, oGroundList, oClusterIndex) = genMatrix(domList)
    # Pair each feature row with its label so a single shuffle keeps them aligned.
    dataList = [[oLinkMatrix[i], oGroundList[i]]
                for i in range(0, len(oGroundList))]
    job_n = 16
    # NOTE(review): Python-2-only — range() + range() concatenates lists;
    # under Python 3 this raises TypeError.  Ratios 1%..10% then 10%..100%
    # (10% appears twice).
    trainRatioList = [
        float(i) / 100 for i in range(1, 11) + range(10, 101, 10)
    ]
    for trainRatio in trainRatioList:
        turnNum = 100
        turn = 0
        precision = recall = f1_score = accuracy = 0
        # NOTE(review): a turn that raises is retried silently via
        # except: continue — a persistent error makes this loop spin forever.
        while turn < turnNum:
            try:
                random.shuffle(dataList)
                linkMatrix = [dataList[i][0] for i in range(0, len(dataList))]
                groundList = [dataList[i][1] for i in range(0, len(dataList))]
                # First testBound rows are implicitly reserved; training uses
                # a trainRatio-sized slice of the non-test half.
                testBound = int(testRatio * len(groundList))
                upperBound = int(trainRatio * (1 - testRatio) * len(groundList))
                scaler = StandardScaler()
                linkMatrix = scaler.fit_transform(linkMatrix)
                grid = None
                if search:
                    C_range = np.logspace(-2, 10, 13)
                    gamma_range = np.logspace(-9, 3, 13)
                    param_grid = dict(gamma=gamma_range, C=C_range)
                    cv = StratifiedShuffleSplit(groundList[0:upperBound],
                                                n_iter=5, test_size=0.2,
                                                random_state=42)
                    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv,
                                        n_jobs=job_n)
                    grid.fit(linkMatrix[0:upperBound], groundList[0:upperBound])
                    clf = SVC(C=grid.best_params_['C'],
                              gamma=grid.best_params_['gamma'])
                else:
                    # NOTE(review): C is computed (with Python-2 integer
                    # division inside float()) but never used — the SVC below
                    # hard-codes C=1.  Confirm which was intended.
                    C = float(upperBound / sum(groundList[0:upperBound]))
                    clf = SVC(C=1, gamma=0.10, kernel='rbf')
                clf.fit(linkMatrix[0:upperBound], groundList[0:upperBound])
                # NOTE(review): evaluation starts at testBound+1, skipping
                # row testBound — possible off-by-one; confirm.
                predict = clf.predict(linkMatrix[testBound + 1:])
                tmp = metric(predict, groundList[testBound + 1:])
                precision += tmp[0]
                recall += tmp[1]
                f1_score += tmp[2]
                accuracy += tmp[3]
                turn += 1
            except:
                continue
        # Averages over turnNum successful turns (Python 2 print statement).
        print "%s, %s, %s, %s" % (precision / turnNum, recall / turnNum,
                                  f1_score / turnNum, accuracy / turnNum)
def calLinkRatio(dom, smooth=0.0):
    """Ratio of link count to word count in dom, with additive smoothing
    applied to both numerator and denominator."""
    linkTotal = len(getLink(dom)) + smooth
    wordTotal = len(dom.text.split()) + smooth
    return linkTotal / wordTotal