def multipleRelationFiltering(patterns, websiteLocation, supervisedFileLocation,
                              artificialSeeds, preprocessType="None"):
    stats = {}
    patternIndex = 0
    pages = getAllPagesInsideWebsite(websiteLocation)
    for pattern in patterns:
        resultsPerPattern = []
        patternIndex += 1
        for page in pages:
            (lp, mp, rp) = pattern
            exactPageLocation = page + "/page.html"
            # gold-standard relations for this page, normalized to
            # whitespace-collapsed "key value" strings
            contentList = readFileRelationContentInList(page + "/" + supervisedFileLocation)
            expected = [k.strip() + " " + v.strip() for (k, v) in contentList]
            expected = [" ".join(item.split()) for item in expected]
            # relations extracted by the (lp, mp, rp) pattern, normalized the same way
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            computed = getAllRelations(lp, mp, rp, pageContent)
            computed = [k.strip() + " " + v.strip() for (k, v) in computed]
            computed = [" ".join(item.split()) for item in computed]
            # artificial seed relations for this page, normalized the same way
            artificialPerPage = [
                k.strip() + " " + v.strip() for (k, v) in artificialSeeds[page]
            ]
            artificialPerPage = [" ".join(item.split()) for item in artificialPerPage]
            resultsPerPattern.append((computed, expected, artificialPerPage))
        stats[patternIndex] = (pattern, resultsPerPattern)
    patterns = getPatternsFromStats(stats)
    return patterns
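# Usage sketch (not from the original code): how multipleRelationFiltering
# might be driven. The website path, supervised file name, the (lp, mp, rp)
# triplet, and the seed relations below are hypothetical placeholders; the
# shape of artificialSeeds (page directory -> list of (key, value) pairs)
# follows from how the function above indexes and unpacks it.
def exampleMultipleRelationFiltering():
    candidatePatterns = [("<tr><td>", "</td><td>", "</td></tr>")]
    seeds = {"data/website1/page1": [("brand", "Acme"), ("weight", "2 kg")]}
    return multipleRelationFiltering(candidatePatterns, "data/website1",
                                     "relations.txt", seeds)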
def singleObjectPatternFiltering(patterns, websiteLocation, supervisedFileLocation,
                                 artificialSeedSet, threshold=1000, preprocessType="None"):
    stats = {}
    patternIndex = 0
    pages = getAllPagesInsideWebsite(websiteLocation)
    for pattern in patterns:
        resultsPerPattern = []
        patternIndex += 1
        for page in pages:
            (lp, rp) = pattern
            exactPageLocation = page + "/page.html"
            # the supervised file is expected to hold exactly one gold object per page
            contentList = readFileContentInList(page + "/" + supervisedFileLocation)
            singleObj = ""
            if len(contentList) == 1:
                singleObj = contentList[0]
            goldContent = " ".join(singleObj.split())
            expected = [goldContent]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            computed = makeSingleObjectExtractions(pageContent, lp, rp, threshold)
            resultsPerPattern.append((computed, expected, artificialSeedSet[page]))
        stats[patternIndex] = (pattern, resultsPerPattern)
    patterns = getPatternsFromStats(stats)
    return patterns
# NOTE: if this lives in the same module as the stats-based variant above,
# this definition shadows it; rename one of the two to keep both callable.
def singleObjectPatternFiltering(patterns, websiteLocation, supervisedFileLocation,
                                 preprocessType="None"):
    output = []
    pages = getAllPagesInsideWebsite(websiteLocation)
    for pattern in patterns:
        (lp, rp) = pattern
        patternScore = 0
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" + supervisedFileLocation)
            singleObj = ""
            if len(contentList) == 1:
                singleObj = contentList[0]
            goldContent = " ".join(singleObj.split())
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            if preprocessType == "NUM":
                # replace numeric tokens so number-agnostic patterns can match
                pageContent = replaceNumWordsInStr(pageContent)
            results = makeSingleObjectExtractions(pageContent, lp, rp)
            if goldContent in results:
                patternScore += 1
        # keep a pattern if it recovers the gold object on at least one page
        if patternScore > 0:
            output.append((lp, rp))
    return output
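# Usage sketch (not from the original code): filtering candidate single-object
# patterns for one website. The corpus path, supervised file name, and the
# (lp, rp) boundary strings are hypothetical placeholders.
def exampleSingleObjectFiltering():
    candidatePatterns = [("<h1>", "</h1>"), ("<title>", "</title>")]
    kept = singleObjectPatternFiltering(candidatePatterns, "data/website1",
                                        "title.txt", preprocessType="NUM")
    # each surviving (lp, rp) pair recovered the gold object on at least one page
    return kept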
def getProdInfo(patterns, pageLocation):
    pageContent = readPlainHtmlPageContent(pageLocation)
    numProcessedPageContent = replaceNumWordsInStr(pageContent)
    titlePatterns = patterns.getProductTitlePatterns()
    specsPatterns = patterns.getProductSpecsPatterns()
    relationPatterns = patterns.getProductRelationPatterns()
    productTitles = extractProductTitles(pageContent, numProcessedPageContent,
                                         titlePatterns)
    productSpecs = extractProductSpecs(pageContent, numProcessedPageContent,
                                       specsPatterns)
    productRelations = extractProductRelations(pageContent, numProcessedPageContent,
                                               relationPatterns)
    return ProductInfo(pageLocation, productTitles, productSpecs, productRelations)
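# Usage sketch (not from the original code): running the combined extraction
# over a single page. "patterns" is assumed to be an object exposing the three
# getters used by getProdInfo above; the page path is a placeholder.
def exampleGetProdInfo(patterns):
    info = getProdInfo(patterns, "data/website1/page1/page.html")
    # ProductInfo bundles the page location with the extracted titles,
    # specs, and key-value relations
    return info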
def clusterPatternsTesting(patternsLocation, corpus):
    output = []
    clusterPatterns = readTripletPatterns(patternsLocation)
    for pattern in clusterPatterns:
        (lp, mp, rp) = pattern
        for page in corpus:
            pageLocation = page + "/page.html"
            plainHtmlContent = readPlainHtmlPageContent(pageLocation)
            clusters = doClusterExtraction(lp, mp, rp, plainHtmlContent)
            output.extend(clusters)
    return output
def entityPatternsTesting(patternsLocation, corpus):
    output = []
    entityPatterns = readEntityExtractionPatterns(patternsLocation)
    print("Entity patterns are:- ")
    print(entityPatterns)
    for pattern in entityPatterns:
        (lp, rp) = pattern
        for page in corpus:
            pageLocation = page + "/page.html"
            plainHtmlContent = readPlainHtmlPageContent(pageLocation)
            entities = doEntityExtractions(lp, rp, plainHtmlContent)
            output.extend(entities)
    return output
def relationPatternsTesting(patternsLocation, corpus):
    output = []
    relationPatterns = readTripletPatterns(patternsLocation)
    for pattern in relationPatterns:
        (lp, mp, rp) = pattern
        for page in corpus:
            pageLocation = page + "/page.html"
            plainHtmlContent = readPlainHtmlPageContent(pageLocation)
            relations = doRelationExtractions(lp, mp, rp, plainHtmlContent)
            relations = filteredRelations(relations)
            output.extend(relations)
    return output
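# Usage sketch (not from the original code): applying learned patterns to an
# unseen corpus. The pattern-file paths and the test-website directory are
# hypothetical placeholders; all three *PatternsTesting helpers share the same
# corpus format (one page.html per page directory).
def exampleTestingRun():
    corpus = getAllPagesInsideWebsite("data/testWebsite")
    clusters = clusterPatternsTesting("patterns/cluster.txt", corpus)
    entities = entityPatternsTesting("patterns/entity.txt", corpus)
    relations = relationPatternsTesting("patterns/relation.txt", corpus)
    return clusters, entities, relations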
def clusterPatternFiltering(patterns, websiteLocation, supervisedFileLocation,
                            artificialClusters, preprocessType="None"):
    stats = {}
    patternIndex = 0
    pages = getAllPagesInsideWebsite(websiteLocation)
    for pattern in patterns:
        resultsPerPattern = []
        patternIndex += 1
        for page in pages:
            (lp, mp, rp) = pattern
            exactPageLocation = page + "/page.html"
            # gold cluster elements for this page, whitespace-normalized
            contentList = readFileContentInList(page + "/" + supervisedFileLocation)
            contentList = [" ".join(item.split()) for item in contentList]
            expected = contentList
            # artificial clusters are lists of element lists; flatten and normalize
            expectedArtificial = artificialClusters[page]
            expectedArtificial = [
                " ".join(item.split())
                for sublist in expectedArtificial
                for item in sublist
            ]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            clusters = getClusterInsideLeftRightPattern(pageContent, lp, mp, rp)
            computed = []
            for cluster in clusters:
                computed.extend(getElementsOfCluster(cluster, mp))
            resultsPerPattern.append((computed, expected, expectedArtificial))
        stats[patternIndex] = (pattern, resultsPerPattern)
    patterns = getPatternsFromStats(stats)
    return patterns
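# Usage sketch (not from the original code): the input shape expected by
# clusterPatternFiltering. artificialClusters maps a page directory to a list
# of clusters, each cluster being a list of element strings (this follows from
# the double flattening above); paths and patterns are placeholders.
def exampleClusterPatternFiltering():
    candidatePatterns = [("<ul>", "<li>", "</ul>")]
    artificial = {"data/website1/page1": [["4 GB RAM", "128 GB storage"]]}
    return clusterPatternFiltering(candidatePatterns, "data/website1",
                                   "clusters.txt", artificial)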
from utils import appendPreprocessType

websiteLocations = getWebsiteLocations(supervisedDataLocation)
print(websiteLocations)

# process each website independently
for websiteLocation in websiteLocations:
    pages = getAllPagesInsideWebsite(websiteLocation)
    corpusLevelRelationContext = []
    for page in pages:
        exactPageLocation = page + "/page.html"
        print("Exact page location:- ")
        print(exactPageLocation)
        # the supervised relations, i.e. the gold (key, value) pairs for this page
        supervisedRelationList = readFileRelationContentInList(
            page + "/" + supervisedFileName)
        allRelationContextsPerPage = []
        pageContent = readPlainHtmlPageContent(exactPageLocation)
        for supervisedRelation in supervisedRelationList:
            (key, value) = supervisedRelation
            contextsPerRelation = getAllContextsForKV(pageContent, key, value,
                                                      KEY_VALUE_AWAY_LIMIT)
            # skip relations for which no context could be extracted
            if len(contextsPerRelation) <= 0:
                continue
            print("Contexts per relation are:- ")
            print(contextsPerRelation)
            allRelationContextsPerPage.append(contextsPerRelation)
        corpusLevelRelationContext.extend(allRelationContextsPerPage)
    print("Multiple relation contexts are:- ")
    print(corpusLevelRelationContext)
    # design choice: relation contexts from all pages of a website are pooled
    # into one big corpus-level set