def multipleRelationFiltering(patterns, websiteLocation, supervisedFileLocation, artificialSeeds, preprocessType="None"):
    """Score (left, middle, right) relation patterns against supervised data.

    For every pattern, extracts key/value relations from each page under
    websiteLocation and records a (computed, expected, artificial) triple per
    page; the accumulated per-pattern stats are then reduced by
    getPatternsFromStats.

    Args:
        patterns: iterable of (lp, mp, rp) pattern triples.
        websiteLocation: root directory containing one sub-directory per page.
        supervisedFileLocation: file name (relative to each page directory)
            holding the gold key/value pairs.
        artificialSeeds: dict mapping page directory -> list of (key, value)
            seed pairs for that page.
        preprocessType: unused here; kept for interface compatibility with the
            sibling *Filtering functions.

    Returns:
        The filtered pattern list produced by getPatternsFromStats(stats).
    """
    def _normalize_pairs(pairs):
        # "key value" with every run of whitespace collapsed to a single space.
        return [" ".join((k.strip() + " " + v.strip()).split()) for (k, v) in pairs]

    stats = {}
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, mp, rp) = pattern  # invariant per pattern; hoisted out of the page loop
        pages = getAllPagesInsideWebsite(websiteLocation)
        resultsPerPattern = []
        for page in pages:
            exactPageLocation = page + "/page.html"
            expected = _normalize_pairs(
                readFileRelationContentInList(page + "/" + supervisedFileLocation))
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            computed = _normalize_pairs(getAllRelations(lp, mp, rp, pageContent))
            # Original also bound artificialSeeds[page] to an unused local
            # (artificialSeedsPerPage); dropped.
            artificialPerPage = _normalize_pairs(artificialSeeds[page])
            resultsPerPattern.append((computed, expected, artificialPerPage))
        stats[patternIndex] = (pattern, resultsPerPattern)
    return getPatternsFromStats(stats)
def singleObjectPatternFiltering(patterns, websiteLocation, supervisedFileLocation, artificialSeedSet, threshold=1000, preprocessType="None"):
    """Score (left, right) single-object patterns against supervised data.

    NOTE(review): a later definition in this file reuses this function name
    with a different signature; at import time that later definition wins.

    Args:
        patterns: iterable of (lp, rp) pattern pairs.
        websiteLocation: root directory containing one sub-directory per page.
        supervisedFileLocation: file name (relative to each page directory)
            holding the gold content; a page's gold object is taken only when
            the file contains exactly one entry.
        artificialSeedSet: dict mapping page directory -> artificial seeds,
            passed through unchanged into each per-page result triple.
        threshold: forwarded to makeSingleObjectExtractions.
        preprocessType: unused here; kept for interface compatibility with the
            sibling *Filtering functions.

    Returns:
        The filtered pattern list produced by getPatternsFromStats(stats).
    """
    stats = {}
    # Original declared an unused local `output = []`; removed.
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, rp) = pattern
        pages = getAllPagesInsideWebsite(websiteLocation)
        resultsPerPattern = []
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" + supervisedFileLocation)
            # Gold object only when the supervision file has exactly one entry;
            # otherwise the expected value degrades to the empty string.
            singleObj = contentList[0] if len(contentList) == 1 else ""
            goldContent = " ".join(singleObj.split())
            expected = [goldContent]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            computed = makeSingleObjectExtractions(pageContent, lp, rp, threshold)
            resultsPerPattern.append((computed, expected, artificialSeedSet[page]))
        stats[patternIndex] = (pattern, resultsPerPattern)
    return getPatternsFromStats(stats)
def singleObjectPatternFiltering(patterns, websiteLocation, supervisedFileLocation, preprocessType="None"):
    """Keep the (lp, rp) patterns whose extractions hit the gold object on at least one page.

    NOTE(review): this redefines the singleObjectPatternFiltering declared
    earlier in the file with a different signature; this later definition is
    the one that survives module import.

    Args:
        patterns: iterable of (lp, rp) pattern pairs.
        websiteLocation: root directory containing one sub-directory per page.
        supervisedFileLocation: file name (relative to each page directory)
            holding the gold content.
        preprocessType: when "NUM", page HTML is passed through
            replaceNumWordsInStr before extraction.

    Returns:
        List of (lp, rp) pairs that matched the gold object on some page.
    """
    keptPatterns = []
    for (lp, rp) in patterns:
        pages = getAllPagesInsideWebsite(websiteLocation)
        matchCount = 0
        for page in pages:
            goldLines = readFileContentInList(page + "/" + supervisedFileLocation)
            # Gold object only when the supervision file has exactly one entry.
            gold = goldLines[0] if len(goldLines) == 1 else ""
            gold = " ".join(gold.split())
            html = readPlainHtmlPageContent(page + "/page.html")
            if preprocessType == "NUM":
                html = replaceNumWordsInStr(html)
            extractions = makeSingleObjectExtractions(html, lp, rp)
            if gold in extractions:
                matchCount += 1
        if matchCount > 0:
            keptPatterns.append((lp, rp))
    return keptPatterns
def doClusterExtractionTesting():
    """Apply the learned Amazon specs patterns to every page and dump the clusters.

    Reads patterns from patternsLocation, runs clusterPatternsTesting over all
    pages under the module-level websiteLocation, and writes the clusters
    (blank-line separated) to outputLocation.
    """
    global websiteLocation
    patternsLocation = "./supervisedData/amazon/specsPatterns.tsv"
    outputLocation = "./specsTestingAmazon"
    pages = getAllPagesInsideWebsite(websiteLocation)
    clusters = clusterPatternsTesting(patternsLocation, pages)
    # The with-statement closes the file; the original explicit f.close() was redundant.
    with open(outputLocation, "w") as f:
        f.write("\n\n".join(clusters))
    # outputLocation is already a str; the original str() wrapper was redundant.
    print("Output written at location:- " + outputLocation)
def doRelationExtractionTesting():
    """Apply the learned Amazon table patterns to every page and dump the relations.

    Reads patterns from patternsLocation, runs relationPatternsTesting over all
    pages under the module-level websiteLocation, and writes the relations
    (newline separated) to outputLocation.
    """
    global websiteLocation
    patternsLocation = "./supervisedData/amazon/tablePatterns.tsv"
    outputLocation = "./relationTestingAmazon"
    pages = getAllPagesInsideWebsite(websiteLocation)
    relations = relationPatternsTesting(patternsLocation, pages)
    # The with-statement closes the file; the original explicit f.close() was redundant.
    with open(outputLocation, "w") as f:
        f.write("\n".join(relations))
    # outputLocation is already a str; the original str() wrapper was redundant.
    print("Output written at location:- " + outputLocation)
def doEntityExtractionTesting():
    """Apply the learned Amazon title patterns to every page and dump the entities.

    Reads patterns from patternsLocation, runs entityPatternsTesting over all
    pages under the module-level websiteLocation, and writes the entities
    (newline separated) to outputLocation.
    """
    global websiteLocation
    patternsLocation = "./supervisedData/amazon/titlePatterns.tsv"
    outputLocation = "./entityTestingAmazon"
    pages = getAllPagesInsideWebsite(websiteLocation)
    entities = entityPatternsTesting(patternsLocation, pages)
    # The with-statement closes the file; the original explicit f.close() was redundant.
    with open(outputLocation, "w") as f:
        f.write("\n".join(entities))
    # outputLocation is already a str; the original str() wrapper was redundant.
    print("Output written at location:- " + outputLocation)
def clusterPatternFiltering(patterns, websiteLocation, supervisedFileLocation, artificialClusters, preprocessType="None"):
    """Score (left, middle, right) cluster patterns against supervised data.

    For every pattern, extracts cluster elements from each page under
    websiteLocation and records a (computed, expected, expectedArtificial)
    triple per page; the accumulated per-pattern stats are then reduced by
    getPatternsFromStats.

    Args:
        patterns: iterable of (lp, mp, rp) pattern triples.
        websiteLocation: root directory containing one sub-directory per page.
        supervisedFileLocation: file name (relative to each page directory)
            holding the gold cluster elements, one per entry.
        artificialClusters: dict mapping page directory -> list of clusters
            (each cluster a list of element strings).
        preprocessType: unused here; kept for interface compatibility with the
            sibling *Filtering functions.

    Returns:
        The filtered pattern list produced by getPatternsFromStats(stats).
    """
    stats = {}
    # Original declared an unused local `output = []` and carried several
    # commented-out debug prints; both removed.
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, mp, rp) = pattern
        pages = getAllPagesInsideWebsite(websiteLocation)
        resultsPerPattern = []
        for page in pages:
            exactPageLocation = page + "/page.html"
            expected = [
                " ".join(item.split())
                for item in readFileContentInList(page + "/" + supervisedFileLocation)
            ]
            # artificialClusters[page] is a list of clusters; flatten it and
            # normalize whitespace per element.
            expectedArtificial = [
                " ".join(item.split())
                for sublist in artificialClusters[page]
                for item in sublist
            ]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            clusters = getClusterInsideLeftRightPattern(pageContent, lp, mp, rp)
            computed = []
            for cluster in clusters:
                computed.extend(getElementsOfCluster(cluster, mp))
            resultsPerPattern.append((computed, expected, expectedArtificial))
        stats[patternIndex] = (pattern, resultsPerPattern)
    return getPatternsFromStats(stats)
patternsOutputLocation = "tablePatterns.tsv" KEY_VALUE_AWAY_LIMIT = 100 #essential imports from utils import writeTripletPatternsAsCsv, getAllContextsForKV from FileUtil import getWebsiteLocations, getAllPagesInsideWebsite, readPlainHtmlPageContent from FileUtil import readFileRelationContentInList from RelationPatternsLearningUtil import learnPatterns from utils import processNumInContext #get all the locations for website so that we can start extracting the patterns for them. websiteLocations = getWebsiteLocations(supervisedDataLocation) print(websiteLocations) from utils import appendPreprocessType #work for each website independently for websiteLocation in websiteLocations: pages = getAllPagesInsideWebsite(websiteLocation) corpusLevelRelationContext = [] for page in pages: exactPageLocation = page + "/page.html" print("Exact page location:- ") print(exactPageLocation) #also can be called as supervision of relations supervisedRelationList = readFileRelationContentInList( page + "/" + supervisedFileName) allRelationContextsPerPage = [] pageContent = readPlainHtmlPageContent(exactPageLocation) for supervisedRelation in supervisedRelationList: (key, value) = supervisedRelation contextsPerRelation = getAllContextsForKV(pageContent, key, value, KEY_VALUE_AWAY_LIMIT) #if condition to ignore the number of contexts.