Example #1
def singleObjectPatternFiltering(patterns,
                                 websiteLocation,
                                 supervisedFileLocation,
                                 preprocessType="None"):
    # Keep every (leftPattern, rightPattern) pair that reproduces the gold
    # single-object value on at least one supervised page of the website.
    output = []
    pages = getAllPagesInsideWebsite(websiteLocation)
    for pattern in patterns:
        (lp, rp) = pattern
        patternScore = 0
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" +
                                                supervisedFileLocation)
            singleObj = ""
            if len(contentList) == 1:
                singleObj = contentList[0]
            # Collapse runs of whitespace before comparing.
            goldContent = " ".join(singleObj.split())
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            if preprocessType == "NUM":
                pageContent = replaceNumWordsInStr(pageContent)
            results = makeSingleObjectExtractions(pageContent, lp, rp)
            if goldContent in results:
                patternScore += 1
        if patternScore > 0:
            output.append((lp, rp))
    return output
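A minimal usage sketch, assuming the helper functions above are importable and each supervised page directory contains page.html plus the supervised file; the pattern pairs, directory path and file name below are illustrative placeholders, not values from the original project.

# Hypothetical inputs for illustration only.
candidatePatterns = [("<h1>", "</h1>"), ("<title>", "</title>")]
keptPatterns = singleObjectPatternFiltering(candidatePatterns,
                                            "data/website1",
                                            "singleObject.txt",
                                            preprocessType="NUM")
print(keptPatterns)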
Example #2
def singleObjectPatternFiltering(patterns,
                                 websiteLocation,
                                 supervisedFileLocation,
                                 artificialSeedSet,
                                 threshold=1000,
                                 preprocessType="None"):
    # For every pattern, record the extractions computed on each supervised
    # page next to the expected (gold) value and the artificial seeds for
    # that page, then let getPatternsFromStats decide which patterns to keep.
    stats = {}
    pages = getAllPagesInsideWebsite(websiteLocation)
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, rp) = pattern
        resultsPerPattern = []
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" +
                                                supervisedFileLocation)
            singleObj = ""
            if len(contentList) == 1:
                singleObj = contentList[0]
            # Collapse runs of whitespace before comparing.
            goldContent = " ".join(singleObj.split())
            expected = [goldContent]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            computed = makeSingleObjectExtractions(pageContent, lp, rp,
                                                   threshold)
            resultsPerPattern.append(
                (computed, expected, artificialSeedSet[page]))
        stats[patternIndex] = (pattern, resultsPerPattern)
    return getPatternsFromStats(stats)
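A hedged usage sketch for this statistics-based variant; the website path, file name, candidate pattern and the per-page artificial seed lists are illustrative assumptions rather than values from the original project.

# Hypothetical inputs for illustration only; artificialSeeds maps each page
# directory to a list of seed extractions, matching the artificialSeedSet[page]
# lookup above.
sitePages = getAllPagesInsideWebsite("data/website1")
artificialSeeds = {page: [] for page in sitePages}
keptPatterns = singleObjectPatternFiltering([("<h1>", "</h1>")],
                                            "data/website1",
                                            "singleObject.txt",
                                            artificialSeeds,
                                            threshold=1000)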
Example #3
def clusterPatternFiltering(patterns,
                            websiteLocation,
                            supervisedFileLocation,
                            artificialClusters,
                            preprocessType="None"):
    # For every (lp, mp, rp) triplet pattern, compare the cluster elements it
    # extracts on each supervised page against the gold cluster elements and
    # the artificial cluster seeds, then let getPatternsFromStats pick the
    # patterns to keep.
    stats = {}
    pages = getAllPagesInsideWebsite(websiteLocation)
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, mp, rp) = pattern
        resultsPerPattern = []
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" +
                                                supervisedFileLocation)
            # Collapse runs of whitespace in the gold cluster elements.
            expected = [" ".join(item.split()) for item in contentList]
            # Flatten the per-page artificial clusters into a single list.
            expectedArtificial = [
                " ".join(item.split())
                for sublist in artificialClusters[page]
                for item in sublist
            ]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            clusters = getClusterInsideLeftRightPattern(
                pageContent, lp, mp, rp)
            computed = []
            for cluster in clusters:
                computed.extend(getElementsOfCluster(cluster, mp))
            resultsPerPattern.append((computed, expected, expectedArtificial))
        stats[patternIndex] = (pattern, resultsPerPattern)
    return getPatternsFromStats(stats)
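A hedged usage sketch; the triplet pattern, paths, and the per-page artificial clusters (each value a list of clusters, each cluster a list of strings) are illustrative assumptions, not values from the original project.

# Hypothetical inputs for illustration only.
tripletPatterns = [("<ul>", "<li>", "</ul>")]
artificialClustersByPage = {
    page: [] for page in getAllPagesInsideWebsite("data/website1")
}
keptTriplets = clusterPatternFiltering(tripletPatterns,
                                       "data/website1",
                                       "clusterElements.txt",
                                       artificialClustersByPage)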
Example #4
from FileUtil import getAllPagesInsideWebsite, readPlainHtmlPageContent
from FileUtil import readFileContentInList
from utils import writeTripletPatternsAsCsv
from ClusterContextExtraction import getClusterContexts
from RegExpPatternFilteringModule import clusterPatternFiltering
from utils import appendPreprocessType, processNumInContext
from SeedExpansionModule import addArtificialClusterSeeds

# getWebsiteLocations, supervisedDataLocation and supervisedFileName are
# assumed to be defined or imported earlier in the full script (not shown
# in this excerpt).
websiteLocations = getWebsiteLocations(supervisedDataLocation)
print(websiteLocations)
for websiteLocation in websiteLocations:
    pages = getAllPagesInsideWebsite(websiteLocation)
    contexts = []
    artificialClusters = {}
    for page in pages:
        print(page)
        print(websiteLocation)
        exactPageLocation = page + "/page.html"
        clusterElements = readFileContentInList(page + "/" +
                                                supervisedFileName)
        pageContent = readPlainHtmlPageContent(exactPageLocation)
        contextsPerPage = getClusterContexts(pageContent, clusterElements)
        contexts.append(contextsPerPage)
        artificialClusterPerPage = addArtificialClusterSeeds(
            pageContent, clusterElements)
        artificialClusters[page] = artificialClusterPerPage
        realCluster = "\n".join(clusterElements)
        # print("Real Cluster is ")
        # print(realCluster)
        # artificialCluster = ["\n".join(cluster) for cluster in artificialClusterPerPage]
        # artificialCluster = "\n\n\n".join(artificialCluster)
        # print("Artificial Cluster is ")
        # print(artificialCluster)
        # print("Real cluster")
        # realCluster = "\n".join(clusterElements)
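    # --- Hedged continuation sketch, not part of the original excerpt ---
    # The excerpt only collects `contexts` and `artificialClusters`; a
    # plausible next step inside this per-website loop would filter triplet
    # patterns with the imported clusterPatternFiltering. `tripletPatterns`
    # is a placeholder for whatever pattern-learning step the full script
    # applies to `contexts`.
    tripletPatterns = []
    filteredTriplets = clusterPatternFiltering(tripletPatterns,
                                               websiteLocation,
                                               supervisedFileName,
                                               artificialClusters)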
Example #5
from FileUtil import getAllPagesInsideWebsite, readPlainHtmlPageContent
from FileUtil import readFileContentInList
from utils import getSingleObjectContexts, writePairPatternsAsCsv
from SingleObjectPatternsLearningUtil import learnPatterns
from utils import appendPreprocessType
from utils import processNumInContext

# getWebsiteLocations, supervisedDataLocation and supervisedFileName are
# assumed to be defined or imported earlier in the full script (not shown
# in this excerpt).
websiteLocations = getWebsiteLocations(supervisedDataLocation)
print(websiteLocations)
for websiteLocation in websiteLocations:
    pages = getAllPagesInsideWebsite(websiteLocation)
    singleObjectContexts = []
    singleObjList = []
    for page in pages:
        exactPageLocation = page + "/page.html"
        contentList = readFileContentInList(page + "/" + supervisedFileName)
        singleObj = ""
        if len(contentList) == 1:
            singleObj = contentList[0]
        # Collapse multiple spaces in the product title.
        singleObj = " ".join(singleObj.split())
        pageContent = readPlainHtmlPageContent(exactPageLocation)
        contextsPerPage = getSingleObjectContexts(pageContent, singleObj)
        if len(contextsPerPage) > 0:
            singleObjectContexts.append(contextsPerPage)
        singleObjList.append(singleObj)
    print("Single object contexts are:")
    print(singleObjectContexts)
    patterns = learnPatterns(singleObjectContexts)
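    # --- Hedged continuation sketch, not part of the original excerpt ---
    # A natural next step is to keep only the learned (lp, rp) pairs that
    # reproduce the gold value on the supervised pages, using the
    # singleObjectPatternFiltering routine from Example #1.
    keptPatterns = singleObjectPatternFiltering(patterns,
                                                websiteLocation,
                                                supervisedFileName)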