Example #1
0
def multipleRelationFiltering(patterns,
                              websiteLocation,
                              supervisedFileLocation,
                              artificialSeeds,
                              preprocessType="None"):
    """Evaluate (left, middle, right) relation patterns on a website's pages.

    For every pattern and every page returned by
    ``getAllPagesInsideWebsite(websiteLocation)`` three lists of normalised
    "key value" strings are collected:

    * computed   -- relations extracted from ``<page>/page.html`` by
      ``getAllRelations`` using the pattern,
    * expected   -- relations read from ``<page>/<supervisedFileLocation>``,
    * artificial -- the pairs stored in ``artificialSeeds[page]``.

    The per-pattern results are gathered in a dict keyed by the 1-based
    pattern index and handed to ``getPatternsFromStats``.

    Args:
        patterns: iterable of 3-tuples ``(lp, mp, rp)``.
        websiteLocation: passed to ``getAllPagesInsideWebsite``.
        supervisedFileLocation: file name appended to each page path.
        artificialSeeds: mapping from page to an iterable of (key, value)
            string pairs.
        preprocessType: accepted for signature compatibility; not used here.

    Returns:
        Whatever ``getPatternsFromStats`` returns for the collected stats, or
        ``patterns`` itself when ``patterns`` yields nothing.
    """

    def normalizePairs(pairs):
        # "key value" with surrounding whitespace stripped and every inner
        # whitespace run collapsed to a single space.
        return [
            " ".join((k.strip() + " " + v.strip()).split()) for (k, v) in pairs
        ]

    stats = {}
    patternIndex = 0
    for pattern in patterns:
        patternIndex += 1
        (lp, mp, rp) = pattern
        pages = getAllPagesInsideWebsite(websiteLocation)
        resultsPerPattern = []
        for page in pages:
            # Looked up first so a page missing from artificialSeeds fails
            # before any file is read.
            artificialSeedsPerPage = artificialSeeds[page]
            expected = normalizePairs(
                readFileRelationContentInList(page + "/" +
                                              supervisedFileLocation))
            pageContent = readPlainHtmlPageContent(page + "/page.html")
            computed = normalizePairs(
                getAllRelations(lp, mp, rp, pageContent))
            artificialPerPage = normalizePairs(artificialSeedsPerPage)
            resultsPerPattern.append((computed, expected, artificialPerPage))
        # A pattern is recorded only when at least one page was processed,
        # which is what the previous per-page assignment amounted to.
        if resultsPerPattern:
            stats[patternIndex] = (pattern, resultsPerPattern)

    # No pattern was iterated: hand the input back untouched.
    if patternIndex == 0:
        return patterns
    # Finalise once, after all patterns were scored, instead of recomputing
    # on partial stats inside the loop.
    # NOTE(review): assumes getPatternsFromStats has no side effects -- confirm.
    return getPatternsFromStats(stats)
Example #2
0
def singleObjectPatternFiltering(patterns,
                                 websiteLocation,
                                 supervisedFileLocation,
                                 artificialSeedSet,
                                 threshold=1000,
                                 preprocessType="None"):
    """Evaluate (left, right) single-object patterns on a website's pages.

    For each pattern and each page the function records a triple of
    (extractions from ``<page>/page.html``, one-element list holding the
    whitespace-normalised gold value, ``artificialSeedSet[page]``). The gold
    value is the only entry of ``<page>/<supervisedFileLocation>`` when that
    file yields exactly one entry, otherwise the empty string.

    The triples are grouped per pattern under a 1-based index and passed to
    ``getPatternsFromStats``, whose result is returned.

    ``preprocessType`` is accepted but not used by this function.
    """
    statsByIndex = {}
    for index, candidate in enumerate(patterns, start=1):
        perPage = []
        for pageDir in getAllPagesInsideWebsite(websiteLocation):
            leftPattern, rightPattern = candidate
            goldEntries = readFileContentInList(pageDir + "/" +
                                                supervisedFileLocation)
            # Anything other than exactly one entry counts as "no gold value".
            rawGold = goldEntries[0] if len(goldEntries) == 1 else ""
            normalisedGold = " ".join(rawGold.split())
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            extracted = makeSingleObjectExtractions(html, leftPattern,
                                                    rightPattern, threshold)
            perPage.append(
                (extracted, [normalisedGold], artificialSeedSet[pageDir]))
        statsByIndex[index] = (candidate, perPage)
    return getPatternsFromStats(statsByIndex)
Example #3
0
def singleObjectPatternFiltering(patterns,
                                 websiteLocation,
                                 supervisedFileLocation,
                                 preprocessType="None"):
    """Keep the (left, right) patterns that recover the gold value somewhere.

    A pattern is kept when, on at least one page of the website, the
    whitespace-normalised gold value is contained in the result of
    ``makeSingleObjectExtractions`` for ``<page>/page.html``. The gold value
    is the single entry of ``<page>/<supervisedFileLocation>``; when that
    file does not yield exactly one entry the empty string is used.

    When ``preprocessType`` equals ``"NUM"`` the page content is first passed
    through ``replaceNumWordsInStr``.

    Returns a list of ``(lp, rp)`` tuples in input order.
    """
    kept = []
    for lp, rp in patterns:
        hits = 0
        for pageDir in getAllPagesInsideWebsite(websiteLocation):
            goldEntries = readFileContentInList(pageDir + "/" +
                                                supervisedFileLocation)
            rawGold = goldEntries[0] if len(goldEntries) == 1 else ""
            normalisedGold = " ".join(rawGold.split())
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            if preprocessType == "NUM":
                html = replaceNumWordsInStr(html)
            # Every page is visited; the count only has to end up non-zero.
            if normalisedGold in makeSingleObjectExtractions(html, lp, rp):
                hits += 1
        if hits:
            kept.append((lp, rp))
    return kept
Example #4
0
def getProdInfo(patterns, pageLocation):
    """Build a ProductInfo for the page stored at ``pageLocation``.

    The page is read once; both the raw content and the version produced by
    ``replaceNumWordsInStr`` are given to the title, specs and relation
    extractors together with the matching pattern set obtained from
    ``patterns``.
    """
    rawHtml = readPlainHtmlPageContent(pageLocation)
    numHtml = replaceNumWordsInStr(rawHtml)

    # Fetch all pattern sets up front, before any extraction runs.
    forTitles = patterns.getProductTitlePatterns()
    forSpecs = patterns.getProductSpecsPatterns()
    forRelations = patterns.getProductRelationPatterns()

    titles = extractProductTitles(rawHtml, numHtml, forTitles)
    specs = extractProductSpecs(rawHtml, numHtml, forSpecs)
    relations = extractProductRelations(rawHtml, numHtml, forRelations)
    return ProductInfo(pageLocation, titles, specs, relations)
def clusterPatternsTesting(patternsLocation, corpus):
    """Apply every triplet pattern from ``patternsLocation`` to every page.

    Patterns are loaded with ``readTripletPatterns``; for each pattern and
    each page in ``corpus`` the content of ``<page>/page.html`` is passed to
    ``doClusterExtraction``. All extracted clusters are returned in one flat
    list, ordered by pattern and then by page.
    """
    collected = []
    for left, middle, right in readTripletPatterns(patternsLocation):
        for pageDir in corpus:
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            collected.extend(doClusterExtraction(left, middle, right, html))
    return collected
def entityPatternsTesting(patternsLocation, corpus):
    """Apply every entity pattern from ``patternsLocation`` to every page.

    Patterns are loaded with ``readEntityExtractionPatterns`` and printed.
    For each (left, right) pattern and each page in ``corpus`` the content of
    ``<page>/page.html`` is passed to ``doEntityExtractions``. All extracted
    entities are returned in one flat list, ordered by pattern and then by
    page.
    """
    loadedPatterns = readEntityExtractionPatterns(patternsLocation)
    print("Entity patterns are:- ")
    print(loadedPatterns)

    collected = []
    for left, right in loadedPatterns:
        for pageDir in corpus:
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            collected.extend(doEntityExtractions(left, right, html))
    return collected
def relationPatternsTesting(patternsLocation, corpus):
    """Apply every triplet pattern from ``patternsLocation`` to every page.

    Patterns are loaded with ``readTripletPatterns``; for each pattern and
    each page in ``corpus`` the content of ``<page>/page.html`` is passed to
    ``doRelationExtractions`` and the result is run through
    ``filteredRelations``. All surviving relations are returned in one flat
    list, ordered by pattern and then by page.
    """
    collected = []
    for left, middle, right in readTripletPatterns(patternsLocation):
        for pageDir in corpus:
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            rawRelations = doRelationExtractions(left, middle, right, html)
            collected.extend(filteredRelations(rawRelations))
    return collected
Example #8
0
def clusterPatternFiltering(patterns,
                            websiteLocation,
                            supervisedFileLocation,
                            artificialClusters,
                            preprocessType="None"):
    """Evaluate (left, middle, right) cluster patterns on a website's pages.

    For each pattern and each page a triple is recorded:

    * computed -- elements of every cluster that
      ``getClusterInsideLeftRightPattern`` finds in ``<page>/page.html``,
      split up by ``getElementsOfCluster`` with the middle pattern,
    * expected -- entries of ``<page>/<supervisedFileLocation>`` with
      whitespace runs collapsed,
    * expected artificial -- ``artificialClusters[page]`` flattened by one
      level, each item with whitespace runs collapsed.

    The triples are grouped per pattern under a 1-based index and passed to
    ``getPatternsFromStats``, whose result is returned.

    ``preprocessType`` is accepted but not used by this function.
    """
    statsByIndex = {}
    for index, candidate in enumerate(patterns, start=1):
        perPage = []
        for pageDir in getAllPagesInsideWebsite(websiteLocation):
            lp, mp, rp = candidate
            goldEntries = readFileContentInList(pageDir + "/" +
                                                supervisedFileLocation)
            expected = [" ".join(entry.split()) for entry in goldEntries]
            expectedArtificial = [
                " ".join(member.split())
                for group in artificialClusters[pageDir]
                for member in group
            ]
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            foundClusters = getClusterInsideLeftRightPattern(html, lp, mp, rp)
            computed = [
                element
                for cluster in foundClusters
                for element in getElementsOfCluster(cluster, mp)
            ]
            perPage.append((computed, expected, expectedArtificial))
        statsByIndex[index] = (candidate, perPage)
    return getPatternsFromStats(statsByIndex)
# Script section: for every website, collect the contexts in which each
# supervised (key, value) relation occurs on its pages.
# NOTE(review): supervisedDataLocation, supervisedFileName and
# KEY_VALUE_AWAY_LIMIT are not defined in this part of the file -- they are
# expected to exist at module level.
websiteLocations = getWebsiteLocations(supervisedDataLocation)
print(websiteLocations)
from utils import appendPreprocessType
# Work on each website independently.
for websiteLocation in websiteLocations:
    pages = getAllPagesInsideWebsite(websiteLocation)
    # Context lists gathered from all pages of the current website.
    corpusLevelRelationContext = []
    for page in pages:
        exactPageLocation = page + "/page.html"
        print("Exact page location:- ")
        print(exactPageLocation)
        # Supervised relations for this page (the relation supervision),
        # unpacked below as (key, value) pairs.
        supervisedRelationList = readFileRelationContentInList(
            page + "/" + supervisedFileName)
        # One entry per supervised relation that has at least one context.
        allRelationContextsPerPage = []
        pageContent = readPlainHtmlPageContent(exactPageLocation)
        for supervisedRelation in supervisedRelationList:
            (key, value) = supervisedRelation
            contextsPerRelation = getAllContextsForKV(pageContent, key, value,
                                                      KEY_VALUE_AWAY_LIMIT)
            # Skip relations for which no context was found on this page.
            if len(contextsPerRelation) <= 0:
                continue
            print("Contexts per relation are:- ")
            print(contextsPerRelation)
            allRelationContextsPerPage.append(contextsPerRelation)
        # Merge this page's context lists into the website-level collection.
        corpusLevelRelationContext.extend(allRelationContextsPerPage)
    print("Multiple relation contexts are:- ")
    print(
        "This is design choice to keep tables from all the pages belong to big table set"
    )