Beispiel #1
0
def _normalizeRelationPairs(pairs):
    """Turn (key, value) pairs into whitespace-normalised "key value" strings."""
    return [
        " ".join((key.strip() + " " + value.strip()).split())
        for (key, value) in pairs
    ]


def multipleRelationFiltering(patterns,
                              websiteLocation,
                              supervisedFileLocation,
                              artificialSeeds,
                              preprocessType="None"):
    """Collect per-page extraction results for relation patterns and filter them.

    For every ``(lp, mp, rp)`` pattern, each page of the website is visited and
    three lists of normalised "key value" strings are recorded for it:
    the relations extracted with the pattern, the supervised relations read
    from ``page + "/" + supervisedFileLocation``, and the artificial seeds
    registered for that page.

    Args:
        patterns: iterable of ``(lp, mp, rp)`` triples.
        websiteLocation: passed to ``getAllPagesInsideWebsite``.
        supervisedFileLocation: file name of the supervised relations,
            relative to each page directory.
        artificialSeeds: mapping from page to an iterable of (key, value)
            pairs; a page missing from it raises ``KeyError``.
        preprocessType: accepted for interface compatibility; not used here.

    Returns:
        The result of ``getPatternsFromStats`` over the collected stats, or
        ``patterns`` itself when no pattern was supplied.
    """
    stats = {}
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, mp, rp) = pattern
        resultsPerPattern = []
        for page in getAllPagesInsideWebsite(websiteLocation):
            artificialSeedsPerPage = artificialSeeds[page]
            contentList = readFileRelationContentInList(
                page + "/" + supervisedFileLocation)
            expected = _normalizeRelationPairs(contentList)
            pageContent = readPlainHtmlPageContent(page + "/page.html")
            computed = _normalizeRelationPairs(
                getAllRelations(lp, mp, rp, pageContent))
            artificialPerPage = _normalizeRelationPairs(artificialSeedsPerPage)
            resultsPerPattern.append((computed, expected, artificialPerPage))
        # Record once per pattern, after all of its pages have been processed.
        stats[patternIndex] = (pattern, resultsPerPattern)
    if not stats:
        # Nothing was scored; hand the (empty) input back unchanged.
        return patterns
    # Score once over the complete stats instead of once per pattern.
    return getPatternsFromStats(stats)
Beispiel #2
0
def singleObjectPatternFiltering(patterns,
                                 websiteLocation,
                                 supervisedFileLocation,
                                 artificialSeedSet,
                                 threshold=1000,
                                 preprocessType="None"):
    """Gather per-page results for single-object patterns and filter them.

    Each ``(lp, rp)`` pattern is applied to every page of the website. Per
    page a ``(computed, expected, artificial)`` triple is recorded, where
    ``expected`` is a one-element list holding the whitespace-normalised gold
    value (an empty string unless the supervised file yields exactly one
    entry) and ``artificial`` is ``artificialSeedSet[page]``.

    ``preprocessType`` is accepted but not used by this function.

    Returns whatever ``getPatternsFromStats`` produces for the stats, which
    are keyed by the 1-based position of each pattern.
    """
    statsByPosition = {}
    for position, candidate in enumerate(patterns, start=1):
        perPageResults = []
        for pageDir in getAllPagesInsideWebsite(websiteLocation):
            leftPattern, rightPattern = candidate
            goldLines = readFileContentInList(pageDir + "/" +
                                              supervisedFileLocation)
            # Only an unambiguous single entry counts as the gold value.
            rawGold = goldLines[0] if len(goldLines) == 1 else ""
            normalizedGold = " ".join(rawGold.split())
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            extracted = makeSingleObjectExtractions(html, leftPattern,
                                                    rightPattern, threshold)
            perPageResults.append(
                (extracted, [normalizedGold], artificialSeedSet[pageDir]))
        statsByPosition[position] = (candidate, perPageResults)
    return getPatternsFromStats(statsByPosition)
Beispiel #3
0
def singleObjectPatternFiltering(patterns,
                                 websiteLocation,
                                 supervisedFileLocation,
                                 preprocessType="None"):
    """Keep the ``(lp, rp)`` patterns that recover the gold value on some page.

    For every pattern each page of the website is read; when
    ``preprocessType`` is ``"NUM"`` the page content first goes through
    ``replaceNumWordsInStr``. A page counts as a hit when the
    whitespace-normalised gold value (an empty string unless the supervised
    file yields exactly one entry) is contained in the extractions made with
    the pattern.

    Returns a list of ``(lp, rp)`` tuples for the patterns with at least one
    hit, in input order.
    """
    survivors = []
    for candidate in patterns:
        leftPattern, rightPattern = candidate
        hits = 0
        for pageDir in getAllPagesInsideWebsite(websiteLocation):
            goldLines = readFileContentInList(pageDir + "/" +
                                              supervisedFileLocation)
            rawGold = goldLines[0] if len(goldLines) == 1 else ""
            normalizedGold = " ".join(rawGold.split())
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            if preprocessType == "NUM":
                html = replaceNumWordsInStr(html)
            extractions = makeSingleObjectExtractions(html, leftPattern,
                                                      rightPattern)
            if normalizedGold in extractions:
                hits += 1
        if hits > 0:
            survivors.append((leftPattern, rightPattern))
    return survivors
def doClusterExtractionTesting():
    """Run the spec-cluster patterns over the website and write the result.

    Reads the module-level ``websiteLocation``, passes its pages together
    with the patterns file to ``clusterPatternsTesting``, joins the returned
    strings with blank lines and writes them to ``./specsTestingAmazon``.
    """
    patternsLocation = "./supervisedData/amazon/specsPatterns.tsv"
    outputLocation = "./specsTestingAmazon"
    pages = getAllPagesInsideWebsite(websiteLocation)
    clusters = clusterPatternsTesting(patternsLocation, pages)
    clusters = "\n\n".join(clusters)
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(outputLocation, "w") as f:
        f.write(clusters)
    print("Output written at location:- " + outputLocation)
def doRelationExtractionTesting():
    """Run the table-relation patterns over the website and write the result.

    Reads the module-level ``websiteLocation``, passes its pages together
    with the patterns file to ``relationPatternsTesting``, joins the returned
    strings with newlines and writes them to ``./relationTestingAmazon``.
    """
    patternsLocation = "./supervisedData/amazon/tablePatterns.tsv"
    outputLocation = "./relationTestingAmazon"
    pages = getAllPagesInsideWebsite(websiteLocation)
    relations = relationPatternsTesting(patternsLocation, pages)
    relations = "\n".join(relations)
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(outputLocation, "w") as f:
        f.write(relations)
    print("Output written at location:- " + outputLocation)
def doEntityExtractionTesting():
    """Run the title-entity patterns over the website and write the result.

    Reads the module-level ``websiteLocation``, passes its pages together
    with the patterns file to ``entityPatternsTesting``, joins the returned
    strings with newlines and writes them to ``./entityTestingAmazon``.
    """
    patternsLocation = "./supervisedData/amazon/titlePatterns.tsv"
    outputLocation = "./entityTestingAmazon"
    pages = getAllPagesInsideWebsite(websiteLocation)
    entities = entityPatternsTesting(patternsLocation, pages)
    entities = "\n".join(entities)
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(outputLocation, "w") as f:
        f.write(entities)
    print("Output written at location:- " + outputLocation)
Beispiel #7
0
def clusterPatternFiltering(patterns,
                            websiteLocation,
                            supervisedFileLocation,
                            artificialClusters,
                            preprocessType="None"):
    """Gather per-page results for cluster patterns and filter them.

    Each ``(lp, mp, rp)`` pattern is applied to every page of the website.
    Per page a ``(computed, expected, expectedArtificial)`` triple is
    recorded:

    * ``computed`` - the elements of every cluster found in the page with the
      pattern, concatenated in cluster order;
    * ``expected`` - the whitespace-normalised entries of the supervised
      file;
    * ``expectedArtificial`` - the whitespace-normalised items of
      ``artificialClusters[page]``, flattened one level.

    ``preprocessType`` is accepted but not used by this function.

    Returns whatever ``getPatternsFromStats`` produces for the stats, which
    are keyed by the 1-based position of each pattern.
    """
    statsByPosition = {}
    for position, candidate in enumerate(patterns, start=1):
        perPageResults = []
        for pageDir in getAllPagesInsideWebsite(websiteLocation):
            leftPattern, middlePattern, rightPattern = candidate
            goldLines = readFileContentInList(pageDir + "/" +
                                              supervisedFileLocation)
            goldItems = [" ".join(line.split()) for line in goldLines]
            artificialItems = [
                " ".join(member.split())
                for group in artificialClusters[pageDir]
                for member in group
            ]
            html = readPlainHtmlPageContent(pageDir + "/page.html")
            foundClusters = getClusterInsideLeftRightPattern(
                html, leftPattern, middlePattern, rightPattern)
            extracted = [
                element
                for foundCluster in foundClusters
                for element in getElementsOfCluster(foundCluster,
                                                    middlePattern)
            ]
            perPageResults.append((extracted, goldItems, artificialItems))
        statsByPosition[position] = (candidate, perPageResults)
    return getPatternsFromStats(statsByPosition)
# NOTE(review): presumably the file the learned table patterns are written to;
# it is not referenced in the visible part of this script - confirm.
patternsOutputLocation = "tablePatterns.tsv"
# Passed as the last argument to getAllContextsForKV below.
# NOTE(review): presumably the maximum distance allowed between a key and its
# value - confirm the unit against getAllContextsForKV.
KEY_VALUE_AWAY_LIMIT = 100

# Essential imports.
from utils import writeTripletPatternsAsCsv, getAllContextsForKV
from FileUtil import getWebsiteLocations, getAllPagesInsideWebsite, readPlainHtmlPageContent
from FileUtil import readFileRelationContentInList
from RelationPatternsLearningUtil import learnPatterns
from utils import processNumInContext
# Collect the location of every website so that patterns can be extracted for
# each of them.
# NOTE(review): supervisedDataLocation is not defined in the visible part of
# this script.
websiteLocations = getWebsiteLocations(supervisedDataLocation)
print(websiteLocations)
from utils import appendPreprocessType
# Each website is processed independently.
for websiteLocation in websiteLocations:
    pages = getAllPagesInsideWebsite(websiteLocation)
    corpusLevelRelationContext = []
    for page in pages:
        exactPageLocation = page + "/page.html"
        print("Exact page location:- ")
        print(exactPageLocation)
        #also can be called as supervision of relations
        supervisedRelationList = readFileRelationContentInList(
            page + "/" + supervisedFileName)
        allRelationContextsPerPage = []
        pageContent = readPlainHtmlPageContent(exactPageLocation)
        for supervisedRelation in supervisedRelationList:
            (key, value) = supervisedRelation
            contextsPerRelation = getAllContextsForKV(pageContent, key, value,
                                                      KEY_VALUE_AWAY_LIMIT)
            #if condition to ignore the number of contexts.