Example #1
0
def test_content_only_from_obj():
    """Check include/exclude keyword filtering against the generated test page.

    The page comes from ``dateFinder.buildTestHtml()``. ``captureContent`` is
    first exercised directly on the parsed soup, then ``capture_main`` is run
    on the raw page and the size of each returned collection is checked.
    """
    capturer = CaptureContent()
    name = dateFinder.buildTestHtml()
    soup = BeautifulSoup(name)
    min_length = 10

    def count_hits(include, exclude, andOr):
        # Number of content items captured for the given keyword filters.
        captured = capturer.captureContent(
            soup, include, exclude, minLength=min_length, andOr=andOr
        )
        return len(captured)

    nothing_excluded = []

    # One element mentions both 'dogma' and 'karma'; three mention at least one.
    both_terms = ["dogma", "karma"]
    assert count_hits(both_terms, nothing_excluded, "and") == 1
    assert count_hits(both_terms, nothing_excluded, "or") == 3

    # Three elements mention 'dogma'; only one is left once 'fang' is excluded.
    dogma_only = ["dogma"]
    assert count_hits(dogma_only, nothing_excluded, "or") == 3
    without_fang = ["fang"]
    assert count_hits(dogma_only, without_fang, "or") == 1

    # The main entry point must report the same filtered content plus extras.
    results = capturer.capture_main(
        name=name,
        streamType="o",
        include=dogma_only,
        exclude=without_fang,
        minLength=min_length,
        andOr="or",
    )
    expected_sizes = {
        "polishedCont": 1,
        "contentAsSoupObjects": 1,
        "metaData": 2,
        "links": 2,
    }
    for key, size in expected_sizes.items():
        assert len(results[key]) == size
Example #2
0
def test_main_with_obj():
    """Smoke-test ``capture_main`` when its arguments are passed positionally.

    The page is produced by ``dateFinder.buildTestHtml()`` and handed to
    ``capture_main`` with ``streamType="o"`` (presumably "object" input --
    confirm against ``capture_main``). The test only verifies that the call
    completes without raising; it makes no assertion about the returned data.
    """
    clsObj = CaptureContent()
    name = dateFinder.buildTestHtml()
    exclude = []
    minLength = 10
    streamType = "o"
    # Require both keywords to appear in a captured element.
    andOr = "and"
    include = ["dogma", "karma"]
    clsObj.capture_main(name, streamType, include, exclude, minLength, andOr, socket_timeout=None)