Example #1
def test_all():
    main = scraper.get_soup("http://caixin.com/")
    links = get_links(main)
    num_links = len(links)

    i = 0
    for link in links:
        # log the raw link tuple for debugging
        print link
        print i
        url, category, date = link
        print url
        print category
        print date
        article = scraper.get_soup(url)
        meta = scraper.get_meta(article)
        print meta
        content = get_content(article)
        if content:
            for line in content:  # reuse the already-extracted content instead of re-parsing
                print line
        else:
            print "PHOTOS ONLY"
        i += 1

    print("SUCCESFULLY RETRIEVED " + str(i) + "/" + str(num_links))
Example #2
def test_all():
    root = "http://www.nanzao.com"
    # initialise counters first so the summary lines below work even when the fetch fails
    i = 0
    num_links = 0
    main = scraper.get_soup(root)
    if main:
        print "Fetched index page"
        all_links = get_links(main)
        num_links = len(all_links)
        print "num_links", num_links

        for url, cat in all_links:
            print "URL AND CAT"
            print url, cat
            soup = scraper.get_soup(root + url)
            if soup:
                publishdate = get_publishdate(soup)
                print "PUBLISHDATE"
                print publishdate
                content = get_content(soup)
                print "CONTENT"
                print content
                meta = scraper.get_meta(soup)
                print "META"
                print meta
                print "ARTICLE NUM", i
                i += 1
    print "NUMBER OF SUCCESSFULLY FETCHED ARTICLES: ", i
    print "TOTAL NUMBER OF ARTICLES", num_links
Example #3
def test_all():
    print "test11"

    main = scraper.get_soup("http://www.infzm.com/")

    print "test22"
    links = get_links(main)
    num_links = len(links)

    i = 0
    for link in links:
        print i
        print link
        article = scraper.get_soup(link)
        meta = scraper.get_meta(article)
        print meta
        content = get_content(article)
        print content
        i += 1

    print("SUCCESFULLY RETRIEVED " + str(i) + "/" + str(num_links))
Example #4
def test_all():
    main = scraper.get_soup("http://cyol.com")
    links = get_links(main)
    num_links = len(links)
    i = 0
    for link in links:
        print i
        url, category, date = link
        print url
        print category
        print date
        article = scraper.get_soup(url)
        meta = scraper.get_meta(article)
        print meta
        author = get_author(article)
        print author
        content = get_content(article)
        print content
        i += 1

    print("SUCCESFULLY RETRIEVED " + str(i) + "/" + str(num_links))
Example #5
def test_all():
    """Test scraper on all articles"""
    main = scraper.get_soup("http://www.huanqiu.com/")
    # initialise counters first so the summary lines below work even when the fetch fails
    i = 0
    num_links = 0
    if main:
        print "got the main soup"
        all_links = get_links(main)
        num_links = len(all_links)

        for link in all_links:
            # log the raw link for debugging
            print link

            category = get_category(link)
            print category
            soup = scraper.get_soup(link)
            if soup:
                encoded_content = list(get_content(soup))
                all_meta = scraper.get_meta(soup)
                author = get_author(soup)
                publishdate = get_publishdate(soup)
                parsedate = time.strftime("%Y-%m-%d %H:%M:%S")

                #for testing
                if encoded_content:
                    for p in encoded_content:
                        print p
                print all_meta
                print author
                print publishdate
                print parsedate

                print "ARTICLE NUMBER", i
                i += 1

    print "NUMBER OF SUCCESSFULLY FETCHED ARTICLES: ", i
    print "TOTAL NUMBER OF ARTICLES", num_links