def test_headline(self):
    """Check that print_headline() returns the string "None".

    The scraper is built with a headline XPath that targets a
    ``dummy_class`` analytics attribute (presumably matching nothing on
    the page -- confirm against the Scraper implementation).
    """
    page_url = "https://www.cnn.com/australia"
    image_xpath = "//img[@class='media__image media__image--responsive']"
    headline_xpath = "//h3[@data-analytics='dummy_class']"

    scraper = scrape.Scraper(page_url, image_xpath, headline_xpath)
    scraper.scraper()

    headline = scraper.print_headline()
    self.assertEqual(headline, "None")
def scraping(self):
    """Scrape ``self.scrape_url`` and display its title, header and links."""
    page = scrape.Scraper(self.scrape_url)
    # The display calls run in a fixed order: title, then header, then links.
    page.display_title()
    page.display_header()
    page.display_links()
def getRecords(start):
    """Scrape arXiv records dated from ``start`` onwards into a DataFrame.

    The category scraped is the module-level ``arxiv_repo``.

    Args:
        start: value passed to ``arxivscraper.Scraper`` as ``date_from``.

    Returns:
        A ``pandas.DataFrame`` whose columns are the keys of the first
        scraped record, or an empty DataFrame when nothing was scraped.
    """
    # usage of "until" is discouraged, see:
    # https://arxiv.org/help/oa/index @ Datestamps section
    scraper = arxivscraper.Scraper(category=arxiv_repo, date_from=start)
    output = scraper.scrape()

    if not output:
        # No records: output[0] below would raise IndexError.
        return pd.DataFrame()

    cols = output[0].keys()
    return pd.DataFrame(output, columns=cols)
def update():
    """Run a scrape from the submitted form and render the results page.

    Reads ``url``, ``keywords`` and ``num`` from the request form, passes
    them to ``scrape.Scraper``, runs the scrape, and renders ``home.html``
    with the scraper's message and result lists.
    """
    form = request.form
    my_article = scrape.Scraper(
        form.get('url'),
        form.get('keywords'),
        form.get('num'),
    )
    my_article.scraping()

    # Template variable name -> value collected by the scraper.
    context = {
        'msg': my_article.msg,
        'article_list': my_article.article_list,
        'authors': my_article.author_list,
        'date': my_article.date_list,
        'link': my_article.link_list,
        'title': my_article.title_list,
    }
    return render_template('home.html', **context)
import argparse
import collect
import scrape

# Look up every GO number with one shared scraper, printing each number
# before it is processed.
go_nums = collect.get_go_nums()
scraper = scrape.Scraper()
try:
    for go_num in go_nums:
        print(go_num)
        scraper.find_case(go_num)
except Exception as e:
    # Formatted as a single string so the output is the same whether print
    # is a statement (Python 2) or a function (Python 3).
    print('Encountered unexpected error:  %s' % (e,))
finally:
    # Always release the scraper, including on KeyboardInterrupt or
    # SystemExit, which ``except Exception`` does not catch.
    scraper.close()
def _split_address(address, index):
    """Split ``address`` around the first occurrence of ``index``.

    Returns:
        A ``(prefix, suffix)`` pair such that
        ``prefix + str(index) + suffix == address``.

    Raises:
        ValueError: if ``str(index)`` does not occur in ``address``.
    """
    # partition() splits only at the first occurrence, so a suffix that
    # happens to contain the same digits again is kept intact.
    prefix, marker, suffix = address.partition(str(index))
    if not marker:
        raise ValueError(
            "Index {} does not appear in address {!r}".format(index, address))
    return prefix, suffix


def _choose_container(scraper):
    """Ask the user which HTML element holds the content to scrape.

    Returns:
        A ``(tag, attrs)`` pair. ``tag`` is ``''`` (and ``attrs`` is ``''``)
        when the user did not confirm any element.
    """
    if input("Do you know which HTML container the content resides in? "
             ) == 'yes':
        tag = input("Element type: ")
        attr_name = input("Element Atrribute Name: ")
        attr_value = input("Element Attribute Value: ")
        return tag, {attr_name: attr_value}

    print(
        "You will be shown the text of each of the elements on the page. If "
        "the text matches the content you are looking to scrape, type in yes. "
        "If not, Just press enter.")
    elems = scraper.getAllElements()
    # The walk starts at the third element; the first two are never offered.
    for i in range(2, len(elems)):
        elem = elems[i]
        content = elem.text
        if content != "":
            print(content)
            if input("Is this correct?") == 'yes':
                return elem.name, elem.attrs
    return '', ''


def main():
    """Interactively scrape a numbered series of pages into one PDF.

    Prompts for the PDF title, the address of the first page, the number in
    that address that increments from page to page, and a keyword expected
    in each page's title. The chosen element's text is added to the PDF for
    the first page and for every following page (index + 1, index + 2, ...)
    while the page title still contains the keyword.

    Raises:
        ValueError: if the incrementing index entered is not an integer.
        SystemExit: if the index does not occur in the address, or if no
            HTML element was selected.
    """
    print("Online Novel to PDF")
    title = input("Title of PDF: ")
    address = input("Address of first page: ")
    index = int(input("Incrementing index: "))

    # Parse the address into the parts around the incrementing index.
    try:
        prefix, suffix = _split_address(address, index)
    except ValueError as err:
        raise SystemExit(str(err))

    title_keyword = input(
        "Keyword in Title "
        "(To make sure only scraping what is needed): ")

    scraper = scrape.Scraper(address)

    tag, attrs = _choose_container(scraper)
    if tag == '':
        print("No element selected. Exiting.")
        raise SystemExit(0)

    pdf = convert.Converter()
    # The first page is always scraped; later pages are scraped only while
    # their title still contains the keyword.
    while True:
        text = scraper.getText(tag, attrs)
        pdf.addText(text)
        print(text)

        index += 1
        scraper.goTo(prefix + str(index) + suffix)
        if title_keyword not in scraper.getText('title'):
            break

    pdf.printPDF(title)
    print("done!")
import sys
import scrape
import logging

if __name__ == "__main__":
    # Command line: one numeric argument, optionally followed by a
    # last-link index and a last-page index to pass to the scraper.
    args = sys.argv[1:]
    if len(args) >= 3:
        # All three values are needed here; checking only for "more than
        # one" argument let a two-argument call fail on args[2].
        # NOTE(review): args[2] is passed as a string while the other two
        # are converted with int() -- confirm which type Scraper expects.
        worker = scrape.Scraper(int(args[0]),
                                last_link_index=int(args[1]),
                                last_page_index=(args[2]))
    elif len(args) == 1:
        worker = scrape.Scraper(int(args[0]))
    else:
        sys.exit("usage: {} <number> [<last_link_index> <last_page_index>]"
                 .format(sys.argv[0]))
    #worker = scrape.WireScraper(8, 18, 1)
    worker.main()
import sys
import scrape

# Command-line arguments, in order:
# name, season_from=NUMBER, season_to=NUMBER, saving_dir
if len(sys.argv) < 5:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit("usage: {} <name> <season_from> <season_to> <saving_dir>"
             .format(sys.argv[0]))

# The arguments are forwarded exactly as received (strings).
scrape.Scraper().scrape(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
def test_duplicate(self):
    """Check that remove_duplicate reduces repeated items to a set."""
    scraper = scrape.Scraper(
        "https://www.cnn.com/australia",
        "//img[@class='media__image media__image--responsive']",
        "//h3[@data-analytics='dummy_class']",
    )
    repeated = ["a", "a", "a"]
    deduplicated = scraper.remove_duplicate(repeated)
    self.assertEqual(deduplicated, {"a"})