Example #1
def test_headline(self):
    """Test that a non-matching headline XPath yields "None"."""
    cnn_headline = scrape.Scraper(
        "https://www.cnn.com/australia",
        "//img[@class='media__image media__image--responsive']",
        "//h3[@data-analytics='dummy_class']")
    cnn_headline.scraper()
    self.assertEqual(cnn_headline.print_headline(), "None")
Example #2
def scraping(self):
    """
    Scrapes and displays website information.
    """
    obj = scrape.Scraper(self.scrape_url)
    obj.display_title()
    obj.display_header()
    obj.display_links()
Example #3
import arxivscraper
import pandas as pd


def getRecords(start):
    # Usage of "until" is discouraged; see the Datestamps section of
    # https://arxiv.org/help/oa/index
    # arxiv_repo is expected to be defined at module level.
    scraper = arxivscraper.Scraper(category=arxiv_repo, date_from=start)
    output = scraper.scrape()

    # scrape() returns a list of record dicts; use the first record's
    # keys as the DataFrame columns.
    cols = output[0].keys()
    df = pd.DataFrame(output, columns=cols)

    return df
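A hypothetical call, assuming arxiv_repo has been set to a valid arXiv category (the category and date below are placeholders):

arxiv_repo = 'cs'              # placeholder category
df = getRecords('2023-01-01')  # records deposited on or after this date
print(df.head())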
Example #4
def update():

    url = request.form.get('url')
    keywords = request.form.get('keywords')
    num = request.form.get('num')

    my_article = scrape.Scraper(url, keywords, num)
    my_article.scraping()

    # print(my_article.article_list)
    # print(my_article.date_list)
    # print(my_article.link_list)
    # print(my_article.title_list)

    return render_template('home.html',
                           msg=my_article.msg,
                           article_list=my_article.article_list,
                           authors=my_article.author_list,
                           date=my_article.date_list,
                           link=my_article.link_list,
                           title=my_article.title_list)
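This reads like a Flask view; a minimal sketch of how it might be wired up, assuming a standard Flask app (the route path and module layout are assumptions, not from the original):

from flask import Flask, render_template, request

import scrape

app = Flask(__name__)
# Register update() under the assumed path; the form fields 'url',
# 'keywords', and 'num' arrive via a POST request.
app.add_url_rule('/update', view_func=update, methods=['POST'])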
Example #5
import argparse

import collect
import scrape

go_nums = collect.get_go_nums()
scraper = scrape.Scraper()

try:
    for go_num in go_nums:
        print(go_num)
        scraper.find_case(go_num)
except Exception as e:
    print('Encountered unexpected error:', e)

scraper.close()
Example #6
import sys

import convert
import scrape


def main():
    print("Online Novel to PDF")

    title = input("Title of PDF: ")

    address = input("Address of first page: ")
    index = int(input("Incrementing index: "))
    #parse Address
    parts = address.split(str(index))
    prefix = parts[0]
    suffix = parts[1]

    titleKeyword = input("Keyword in title "
                         "(to make sure only scraping what is needed): ")

    address = prefix + str(index) + suffix
    scraper = scrape.Scraper(address)

    tag = ''
    attrs = {}

    if input("Do you know which HTML container the content resides in? "
             ) == 'yes':
        tag = input("Element type: ")
        attrName = input("Element Attribute Name: ")
        attrValue = input("Element Attribute Value: ")
        attrs = {attrName: attrValue}
    else:
        print(
            "You will be shown the text of each of the elements on the page. "
            "If the text matches the content you are looking to scrape, "
            "type in yes. If not, just press enter.")
        elems = scraper.getAllElements()
        # Start at index 2, skipping the first elements of the parsed page.
        for i in range(2, len(elems)):
            content = elems[i].text
            if content != "":
                print(elems[i].text)
                if input("Is this correct?") == 'yes':
                    #Find element attributes
                    elem = elems[i]
                    tag = elem.name
                    attrs = elem.attrs
                    break

    if tag == '':
        print("No element selected. Exiting.")
        sys.exit()

    pdf = convert.Converter()

    # Scrape successive pages, appending each page's text to the PDF,
    # until the page title no longer contains the keyword.
    while True:
        text = scraper.getText(tag, attrs)
        pdf.addText(text)
        print(text)
        index += 1
        address = prefix + str(index) + suffix
        scraper.goTo(address)
        if titleKeyword not in scraper.getText('title'):
            break

    pdf.printPDF(title)
    print("done!")
Example #7
import sys
import scrape
import logging

if __name__ == "__main__":
    args = sys.argv[1:]

    if len(args) >= 3:
        worker = scrape.Scraper(int(args[0]),
                                last_link_index=int(args[1]),
                                last_page_index=int(args[2]))
    else:
        worker = scrape.Scraper(int(args[0]))
        #worker = scrape.WireScraper(8, 18, 1)

    worker.main()
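A hypothetical invocation, assuming this file is the entry point (the file name is a placeholder; the values echo the commented-out WireScraper(8, 18, 1) call):

python main.py 8 18 1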
Example #8
import sys

import scrape

# name, season_from=NUMBER, season_to=NUMBER, saving_dir
scrape.Scraper().scrape(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
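A hypothetical invocation (the script name and argument values are placeholders; per the comment, the middle two arguments are season numbers):

python run.py "Some Show" 1 3 ./episodes

Note that sys.argv values arrive as strings, so scrape() presumably converts the season bounds itself.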
Example #9
def test_duplicate(self):
    """Test the remove duplicate function."""
    cnn_headline = scrape.Scraper(
        "https://www.cnn.com/australia",
        "//img[@class='media__image media__image--responsive']",
        "//h3[@data-analytics='dummy_class']")
    self.assertEqual(cnn_headline.remove_duplicate(["a", "a", "a"]), {"a"})
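Examples #1 and #9 are written as methods of a test class; a minimal sketch of the enclosing harness, assuming the standard unittest module (the class name is an assumption):

import unittest

import scrape


class ScraperTest(unittest.TestCase):
    # test_headline (Example #1) and test_duplicate (Example #9) go here.
    ...


if __name__ == '__main__':
    unittest.main()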