def run_for_url(self, ch, method, properties, body):
    doc_url = body.decode("utf-8")
    print("[Crawler] Received %r" % doc_url)

    document_text = WebScraper.get_text(doc_url)
    document_links = WebScraper.get_links(doc_url)

    hash_object = hashlib.sha256(document_text.encode("utf-8"))
    digest = hash_object.hexdigest()

    # Create a document record for this URL if one does not exist yet
    doc_record = self.document_client.get_by_url(doc_url)
    if "id" not in doc_record:
        doc_record = self.document_client.create(doc_url, digest)

    # Index the document text if it has not been indexed yet
    doc_indexed = self.indexing_client.get_by_id(doc_record["id"])
    if "url" not in doc_indexed:
        self.indexing_client.index(doc_record["id"], doc_url, document_text)

    # Re-index when the stored digest no longer matches the current content
    if doc_record["digest"] != digest:
        self.document_client.update_digest(doc_record["id"], digest)
        self.indexing_client.update_content(doc_record["id"], document_text)

    # Record child links and enqueue them for crawling, up to the crawl limit
    for link in document_links:
        if self.url_counter < Crawler.MAX_URL:
            self.url_counter += 1
            child_doc_record = self.document_client.get_by_url(link.geturl())
            if "id" not in child_doc_record:
                # Child page not fetched yet, so store a placeholder digest
                child_doc_record = self.document_client.create(link.geturl(), "digest")
            self.document_client.create_link(doc_record["id"], child_doc_record["id"])
            self.producer.publish(link.geturl())

    self.pagerank_client.update(doc_record["id"])
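
The (ch, method, properties, body) signature matches a pika (RabbitMQ) consumer callback, so run_for_url is presumably registered against a URL queue; a minimal wiring sketch, where the queue name and the Crawler constructor are assumptions:

# Hypothetical consumer wiring for run_for_url; the queue name and Crawler() are assumed.
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue='urls')

crawler = Crawler()  # assumed to expose run_for_url(ch, method, properties, body)
channel.basic_consume(queue='urls', on_message_callback=crawler.run_for_url, auto_ack=True)
channel.start_consuming()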
Example #3
def push_photo():   
    # read request data
    data = json.loads(request.get_data())
    # Download the photo from the bucket and classify it
    photo_scraper = PhotoScraper(TEMP_DIR)
    output_file = photo_scraper.get_picture(data['bucket'], data['file_name'])
    classification = r_scraper.classify_photo(output_file)

    camera_id = data['camera_id']
    mode = data['mode']
    date = data['time_in']

    # Post the classified entry to the web backend
    web = WebScraper()
    web.post_entry({
        'time_in'  : date,
        'camera_id': camera_id,
        'item_name': classification})
    return 'success'
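
push_photo reads a JSON request body, so it is presumably registered as a POST route in a Flask-style app; a hedged client sketch, where the host and route path are assumptions and the payload keys mirror the handler above:

# Hypothetical client call; URL and route are assumed, the keys mirror push_photo above.
import requests

payload = {
    'bucket': 'camera-uploads',        # assumed bucket name
    'file_name': 'frame_0001.jpg',     # assumed object key
    'camera_id': 7,
    'mode': 'entry',
    'time_in': '2021-05-01T12:00:00',
}
response = requests.post('http://localhost:5000/push_photo', json=payload)
print(response.text)  # expected: 'success'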
Example #4
def get_ISIN_download_pdf(funds, ISIN_file = 'ISINs.csv', headless = False):
    '''
    Locate and download the most recent report for each fund, and save the ISIN numbers.

    Parameters
    ----------
    funds : list
        The funds to be found.
    ISIN_file : str
        The filename to which ISINs are written.
    headless : bool
        If True, the browser is run in headless mode.
    '''
    
    scraper = WebScraper('C:/Users/Ollie/Downloads/chromedriver_win32/chromedriver', headless = headless)
        
    ISINs = {}
    n_funds = len(funds)
    for i, fund in enumerate(funds):
        nospace_fund = fund.replace(' ', '_')
        # Rename any downloaded pdfs as you go
        renamed_files = scraper.rename_downloads_if_done()
        # Add any renamed files to the ISIN doc
        if len(renamed_files) > 0:
            for fund_path in renamed_files:
                temp_fund_name = fund_path.replace('./pdf_downloads/','').replace('.pdf','')
                fund_id, ISIN = ISINs.pop(temp_fund_name)
                write_ISIN(ISIN_file, fund, fund_id, ISIN)
                
        
        print('\n\n')
        completion = round(i/n_funds * 100, 1)
        print(f'Fund {i} of {n_funds} - {completion}% complete')

        # Get the fund id
        fund_id, ISIN = scraper.get_fund_id_ISIN(fund)
        if fund_id is None:
            # If you can't find the fund, write not found in the ISIN doc
            write_ISIN(ISIN_file, fund, 'Not Found', 'Not Found')
        else:
            success = scraper.download_pdf(fund_id, './pdf_downloads/' + nospace_fund + '.pdf')
            if not success:
                # Write the fund ISIN into the csv straight away if there is no pdf. Otherwise, wait until the
                # pdf is found.
                write_ISIN(ISIN_file, fund, fund_id, ISIN)
            else:
                # If you are waiting for the pdf to download, store the ISIN temporarily
                ISINs.update({nospace_fund : [fund_id, ISIN]})
        

    scraper.rename_downloads()
    scraper.kill()
Example #5
    def search_and_add_station(self):
        scraper = WebScraper(self)
        redo = True
        while redo:
            redo = False

            name, url = scraper.get_stream_url()

            self.show('Confirm: Name: ' + name + ' URL: ' + url +
                      '\nEnter \'Ok\' or \'redo\'.\n')
            command = input()
            if command.lower() == 'ok':
                self.settings['stations'].append(name)
                self.settings['urls'].append(url)
                self.show('Added.')
            else:
                redo = True
Example #6
def choose_random_quote(k, url):
    w = WebScraper()

    html = w.get_html(url)
    # html = urllib.urlopen(w.get_html(url)).read().decode()

    quotes = set()

    for li in html.select(k):
        for quote in li.text.split('\n'):
            # Skip empty strings, author-name lines (i.e. "FirstName LastName"),
            # quotes shorter than four words, and the site's "looking for more" filler line.
            if (len(quote) > 0 and len(quote.split()) > 3 and quote !=
                    "Ahh!!! Still looking for more? We have something for you."
                ):
                quotes.add(quote.strip())

    q = random.choice(list(quotes))

    return q
Example #7
def generate_data(args, loop, executor):
    doc_queue = asyncio.Queue()

    if args.web_scraper:
        scraper = WebScraper(loop, doc_queue)
    else:
        scraper = ElasticScraper(loop, doc_queue)

    exporter = CsvExporter("export.csv")
    parser = Parser(loop, executor, doc_queue, exporter,
                    thread_pool_size)

    try:
        scraper_task = loop.create_task(scraper.start_scraping())
        parser_task = loop.create_task(parser.start_processing())
        yield from asyncio.gather(scraper_task, parser_task, loop=loop)
    except asyncio.CancelledError:
        pass
    finally:
        scraper_task.cancel()
        parser_task.cancel()

    path = "data/price_data.csv".format(datetime.now())
    exporter.upload_to_S3("cz-whatthehack-local-information", path)
Example #8
BASE_DIR = Path(__file__).resolve().parent.parent
output_file = os.path.join(BASE_DIR, 'data/test.csv')

with open('dates.json') as f:
    dates = json.load(f)
with open('urls.json') as f:
    url = json.load(f)

checkins = dates['checkin']
checkouts = dates['checkout']

for checkin, checkout in zip(checkins[10:], checkouts[10:]):
    web1_url = url['web1'].format(checkin=checkin)
    print(f'\n\nAppending website1 data for {checkin}')
    web1 = WebScraper(web1_url, 'website1', checkin) # First page
    web1.scrape(output_file)
    for i in tqdm(range(2, 101)):
        web1 = WebScraper(web1_url, 'website1', checkin, page=i)
        web1.scrape(output_file)
        if not web1.MorePages:
            break

    web2_url = url['web2'].format(checkin=checkin, checkout=checkout)
    print(f'\n\nAppending website2 data for {checkin}')
    web2 = WebScraper(web2_url, 'website2', checkin, checkout) # First page
    web2.scrape(output_file)
    for i in tqdm(range(20, 2001, 20)): # Offset starts at 20 and increases by 20
        web2 = WebScraper(web2_url, 'website2', checkin, checkout, i)
        web2.scrape(output_file)
        if not web2.MorePages:
            break
Example #9
def main():
    app = WebScraper(AnchorScraper())
    #url = 'https://www.tercalivre.com.br'
    url = 'https://github.com'
    app.load(url)
    sys.exit(app.exec_())
Example #10
    def test_Date(self):
        scraper = WebScraper()
        self.assertEqual(scraper._parse_date("1982"), "1982",
                         "Unexpected date response for valid date format")
        self.assertEqual(scraper._parse_date("1983-08-14"), "1983",
                         "Unexpected date response for valid date format")
        self.assertEqual(scraper._parse_date("1986-08-14 00:00:00"), "1986",
                         "Unexpected date response for valid date format")
        self.assertEqual(scraper._parse_date("19/08/1980"), "1980",
                         "Unexpected date response for valid date format")
        self.assertEqual(scraper._parse_date("1980-08"), "1980",
                         "Unexpected date response for valid date format")
        self.assertEqual(scraper._parse_date(""), "1970",
                         "Unexpected date response for invalid date format")
        self.assertEqual(scraper._parse_date("no date"), "1970",
                         "Unexpected date response for invalid date format")
        self.assertEqual(scraper._parse_date(None), "1970",
                         "Unexpected date response for invalid date format")
        self.assertEqual(scraper._parse_date("test"), "1970",
                         "Unexpected date response for invalid date format")
Example #11
import os

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

from data_manager import DataManager  # assumed local module, alongside the two below
from notification_manager import NotificationManager
from web_scraper import WebScraper

keyvault_url = os.environ.get('KEYVAULT_URL')
credential = DefaultAzureCredential()
kv_client = SecretClient(keyvault_url, credential)

MY_EMAIL = kv_client.get_secret('fake-email').value
PASSWORD = kv_client.get_secret('fake-email-password').value
TARGET_EMAIL = kv_client.get_secret('target-email').value

target_product = input("Please enter the full url of the product you wish to search: ")
preferred_price = float(input("Please enter your target price: ").strip('$'))

scraper = WebScraper()

product_dict = scraper.retrieve_price_from_site(target_product, preferred_price)
data_manager = DataManager(product_dict)

data_manager.check_if_item_in_data_file()
products_to_buy = data_manager.check_if_price_below_target()

notification_manager = NotificationManager(MY_EMAIL, PASSWORD)

notification_manager.send_email(products_to_buy, TARGET_EMAIL)




Example #12
def _get_stat(player, stat, year):
    url_retriever = HockeyReferenceUrlRetriever(player)
    scraper = WebScraper(url_retriever.get_url())
    return scraper.get_player_stats_for_year(f'{stat}', year)
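
A possible call, where the player name and stat key are purely illustrative assumptions, not part of the example:

# Hypothetical usage of _get_stat; the player and stat names below are illustrative only.
goals_2019 = _get_stat("Connor McDavid", "goals", 2019)
print(goals_2019)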
Example #13
from web_scraper import WebScraper
from indeed_scraper import IndeedScraper
from csv_saver import CsvSaver

scraper = WebScraper(
    IndeedScraper, CsvSaver('data_scientist.csv'), {
        'job_title': '',
        'location': '',
        'max_count': 50,
        'save_links': True,
        'advance_request': 'q=\"data+scientist\"&limit=50'
    })
scraper.scrape()
Example #14
    headless = True
    csv_file = 'ISINs.csv'
    
    # Remove any uncompleted downloads and leftover PDFs
    for file in os.listdir():
        if file.endswith('.crdownload') or file.endswith('.pdf'):
            os.remove(file)
    # Create download folder
    if not os.path.exists('./pdf_downloads'):
        os.mkdir('./pdf_downloads')
        
        
    # If this is the first time running, you need to get the fund names. If not you can load a save file.
    if os.path.exists('funds.p'):
        funds = pickle.load(open('funds.p','rb'))
    else:
        # Put your path to the chromedriver binary here!
        scraper = WebScraper('C:/Users/Ollie/Downloads/chromedriver_win32/chromedriver', headless = headless)
        funds = scraper.get_fund_list()
        scraper.kill()
        pickle.dump(funds, open('funds.p','wb'))
        
    # If the CSV file does not exist, fill in the headers
    if not os.path.exists(csv_file):
        with open(csv_file, 'w') as file:
            file.write('Funds,ISINs\n')
        uncompleted_funds = funds
    
    
    entries = genfromtxt(csv_file, delimiter=',', dtype = str, skip_header = 1)

    completed_funds = list(entries[:,0]) if entries.shape[0] > 0 else []
Example #15
def get_contents_from_xpaths():
    json_dict = request.get_json()
    scraper = WebScraper(json_dict)
    return jsonify(**scraper.scrape())
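
The route hands the request's JSON straight to WebScraper and returns scraper.scrape() as JSON, so a client would POST whatever mapping WebScraper(json_dict) expects; a hedged client sketch, where the route path, host, and payload shape (field name to XPath) are all assumptions:

# Hypothetical client call; route path, host, and payload keys are assumptions.
import requests

payload = {
    'url': 'https://example.com',          # page to scrape (assumed key)
    'xpaths': {'title': '//h1/text()'},    # field name -> XPath (assumed shape)
}
print(requests.post('http://localhost:5000/contents', json=payload).json())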
Example #16
from cluster_point import CPDatabase
from web_scraper import WebScraper

if __name__ == '__main__':
    cp_db = CPDatabase()
    webscraper = WebScraper()

    ingredients = ["niacin", "salt", "sugar", "colors", "glucose","water","riboflavin","glycerin","fructose","gelatin","iron","niacinamide","spice","maltodextrin","molasses","semolina","cornstarch","caffeine","pectine","sucralose","carmine","palmoil","shortening","monosodium","cornsyrup","whey","corn","lactose","yellow5", "dextrose","yellow6","vitamin-B","soybean","bht","blue1","xanthangum"]
    for ingredient in ingredients:
        data = webscraper.get_data_in_cluster_format(ingredient)
        cp_db.insert(data)

    # cp_db.insert({u'Skin conditions': u'Niacinamide has been used in skin care products, including moisturizers, anti-aging products, and treatments for rosacea, a skin condition involving facial redness and pimples. The benefits of niacinamide in skin care needs to be further studied before conclusions can be made.', u'Type 2 diabetes': u'Limited research shows unclear effects of niacin on outcomes of type 2 diabetes. Human research has also shown that niacin increases blood sugar levels. People should seek medical advice before starting niacin. Further research is needed to draw conclusions.', u'Type 1 diabetes mellitus prevention (niacinamide)': u'In human research, niacinamide lacked an effect on the development of diabetes (type 1). Evidence is mixed and more study is needed in this area. ', u'Pellagra': u'Niacin (vitamin B3) and niacinamide are U.S. Food and Drug Administration (FDA)-approved for the treatment of pellagra, or niacin deficiency. Pellagra is a nutritional disease that occurs due to insufficient dietary amounts of vitamin B3 or the chemical it is made from (tryptophan). Symptoms of pellagra include skin disease, diarrhea, dementia, and depression.', u'Clogged arteries': u'Niacin decreases blood levels of cholesterol, which may reduce the risk of clogged or hardened arteries. However, niacin also can increase homocysteine levels, which may have the opposite effect. Overall, the research supports the use of niacin in combination with other drugs to reduce the risk of clogged arteries. More research is needed in this area before a firm conclusion can be drawn.', u'Age-related macular disease (eye disease)': u'Early evidence suggests that niacin may have beneficial effects in age-related macular degeneration (AMD), a disease that often leads to vision loss. More well-designed studies are needed for conclusions to be reached.', u"Alzheimer's disease (mental decline)": u"Dementia can be caused by a severe lack of niacin in the body. Early evidence suggests that taking more niacin in the diet may slow the onset of Alzheimer's disease and mental decline. Further research is needed before a conclusion can be drawn. ", u'Osteoarthritis (niacinamide)': u'Early research suggests that niacinamide may be useful in the treatment of osteoarthritis. Further research is needed before a conclusion can be made.', u'Erectile dysfunction': u'Early research suggests that niacin has a beneficial effect on erectile dysfunction. However, further well-controlled studies are needed to draw conclusions. ', u'High blood phosphorous level': u'Early evidence shows that niacinamide had reduced high phosphate levels in the blood. However, more research is needed before a firm conclusion can be made.', u'Hepatitis C': u'Early research suggests that niacin may decrease blood levels of hepatitis C, a virus that damages the liver. Notably, niacin has been also associated with liver damage. Further research is necessary for a conclusion to be made. ', u'High cholesterol': u'Niacin is a well-accepted treatment for high cholesterol. Multiple studies show that niacin (not niacinamide) has benefits on levels of high-density cholesterol (HDL or "good cholesterol"), with better results than drugs such as "statins" like atorvastatin (Lipitor\xae). There are also benefits on levels of low-density cholesterol (LDL or "bad cholesterol"), although these effects are less dramatic. Adding niacin to a second drug such as a statin may increase the effects on low-density lipoproteins. 
    # The use of niacin for the treatment of high cholesterol associated with type 2 diabetes has been controversial because of the possibility of worsening blood sugar control. People should check with a physician and pharmacist before starting niacin.', u'Heart disease': u'Niacin decreases levels of cholesterol, and other chemicals in the blood, which can reduce the risk of heart disease. However, niacin also increases homocysteine levels, which can increase this risk. Research has shown beneficial effects of niacin, especially in combination with other drugs, for preventing heart disease and fatal heart attacks. Further study is needed to draw conclusions.', u'Type 1 diabetes (slowing progression)': u'Non-human research shows that niacinamide delays the onset of insulin dependence in type 1 diabetes. However, human research assessing whether niacinamide slows progression of type 1 diabetes has yielded unclear results. Further study is needed for conclusions to be reached. ', u'Headaches': u'Early research shows that niacin may be beneficial in the treatment or prevention of headaches. More research is needed.'})
    # cp_db.insert({'niacin': {'text': 'example value'}})
    # cp_db.insert({'ingredient2': {'conditions': [{'condition': 'Disease','description': 'Description of desease'}, {'condition': 'Another disease','description': 'Description of desease'}]}})
Example #17
    logging.root.setLevel(logging.INFO)
    filename = "Data Gathering - Covid in Economy.csv"
    data_directory = "data"
    result_directory = "results/COVID"
    parent_dir = os.path.dirname(os.getcwd())
    data_df = pandas.read_csv(os.path.join(parent_dir, data_directory,
                                           filename),
                              encoding="UTF-8")
    urls = data_df["URL"]
    unique_identifiers = data_df["Summary or Document Name"]

    for index, url in enumerate(urls, start=1):
        logging.info("Processing document {index} from {url}".format(
            index=index, url=url))
        with open(os.path.join(parent_dir, data_directory, 'covid',
                               str(index) + '.txt'),
                  "r",
                  encoding="utf8") as f:
            document = f.read()
        document = WebScraper.sanitize_text(document)
        document = document.replace('\n\n', '\n')
        if logging.root.level == logging.DEBUG:
            print(document)
        toolkit = NLPToolkit(
            document,
            os.path.join(parent_dir, result_directory,
                         "{index}.json".format(index=index)),
            unique_identifiers[index - 1],
        )
        time.sleep(1)