Example #1
def homePageScrapper(html):
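    """Parse the results-page HTML, map product names to links from the
    "mainResults" section, write them to data/homePageLinks.csv and
    data/homePageLinks.txt, then pass the .txt file on to scraper()."""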
    soup = bs(html, 'lxml')
    targetSection = soup.find(id="mainResults")
    targetDivs = targetSection.findAll('div', {'class': 's-item-container'})
    links = {}
    for div in targetDivs:
        itemLink = div.find(
            'a', {
                'class':
                'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'
            })
        itemName = div.find(
            'h2',
            {'class': 'a-size-base s-inline s-access-title a-text-normal'})
        links[itemName.text] = itemLink.get('href')

    file1 = open("data/homePageLinks.csv", "w")
    file2 = open("data/homePageLinks.txt", "w")
    file1.write('"Product Name","Link",\n')
    for key in links:
        fileEntry = '"' + key + '"' + ',' + '"' + links[key] + '"' + '\n'
        file1.write(fileEntry)
        file2.write(key + '~' + links[key] + '\n')
    file1.close()
    file2.close()
    print('\ncreated a file data/homePageLinks.csv\n')
    scraper("homePageLinks.txt")
Example #2
def await_market_open(num):
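    """Check whether the equity market is open today via the brokerage client's
    market-hours endpoint; if it is, kick off the scrape/assess/trade pipeline,
    otherwise exit. Gives up after a few attempts (num > 4)."""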
    setup_folders()
    num += 1
    if num > 4:
        print("market not opening today")
        return
    print("checking time...")
    client = build_client()
    today = datetime.date.today()
    try:
        clock = client.get_hours_for_single_market(
            market=client.Markets.EQUITY, date=today).json()["equity"]["EQ"]
    except KeyError:
        clock = client.get_hours_for_single_market(
            market=client.Markets.EQUITY,
            date=today).json()["equity"]["equity"]

    # app starts right at 9:30 est from scheduler
    # if it's a trading day, start the app
    if clock["isOpen"]:
        print("Beginning process.")
        # This whole process (from scrape to starting watchdog) takes about
        # 2-5 minutes, so there's also inherently a delay between market open
        # and when the app starts trading.
        scraper()
        assess('skip')
        time.sleep(1)
        daily_trader('initial')
        time.sleep(1)
        run_watchdog(0)
    else:
        print("market not open today.")
        sys.exit()
Example #3
def main(audio_dir, json_dir, download_hours_ahead, audio_hours_retain,
         json_hours_retain):
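    """Run the scraper to refresh JSON metadata, prune audio/JSON files older
    than the retention windows, then download audio for the hours ahead."""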
    scraper.scraper(json_dir)
    logging.info("Scraper complete")
    sh.cleanup(audio_dir, audio_hours_retain)
    logging.info("Audio cleanup complete")
    sh.cleanup(json_dir, json_hours_retain)
    logging.info("JSON cleanup complete")
    downloader.downloader(download_hours_ahead, audio_dir, json_dir)
    logging.info("Downloader complete")
Example #4
def main():
    """does the work, does not have the classes"""
    scraper.scraper()   # run the scraper

    with open('output.csv', 'r') as csv_file:   #open file until done reading
        lists = LinkedList.LinkedList()

        # make the nodes from the .csv file and put them into a linked list
        for row in csv_file:
            state_text = repr(row.strip())
            lists.new_node(state_text)

    lists.print_list()

    # bools for loops, self-explanatory names
    has_state = False
    still_search = True

    while still_search:
        search_for = raw_input("\nEnter a letter to narrow search "
            "(1: exit, 0: whole list): ")
        print " "

        if search_for == "0":
            # print whole list
            lists.print_list()

        elif search_for == "1":
            # quit
            print "Ending search.\n"
            still_search = False

        else:
            # print only the states beginning with the user-entered letter
            current_node = lists.head

            while current_node.next != None:
                # navigate list until the end
                if current_node.cargo[0] == search_for.upper():
                    # if state starts with input, has_state = True
                    print current_node.cargo
                    has_state = True
                current_node = current_node.next

            if has_state == False:
                # if no states are found
                print "No state or territory found."

        has_state = False
Example #5
 def run(self):
     count = 0
     while True:
         try:
             tbd_url = self.frontier.get_tbd_url()
             if not tbd_url:
                 self.logger.info("Frontier is empty. Stopping Crawler.")
                 break
             resp = download(tbd_url, self.config, self.logger)
             self.logger.info(
                 f"Downloaded {tbd_url}, status <{resp.status}>, "
                 f"using cache {self.config.cache_server}.")
             scraped_urls = scraper(tbd_url, resp, self.mostCommonWords,
                                    self.icsSubDomains, self.longestPage,
                                    self.similarURLs)
             for scraped_url in scraped_urls:  #For each scraped url, add only if not discovered
                 if (scraped_url not in self.discoveredURLs):
                     self.discoveredURLs.add(scraped_url)
                     self.frontier.add_url(scraped_url)
             self.frontier.mark_url_complete(tbd_url)
             time.sleep(self.config.time_delay)
             count += 1
             print("\n", count, "\n")
         except Exception as e:
             # Avoid a bare except; report what actually went wrong instead of swallowing it.
             print("IT BLEW UPPPPPPPP:", e)
Example #6
    def run(self):

        # this is where we call the function to fetch and check robots.txt
        record = Recorder()
        while True:
            tbd_url = self.frontier.get_tbd_url()

            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            if not self.can_fetch(tbd_url):
                continue

            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                             f"using cache {self.config.cache_server}.")

            scraped_urls = scraper(tbd_url, resp)

            # adding data to recorder
            record.add_url(tbd_url)

            # Only record words when the download actually returned content for a valid URL.
            if resp.raw_response is not None and is_valid(tbd_url):
                record.add_words(resp.raw_response.content, tbd_url)

            record.save()

            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)

        record.finish_crawl_report()
Example #7
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            # Politeness: if this host was hit less than 500 milliseconds ago,
            # sleep for the remaining time plus a 10 ms buffer.
            current_time = int(round(time.time() * 1000))
            parsed = urlparse(tbd_url, allow_fragments=False)
            if parsed.netloc in self.time_visited:
                elapsed = current_time - self.time_visited[parsed.netloc]
                if elapsed < 500:
                    time.sleep((500 - elapsed + 10) * .001)
            current_time = int(round(time.time() * 1000))
            self.time_visited[parsed.netloc] = current_time

            resp = download(tbd_url, self.config, self.logger)
            if resp is False:
                continue
            else:
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")

                scraped_urls = scraper(tbd_url, resp)
                # print("scraped_urls: ", scraped_urls)
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
Example #8
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            # if there is a url to download on the frontier
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            # TODO: wrap the download call in a try/except in case of a timeout,
            # so that resp is never left undefined
            resp = download(tbd_url, self.config, self.logger)

            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")

            # after getting the response from the webpage, the function will
            # store the information in the self.frontier
            self.frontier.store_page_text_content(resp, tbd_url)

            scraped_urls = scraper(tbd_url, resp)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)

        self.frontier.close_files()

        print("number of unique pages is:", unique_pages(self.frontier.discovered_urls))
        print("longest page is:", longest_page(self.frontier.site_content))
        print("fifty most common words are here:",fifty_most_common_words(self.frontier.word_frequencies))
        print(ics_subdomain_frequencies(self.frontier.discovered_urls))
Example #9
def index():
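    """Flask-style view: scrape comments for the submitted link, preprocess them,
    run the sentiment model, and render the aggregated metrics."""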
    if request.args:
        link = request.args['link']
        n_posts = int(request.args['n_posts'])
        print(f'i got the args they are {link, n_posts}')

        # this is where the result computation starts
        raw_comments_df = scraper(link, n_posts)
        print('raw comments shape', raw_comments_df.shape)
        # print(raw_comments_df)
        preprocessed_comments_df = pd.DataFrame(
            {'text': raw_comments_df['text'].tolist()})
        preprocessed_comments_df['text'] = preprocessed_comments_df.text.apply(
            lambda x: str(preprocess(x)))
        preprocessed_comments_df = preprocessed_comments_df[
            preprocessed_comments_df['text'] != 'None']
        print('preproc comments shape', preprocessed_comments_df.shape)
        # print(preprocessed_comments_df)
        prediction = model_predict(preprocessed_comments_df)
        result = metrics(prediction)

        positive_n = result['positive_n']
        negative_n = result['negative_n']
        neutral_n = result['neutral_n']
        positive_index = result['positive_index']
        neutral_index = result['neutral_index']

        return render_template('result.html',
                               positive=positive_n,
                               negative=negative_n,
                               neutral=neutral_n,
                               positive_index=positive_index,
                               neutral_index=neutral_index)
    return render_template('index.html', links=[])
Example #10
def main():
    # output some information
    print("=====================================================")
    print("Welcome to Letterboxd Friend Ranker!")
    print("Instructions: This program compares you and")
    print("your friend's film taste. Once all the data has")
    print("been scraped and scores have been computed,")
    print("a report will be generated. The lower the avg.")
    print("difference, the better. If you and a friend do")
    print("not share at least 30 watched films, a score will")
    print("not be computed.")
    print("=====================================================\n")

    # prompt for info
    file_name = ""
    username = input("Enter your Letterboxd username: ")
    print("Scraping friend data...\n")
    friends = sc.scraper(username)

    print("Computing scores...\n")
    # compute commonality for each friend
    results = commonality(current_user, friends)

    # write report
    print("Generating report...\n")
    gr.generate_report(results, current_user)

    print("Done! View the report in the current directory!")
Example #11
def BFS(root_link):
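    """Breadth-first traversal: scrape each page for outgoing links, build a tree
    of Nodes rooted at root_link, and print what each node evolves into."""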

    q = Queue()
    seen = set()
    explored = []

    base_link = root_link[:root_link[:root_link.rfind('/')].rfind('/')]
    root_title = root_link[root_link.rfind('/') + 1:]
    root_node = Node(title=root_title, link=root_link)
    seen.add(root_title)
    q.put(root_node)

    while not q.empty():
        curr = q.get()
        next_links = scraper(curr.link)
        for link in next_links:
            if link[1] not in seen:
                next_node = Node(link=base_link + link[0],
                                 title=link[1],
                                 parent=curr)
                curr.add_child(next_node)
                q.put(next_node)
                seen.add(next_node.title)
        explored.append(curr)
    for node in explored:
        print("{} evolves into {}".format(node.title, node.list_children()))
Example #12
    def run(self):
        while True:

            tbd_url = self.frontier.get_tbd_url()

            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            delta = datetime.timedelta(seconds=.5)
            split = urlparse(tbd_url).netloc.split('.')
            #extract domain from url (does not account for toay.blah./blah/blah/)
            domain = split[-3] + '.' + split[-2] + '.' + split[-1]
            print("DOMAIN: " + domain)
            # if we've accessed tbd_url domain within 500ms then sleep
            #   sleep(500ms)
            if domain in last_seen and (
                    datetime.datetime.now() - last_seen[domain] < delta):
                print("====SLEEPING====")
                time.sleep(.5)
            # store tbh_url accessed at current time.
            last_seen[domain] = datetime.datetime.now()

            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                             f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
Example #13
    def run(self):
        i = 0
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            try:
                resp = download(tbd_url, self.config, self.logger)
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")
                scraped_urls = scraper(tbd_url, resp, self.state)
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)

            except HTTPError as err:
                self.logger.error(f"Downloaded {tbd_url}, hitting error {err}")

            self.frontier.mark_url_complete(tbd_url)
            if i % 1000 == 0:
                print(self.state['longest_page'])
                print_freq(self.state['word_rank'], 50)
                for domain, count in self.state['sub_domains'].items():
                    print(domain, count)
                self.frontier.print_saved()

            i += 1
Example #14
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                printStats()
                break
            
            #We will ignore any network exceptions and retry.
            startTime = time.time()
            resp = None
            hasFailed = False
            while resp is None:
                try:
                    resp = download(tbd_url, self.config, self.logger)
                except Exception as ex:
                    hasFailed = True
                    print(f"{ex}\nRetrying in 60 sec.")
                    time.sleep(60)
            if hasFailed:
                with open("server-outages.rtf", "a+") as err:
                    err.write(f"Server outage from: {startTime} to: {time.time()} duration: {round(time.time() - startTime)} sec.\n")

            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp)
            if scraped_urls is not None:
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
Example #15
    def run(self):
        while True:
            try:
                tbd_url = self.frontier.get_tbd_url()
                if not tbd_url:
                    self.logger.info("Frontier is empty. Stopping Crawler.")
                    final_report()
                    break

                resp = download(tbd_url, self.config, self.logger)
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")
                scraped_urls = scraper(tbd_url, resp, self.logger)

                # Report the stats every so often
                self.sites_crawled += 1
                if self.sites_crawled >= 100:
                    self.sites_crawled = 0
                    self.logger.info(
                        "Downloaded 100 sites. Generating a report-so-far")
                    final_report()

                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
            except Exception:
                # If the crawler runs into any exception, spit out the final report before re-raising the exception
                self.logger.info(
                    "Worker caught an exception. Generating final report before exit."
                )
                final_report()
                raise
Example #16
def main():
    scraper = sp.scraper()
    scraper.config_user("lukezim5", "playlist-read-private",
                        "3035253db48b41bc9d5054646ff3cac5",
                        "163471d7285c4fe58fb90573b5042bc6")
    scraper.get_featured_playlists()
    return
Example #17
def main():
    """
    Demonstrates capabilities of linkedlist class
    """

    scraper.scraper()

    # make the nodes from the .csv file and put them into a linked list
    with open('output.csv', 'r') as csv_file:   
        lists = LinkedList.LinkedList()

        for row in csv_file:
            state_text = repr(row.strip())
            lists.new_node(state_text)

    lists.print_list()

    # searches for list values with matching first letter
    has_state = False
    still_search = True

    while still_search:
        search_for = raw_input("\nEnter a letter to narrow search "
            "(1: exit, 0: whole list): ")
        print " "

        if search_for == "0":
            lists.print_list()

        elif search_for == "1":
            print "Ending search.\n"
            still_search = False

        else:
            # print only the states beginning with the user-entered letter
            current_node = lists.head

            while current_node.next != None:
                if current_node.cargo[0] == search_for.upper():
                    print current_node.cargo
                    has_state = True
                current_node = current_node.next

            if has_state == False:
                print "No state or territory found."

        has_state = False
Example #18
def main():
    print('Glassdoor Job Scraper.')
    job_title = input('Job search input: ')
    location = input('Location: ')
    while True:
        outcome = scraper(job_title, location, GD_USERNAME, GD_PASSWORD)
        if outcome:
            break
Example #19
def scraperurl():
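    """Run the scraper, upsert its result into the disease_collection Mongo
    collection, and redirect to /data."""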
    # collect dict results from scraper
    JSON = scraper()
    # establish collection object
    collection = mongo.db.disease_collection
    # upsert json to collection
    collection.update({}, JSON, upsert=True)
    return redirect('/data', code=302)
Example #20
def download():
    url = 'https://www.proxynova.com/proxy-server-list/elite-proxies/'
    soup = scraper(url)
    print(soup)
    
    with open('soup_file.txt', 'w') as f:
        json.dump(str(soup.prettify()), f)
        
    print('Success')
Example #21
	def handle(self, *args, **options):
		
		url = options['url']
		
		scraper.scraper(url, scraper_op)

		for i in golfers:
			# print first
			try:
				g = Golfer.objects.get(name=i)
			except Golfer.DoesNotExist:
				print i + " does not exist. Creating..."
				g = Golfer(name=i)
				g.save()

		name = raw_input('Name of event?: ')
		event = Event(name=name, active=True)
		event.save()
Example #22
def scraperCallBack():
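    """GUI callback: run the scraper, merge the results, refresh the overview,
    and notify the user with a message box when the update finishes."""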
    print("GUI: executing scraper")
    stockList = scraper()
    print("GUI: executing merger")
    merger(stockList)
    print("GUI: executing overview update")
    overviewUpdate(stockList)
    print("GUI: Update request finished")
    messagebox.showinfo("Progress", "Update finished")
Example #23
def main():
    args = arguments()
    browser = args.browser
    if browser.lower() == 'c' or browser.lower() == 'chrome':
        driver = driver_chrome()
    elif browser.lower() == 'f' or browser.lower() == 'firefox':
        driver = driver_firefox()
    else:
        raise ValueError("browser must be 'c'/'chrome' or 'f'/'firefox'")
    print('\n-- RECLAME AQUI SCRAPER --')

    file = args.file
    id_page = args.id
    pages = args.pages

    conn, cursor = db_conn()

    coletor = url_collector(driver, file, id_page, pages, conn, cursor)
    scraper(driver, coletor, id_page, conn, cursor)
    driver.quit()
Example #24
def homepage():
    st.title("Enter Amazon product url to fetch data")
    url = st.text_input("")
    if st.button('Fetch Data'):
        all_reviews_df, product_title = scraper.scraper(url)
        if all_reviews_df is not None:
            st.dataframe(all_reviews_df.head())
            title = preprocessing.product_name(product_title)
            all_reviews_df.to_csv(f"./Amazon Reviews/{title}.csv")
            preprocessing.clean_data(all_reviews_df, title)
Example #25
    def run(self):
        try:
            while True:
                self._lock()
                try:
                    tbd_url = self.frontier.get_tbd_url()
                    if tbd_url:
                        self.threads_in_processing.add(self.worker_id)
                    elif tbd_url is None and len(
                            self.threads_in_processing) == 0:
                        self.logger.info(
                            f"Frontier is empty. Stopping the Worker: {self.worker_id}"
                        )
                        break
                finally:
                    self._unlock()

                if tbd_url is None or tbd_url == "":
                    time.sleep(0.05)
                    continue

                self.logger.info(
                    f"Worker: {self.worker_id} Downloading: {tbd_url}")
                resp = download(tbd_url, self.config, self.logger)
                if resp.raw_response is None and resp.error.startswith(
                        "EMPTYCONTENT"):
                    self.logger.error(f"{resp.error}, status <{resp.status}>")

                self.logger.info(
                    f"Worker: {self.worker_id} Downloaded : {tbd_url}, status <{resp.status}>"
                )
                scraped_urls = scraper(tbd_url, resp)

                new_urls_added = 0
                self._lock()
                try:
                    for scraped_url in scraped_urls:
                        if (self.frontier.add_url(scraped_url)):
                            new_urls_added += 1
                    self.frontier.mark_url_complete(tbd_url)
                finally:
                    self._unlock()
                    self.threads_in_processing.remove(self.worker_id)

                self.logger.info(
                    f"Worker: {self.worker_id}, Added: {new_urls_added}, Remaining: {self.frontier.count_tbd_urls()}"
                )
        except BaseException:
            self.logger.exception(
                f"Unexpected exception in Worker: {self.worker_id}")
        finally:
            if self.worker_id in self.threads_in_processing:
                self.threads_in_processing.remove(self.worker_id)
            self.logger.info(f"Worker: {self.worker_id} Stopped")
Example #26
def pageScrapper(html, index):
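  """Collect product-name -> link pairs from a results page, write them to
  data/pageLinks<index>.csv and data/pageLinks.txt, then pass the .txt file
  to scraper()."""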
  soup = bs(html, 'lxml')
  targetDivs = soup.findAll('div', {'class': 'a-section a-spacing-medium'})
  print(len(targetDivs))
  links = {}
  for div in targetDivs:
    itemLink = div.find('a', {'class' : 'a-link-normal a-text-normal'})
    itemName = div.find('span', {'class' : 'a-size-medium a-color-base a-text-normal'})
    links[itemName.text] = itemLink.get('href')

  file1 = open("data/pageLinks"+str(index)+".csv", "w")
  file2 = open("data/pageLinks.txt", "w")
  file1.write('"Product Name","Link",\n')
  for key in links:
    fileEntry = '"'+key+'"'+','+'"'+links[key]+'"'+'\n'
    file1.write(fileEntry)
    file2.write(key + '~' + links[key] + '\n')
  file1.close()
  file2.close()
  print('\ncreated a file data/pageLinks'+str(index)+'.csv\n')
  scraper("pageLinks.txt")
Example #27
def check_source(source, article):
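    """Re-scrape the article's link and assert that the extracted text matches
    the stored full_text."""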

    
    results = scraper.scraper(article['link'])

    if results is None:
        text1 = None
    else:
        text1 = results[0]

    text2 = article['full_text']

    assert text1 == text2
Example #28
def animate(i):
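    """Animation callback: re-scrape the live exchange-rate page, persist it to
    data.json, and redraw the GBP/USD line chart."""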
    scraper = sc.scraper()
    site = scraper.scrapeWebsite(
        'https://www1.oanda.com/currency/live-exchange-rates/')
    parsed = scraper.parserHTML(site)
    scraper.writer(parsed, 'data.json')
    df = reader.readFile('data.json')
    newDF = reader.createData(df)
    a.clear()
    max = newDF[['GBP_USD']].max()
    min = newDF[['GBP_USD']].min()
    a.set_title('Current $ to £ Exchange Rates')
    newDF.plot(kind='line', x='TIME', y='GBP_USD', ax=a, ylim=[1.30, 1.32])
Example #29
def main():
    path = "C:\\Users\\Arthur\\Documents\\Studies\\ParisDauphineUniversity\\Master203\\Newsletter\\Scraper\\data\\"
    price_list = []
    for ticker, sec in TICKERS.items():
        print(ticker)
        price_list.append(scraper(ticker, sec))
    price_list = clean_data(price_list)
    ComputingTable = compute_data(price_list, path)
    TableStr = format_table(
        ComputingTable,
        "C:\\Users\\Arthur\\Documents\\Studies\\ParisDauphineUniversity\\Master203\\Newsletter\\Scraper\\pictures\\"
    )
    return TableStr
Example #30
def email(item, min_price, max_price, id_list, chat_id):
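    """Scrape ads matching the item/price filters and send each result to the
    Telegram chat via the bot; the original email-delivery code is kept
    commented out below."""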
    # Create the HTML version of your message
    # html = """\
    # <html>
    #  <body>"""
    html = ""
    items = scraper.scraper(item, min_price, max_price, id_list)
    if len(items) != 0:
        for item in items:
            # html = html + """<p><b>%s</b><br>
            #html = '<b>%s</b> %s %s ><a href="%s">Ad Link</a>' % (item.title, item.price, item.date, "http://" + item.url)
            title = item.title
            price = item.price
            date = item.date
            url = item.url
            title = title.replace("*", " ")
            date = date.replace("< ", "")
            text = '<b>' + title + '</b>' + '\n' + price + '\n' + date + '\n' + '<a href="http://' + url + '">Ad link</a>'
            bot.send_message(chat_id=chat_id,
                             text=text,
                             parse_mode=telegram.ParseMode.HTML)

        # html = html + """</body></html>"""

        #    sender_email = senderE
        #    receiver_email = receiverE
        #    password = password

        # message = MIMEMultipart("alternative")
        # message["Subject"] = "Kijiji Ad Alert!!!!"
        # message["From"] = sender_email
        # message["To"] = receiver_email

        # Turn these into plain/html MIMEText objects
        # part1 = MIMEText(text, "plain")
        # part2 = MIMEText(html, "html")

        # Add HTML/plain-text parts to MIMEMultipart message
        # The email client will try to render the last part first
        # message.attach(part1)
        # message.attach(part2)

        # Create secure connection with server and send email
        # context = ssl.create_default_context()
        # with smtplib.SMTP_SSL("smtp.gmail.com", 465, context=context) as server:
        # server.login(sender_email, password)
        # server.sendmail(
        # sender_email, receiver_email, message.as_string()
        # )

    return None
Example #31
 def run(self):
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             break
         resp = download(tbd_url, self.config, self.logger)
         self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                          f"using cache {self.config.cache_server}.")
         scraped_urls, urlInfo, token_list = scraper(tbd_url, resp)
         for scraped_url in scraped_urls:
             self.frontier.add_url(scraped_url)
         self.reporter.add_words(tbd_url, token_list)
         self.frontier.mark_url_complete(tbd_url, urlInfo)
Example #32
 def run(self):
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             break
         resp = download(tbd_url, self.config, self.logger)
         self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                          f"using cache {self.config.cache_server}.")
         scraped_urls = scraper(tbd_url, resp)
         for scraped_url in scraped_urls:
             self.frontier.add_url(scraped_url)
         self.frontier.mark_url_complete(tbd_url)
         time.sleep(self.config.time_delay)
Example #33
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
  
                print("************* REPORT ****************")
                print()
                print("Team Members:")
                print("    Kamaniya Sathish Kumar (56361951)")
                print("    Samhitha Tarra (69706915)")
                print("    Vani Anilkumar (36335618)")
                print()
                print("Number of Unique URLs:", scraper.count_unique_url)
                print()
                print("Longest URL:", scraper.longest_page)
                print("Number of Tokens in Longest URL:", scraper.num_words_longest_page)
                print()
                
                print("50 Most Common Words:")
                counter = 1
                for key, value in sorted(scraper.master_freq_dict.items(), key=lambda x: x[1], reverse=True)[:50]:
                    print(str(counter) + ". " + key + " (" + str(value) + ")")
                    counter += 1
                print()
                print("Subdomains in ics.uci.edu:")
                for tup, val in sorted(scraper.master_subdomain_dict.items(), key=lambda x: x[0]):
                    url_string = ""
                    url_string += tup[1] + "://" + tup[0] + ".ics.uci.edu,"
                    print(url_string, val)
                print()
                print("************* REPORT ****************")


                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper.scraper(tbd_url, resp)

            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
Example #34
def imageLookupLoop():
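  """Discover display hosts via SLP, pick search words (or pull from the
  archive), scrape matching images, and copy the images plus the chosen words
  into each host's cache directory."""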
  cacheDir = adGlobal.cacheDir;
  image_type = "Action"
  maxImagesPerHost = 5
  searchType = adGlobal.searchType
  syslog.syslog("search method: "+searchType)
  if hangDebug: syslog.syslog("Hang debug:"
		+__file__+" "
		+str(inspect.currentframe().f_lineno))
  hosts=[]

  services = subprocess.check_output(["slptool","findsrvs","service:artdisplay.x"]).split('\n');
  if len(services) == 0:
    syslog.syslog("no available services")
    return
  for s in services:
    if hangDebug: syslog.syslog("Hang debug:"
      +__file__+" "
      +str(inspect.currentframe().f_lineno))
    if slpDebug: syslog.syslog("slp s:"+s)
    loc=s.split(',');
    if loc[0] == '':
      continue
    if slpDebug: syslog.syslog("loc:"+str(loc))
    if hangDebug: syslog.syslog("Hang debug:"
      +__file__+" "
      +str(inspect.currentframe().f_lineno))
    attr=subprocess.check_output(["slptool","findattrs",loc[0]]);
    host={}
    host['ip']=loc[0].split("//")[1]
    host['hasPanel']=False
    host['isDispText']=False
    if attr.find("hasPanel") != -1:
      if slpDebug: syslog.syslog(str(host)+":host has panel");
      host['hasPanel']=True
    if attr.find("isDispText") != -1:
      if slpDebug: syslog.syslog(str(host)+":host is a text display");
      host['isDispText']=True
    if slpDebug: syslog.syslog("slp host"+str(host))
    hosts.append(host)
  if slpDebug: syslog.syslog("hosts:"+str(hosts))
  if hangDebug: syslog.syslog("Hang debug:"
    +__file__+" "
    +str(inspect.currentframe().f_lineno))
  images=[]
  choices=[]
  if searchType == "Archive":
    if hangDebug: syslog.syslog("Hang debug:"
      +__file__+" "
      +str(inspect.currentframe().f_lineno))
    vars=archive.getArchive();
    images=vars[0]
    choices=vars[1]
    if hangDebug: syslog.syslog("Hang debug:"
      +__file__+" "
      +str(inspect.currentframe().f_lineno))
  else:
    while len(images) < 20:
      images=[]
      choices=[]
      choices = words.getWords()
      images = scraper.scraper(choices[:])
      if len(images) == 1 and images[0] == "error":
        return
  syslog.syslog("select: "+choices[0]+" "+choices[1])
  copyList = {}
  if hangDebug: syslog.syslog("Hang debug:"
    +__file__+" "
    +str(inspect.currentframe().f_lineno))
  for h in hosts:
    copyList[h['ip']] = {}
    copyList[h['ip']]['image'] = []
    copyList[h['ip']]['flag'] = []
    copyList[h['ip']]['text'] = None
    if h['hasPanel'] or h['isDispText']:
      if dispDebug: syslog.syslog("doing has panel or disp text")
      if len(choices) < 2:
        syslog.syslog("WARNING, choices array not loaded")
      else: 
        textPath=cacheDir+"/"+adGlobal.textName
        adGlobal.mutex.acquire()
        f = open(textPath,'w')
        f.write(choices[0]+'\n')
        f.write(choices[1]+'\n')
        f.close();
        adGlobal.mutex.release()
        copyList[h['ip']]['text']=textPath 
        if debug: syslog.syslog( "textPath:"+textPath)

  hostCount=0
  if hangDebug: syslog.syslog("Hang debug:"
    +__file__+" "
    +str(inspect.currentframe().f_lineno))
  for image in images:
    if hosts[hostCount]['isDispText']:
      if dispDebug: syslog.syslog("skipping image store for:"+hosts[hostCount]['ip'])
      hostCount += 1
      if hostCount == len(hosts):
        hostCount=0

    if ( searchType != "Archive"):
      if hangDebug: syslog.syslog("Hang debug:"
        +__file__+" "
        +str(inspect.currentframe().f_lineno))
      raw_img=getRawImage(image)
      if hangDebug: syslog.syslog("Hang debug:"
        +__file__+" "
        +str(inspect.currentframe().f_lineno))
      if raw_img is None:
        syslog.syslog("raw_image = none " + str(image))
        continue;
      adGlobal.mutex.acquire()
      cntr = len([i for i in os.listdir(cacheDir) if image_type in i]) + 1
      adGlobal.mutex.release()
      if debug: syslog.syslog( str(cntr))
      imgPath=cacheDir + '/' + image_type + "_"+ str(cntr)+".jpg"
      adGlobal.mutex.acquire()
      f = open(imgPath, 'wb')
      f.write(raw_img)
      f.close()
      flgPath=cacheDir + '/' + image_type + "_"+ str(cntr)+".flg"
      f = open(flgPath, 'w')
      f.close()
      adGlobal.mutex.release()
      del raw_img
    else:
      adGlobal.mutex.acquire()
      try:
        cmd=["cp",image,cacheDir]
        if debug: syslog.syslog( "cmd:"+str(cmd))
        subprocess.check_output(cmd)
        imgPath=cacheDir+"/"+os.path.basename(image)
        i=os.path.basename(image)
        flgPath=cacheDir+"/"+image[:image.rindex(".")]+".flg"
        cmd=["touch",flgPath]
        if debug: syslog.syslog( "cmd"+str(cmd))
        subprocess.check_output(cmd)
      except subprocess.CalledProcessError, e:
        syslog.syslog("archive file copy problem: "+', '.join(cmd)+str(e.output))
        adGlobal.mutex.release()
        continue;
      finally:
Example #35
def parse_feed(url, debug=False, mode='streaming'):
    """
    params:  url - string for the location of an RSS feed

    returns:  writes all new articles in feed to mongo db with all relevant noozli data
    """

    feed = feedparser.parse(url)

    if len(feed['entries']) == 0:
        log.info("No items in feed: " + url)
        return None

    try:
        client = MongoClient('localhost', 27017)
    except Exception as e:
        log.warning('Failed to connect to mongo: ' + str(e))
        return None

    db = client.noozli


    if mode == 'prototype':
        collection = db.prototype
    else:
        collection = db.streaming

    
    if 'npr.org' in url:
        source = 'NPR - ' + feed['feed']['title']
    else:
        source = feed['feed']['title']

    # normalize fox news source
    if source == "FOXNews.com":
        source = "FOX News"

    # limit number of articles for test set
    if mode == 'prototype':
        if collection.find({'source': source}).count() > 100:
            log.info("Source '" + source + "' has " + str(collection.find({'source': source}).count()) + ' articles in database.  Skipping.')
            return None

    
    log.info('parsing feed ' + url)

    
    articles = []
    count = 0
    for item in feed['entries']:
        
        if debug and count == 100:
            break

        # don't normalize abc urls, they don't work
        if 'abcnews.go.com' in item['link']:
            rss_link = item['link']
        else:
            # normalize urls
            p_url = urlparse(item['link'])
            rss_link = p_url.scheme + "://" + p_url.netloc + p_url.path

        strip_regex = re.compile(r'index\.html|index\.shtml')
        rss_link = strip_regex.sub('', rss_link)

        # Business Insider occasionally has bloomberg articles, skip them
        if source == 'Business Insider' and 'bloomberg.com' in rss_link:
            continue

        article_id = hashlib.md5(str.encode(rss_link)).hexdigest()
        
        if collection.find({'links': rss_link}).count() > 0:
            log.debug('Already found "'+ source + '" article: ' + rss_link)
            continue

        title = item['title']

        try:
            if type(item['content']) == list:
                if len(item['content']) != 1:
                    log.info('RSS feed "content" field has more than 1 entry. ' + url)
                
                content = item['content'][0]['value']
            else:
                content = item['content']['value']

        except KeyError:
            try:
                content = item['summary']
            except KeyError:
                content = None

        try:
            author = item['author']
        except KeyError:
            author = None

        published = item['published']


        # add datetime stamp to data for ttl expiration
        date, zone = published.rsplit(' ', 1)
        try:
            published_dt = datetime.datetime.strptime(date, "%a, %d %b %Y %H:%M:%S")
        except ValueError:
            log.warning('unable to parse published time: ' + published)
            continue

        # add time zone awareness
        found_numerical_zone = False
        if len(zone) >= 5:
            search = re.search(r'(\+|\-)[0-9]+', zone)
            if search:
                found_numerical_zone = True
                zone = search.group()

        if zone == 'EDT' or zone == 'EST':
            published_tz = pytz.timezone('US/Eastern')
        elif zone == 'CDT' or zone == 'CST':
            published_tz = pytz.timezone('US/Central')
        elif zone == 'MDT' or zone == 'MST':
            published_tz = pytz.timezone('US/Mountain')
        elif zone == 'PDT' or zone == 'PST':
            published_tz = pytz.timezone('US/Pacific')
        elif zone == 'UTC' or zone == 'GMT':
            published_tz = pytz.timezone('UTC')        
        elif found_numerical_zone:
            zone = zone.rstrip('0')
            if len(zone) > 1:
                published_dt += datetime.timedelta(hours=int(zone))
        else:
            log.warning('unable to determine timezone: ' + published)
            continue

        if not found_numerical_zone:
            published_dt = published_tz.localize(published_dt)
            comp_dt = published_dt.astimezone(pytz.utc)
        else:
            comp_dt = published_dt
            
        # check if article is old, skip
        try:
            age = datetime.datetime.utcnow() - comp_dt
        except TypeError:
            utc_tz = pytz.timezone('UTC')
            age = utc_tz.localize(datetime.datetime.utcnow()) - comp_dt

        if age.seconds + age.days*86400 >= 2592000:
            log.debug('url ' + rss_link + ' more than 30 days old, skipping.')
            continue

        #
        # perform scrape (want to minimize this as much as possible, don't annoy hosts)
        #
        try:
            result = scraper.scraper(rss_link, debug)
        except UnicodeEncodeError:
            # sometimes finding non utf-8 unicode characters, for now skip
            result = None
            log.warning('non utf-8 character in '+rss_link)
    
        full_text = None
        og_url = None
        if result is not None:
            full_text = result[0]
            image_url = result[1]
            display_text = result[2]
            og_url = result[3]
            
        # if og:url exists redefine the link and article_id hash, some rss feeds have badly formatted urls
        og_link = None
        if og_url is not None:
            p_url = urlparse(og_url)
            og_link = p_url.scheme + "://" + p_url.netloc + p_url.path
            
            strip_regex = re.compile(r'index\.html|index\.shtml')
            og_link = strip_regex.sub('', og_link)

            
            # recompute article id and check again for its existence in database (based on og:url tag)
            article_id = hashlib.md5(str.encode(og_link)).hexdigest()


        if og_link is None: 
            links = [rss_link]
        elif rss_link != og_link:
            links = [og_link, rss_link]
        else:
            links = [og_link]                

        if full_text is not None:          

            if len(full_text) < 250:
                if og_url is not None:
                    log.debug('Text too short on ' + og_link + '.  Skipping.')
                else:
                    log.debug('Text too short on ' + rss_link + '.  Skipping.')
                continue

            # check to see if article is in DB based on the text 
            #   handles sources that get resyndicated (AP, CNN, ABC, etc.)
            matching_articles = collection.find({'full_text': full_text})
            if matching_articles.count() > 0:
                if og_url is not None:
                    log.debug('Already found "'+ source + '" article by text: ' + og_link)
                else:
                    log.debug('Already found "'+ source + '" article by text: ' + rss_link)

                # if text of article was already found, associate all links that point to that same article
                for art in matching_articles:
                    new_links = list(art['links'])
                    if rss_link not in new_links:
                        new_links.append(rss_link)
                        
                    if og_link != rss_link and og_link not in new_links:
                        new_links.append(og_link)

                    if len(new_links) > len(art['links']):
                        collection.update( {'_id': art['_id']}, {'$set': {'links': new_links}} )
                
                continue

            # check og_link after checking if text was found in db, so we can add additional rss_links
            if collection.find({'links': og_link}).count() > 0:
                log.debug('Already found "'+ source + '" article: ' + og_link)
                continue

            #if mode == 'prototype':
            topics = textrazor_categorization(full_text)
            #else:
            #    topics = None

            if image_url is not None:
                server_image_url = resize_and_store_image(image_url, article_id)
            else:
                server_image_url = None

            article_data = {'article_id': article_id,
                            'title': title,
                            'author': author,
                            'summary': content,
                            'full_text': full_text,
                            'display_text': display_text,
                            'image': server_image_url,
                            'links': links,
                            'published': published,
                            'timestamp': published_dt,
                            'topics': {'text_razor': topics },
                            'source': source
                            }

        
            if debug:
                articles.append(article_data)
            else:
                
                log.debug('Adding "'+ source + '" article: ' + links[0])
                collection.insert(article_data)

            count += 1

        else:
            if og_url is not None:
                log.warning("error:  did not extract any text from - " + og_link)
            else:
                log.warning("error:  did not extract any text from - " + rss_link)

            # append empty
            if debug:
                article_data = { 'links': links, 'full_text': None, 'display_text': None, 'source': source }
                articles.append(article_data)

            # in streaming mode, append all links that failed to scrape to reduce website access
            if mode == 'streaming':

                if og_url is None:
                    links = [rss_link]
                else:
                    links = [rss_link, og_link]

                article_data = {'article_id': article_id,
                                'full_text': None,
                                'display_text': None,
                                'links': links,
                                'source': source,
                                'timestamp': published_dt
                                }
                
                if not debug:
                    collection.insert(article_data)
            
            
    log.info('Added ' + str(count) + ' articles from ' + source)

    if debug:
        return articles
Example #36
from scraper import scraper
import json
import os
import time

def diff(A,B):
    if len(A.keys()) > len(B.keys()):
        A,B = B,A
    for k in A.keys():
        if A[k]!=B[k]:
            print '--',k,A[k]
            print '++',k,B[k]
            print ''

while True:
    try:
        if os.path.exists(os.path.join(os.getcwd(),'database.json')):    
            with open('database.json','r') as fin:
                old_database = json.load(fin)
            new_database = scraper(USERNAME, PASSWORD)
            if new_database != old_database:
                diff(old_database, new_database)
                with open('database.json', 'w') as fout:
                    json.dump(new_database, fout)
        else:
            new_database = scraper(USERNAME, PASSWORD)
            with open('database.json', 'w') as fout:
                json.dump(new_database, fout)
        time.sleep(60*60)
    except Exception as e:
        print e
Example #37
import scraper

gi = scraper.scraper('instagram')	#source [e.g. "google", "twitter", "instagram"]
gi.set_outputDir('output/')	#output directory on local disk
gi.set_downloadLimit(10)	#specified number of images

gi.scrape( "design" )	#arq: tag, search query
gi.stop()	#close the browser once scraping is done
Example #38
 def __init__(self, booksCallback=None):
     self.database = connector.connector(booksCallback)
     self.scraper = scraper.scraper()