def process_path(start_url: str, target: str, relation_table: RelationTable, visit_table: VisitTable, path: list):
    new_path = path.copy()
    new_path.append(start_url)
    if start_url == target:
        return new_path
    # Limit the search to paths of at most 5 URLs
    if len(new_path) >= 5:
        return None
    links = scraper.get_links(start_url)
    if len(links) == 0:
        return None
    print("\t" * (len(new_path) - 1) + start_url)
    for link in links:
        relation_table.add_relation(start_url, link)
    visit_table.visit(start_url)
    neighbors = relation_table.get_unvisited_neighbors(start_url, visit_table)
    # If the target is a direct neighbor, we are done
    for neighbor in neighbors:
        if neighbor == target:
            new_path.append(neighbor)
            return new_path
    if len(neighbors) == 0:
        return None
    # Recursing from here would exceed the depth limit, so give up on this branch
    if len(new_path) == 4:
        return None
    for neighbor in neighbors:
        next_path = process_path(neighbor, target, relation_table, visit_table, new_path)
        if next_path is not None:
            return next_path
    return None
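# A hypothetical driver for process_path, assuming RelationTable and VisitTable can be
# constructed with no arguments; the real constructors and start/target URLs may differ.
if __name__ == "__main__":
    relation_table = RelationTable()
    visit_table = VisitTable()
    found = process_path("https://example.com/start", "https://example.com/target",
                         relation_table, visit_table, [])
    if found is None:
        print("No path found within the depth limit")
    else:
        print(" -> ".join(found))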
import random
import time

from redis import Redis

# config and the scraper helpers (get_links, get_story, publish_story) are assumed
# to come from this project's own modules.


def main():
    browser = config.connect_browser()
    db = Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)
    # To avoid looking like a bot, gather every candidate link up front and then
    # shuffle them; the randomized crawl order looks much less bot-like.
    links = []
    for name, job in config.WORK.items():
        new_links = []
        try:
            print(job['url'])
            new_links = get_links(browser, job['url'], job['link_regex'])
        except Exception as e:
            # Browser sessions get a little funky; refresh the connection in that case
            browser = config.connect_browser()
            print(e)
        links += [(name, link) for link in new_links]
    # Deduplicate first, then shuffle, so the randomized order is preserved
    links = list(set(links))
    random.shuffle(links)
    channel = config.setup_mq(config.QUEUE_NAME)
    for link in links:
        # Avoid unnecessary duplicate work (assumes the story URL is the Redis key)
        if db.exists(link[1]):
            continue
        # Just in case, wait a random interval between requests
        time.sleep(random.randint(1, 8))
        try:
            print(link)
            story = get_story(browser, link[1], config.WORK[link[0]]['story_xpath'])
        except Exception as e:
            print(e)
            browser = config.connect_browser()
            continue
        # Quick filtering to avoid invalid stories
        if len(story['story']) == 0:
            continue
        # Connections are often lost; reconnect in those cases
        try:
            publish_story(channel, config.QUEUE_NAME, story)
        except Exception:
            channel = config.setup_mq(config.QUEUE_NAME)  # Refresh the connection
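# Minimal entry point, assuming this module is run directly as the crawl script.
if __name__ == "__main__":
    main()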
def scrape(h_tag, link_tag, text_tag, target_url):
    """Scrape headings, text blocks, and links from a website and store each result."""
    url = target_url
    target = scraper.create_target(target_url)
    page = scraper.get_page(target)
    soup = scraper.get_soup(page)
    headings = scraper.get_heading(soup, h_tag)
    texts = scraper.get_texts(soup, text_tag)
    # Record the scrape itself, then one result row per heading, text block, and link
    db_functions.create_scrape(url, h_tag, text_tag, link_tag)
    for h in headings:
        db_functions.create_result(url, h_tag, h, '', link_tag)
    for t in texts:
        db_functions.create_result(url, text_tag, '', t, link_tag)
    links = scraper.get_links(soup, link_tag)
    for link in links:
        db_functions.create_result(url, link_tag, '', '', link)
    db_functions.print_scrape(url)
    db_functions.print_records(url)
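# A hypothetical invocation of scrape(); the tag names and URL here are illustrative only.
if __name__ == "__main__":
    scrape('h2', 'a', 'p', 'https://example.com')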
import pickle

import numpy as np
from fabulous.color import highlight_green

from scraper import get_links, parse_page_data

links = get_links()
data = parse_page_data(links)
with open("./model/finalized_model.sav", 'rb') as model_file:
    model = pickle.load(model_file)
y = model.predict(data)
indices = np.argwhere(y == np.max(y)).flatten()
print(highlight_green("\nArticles most likely to go viral:\n\n"))
for rank, idx in enumerate(indices):
    print('{}.'.format(rank + 1), end=' ')
    print(links[idx])
from scraper import scrape_box_scores, get_links

year = input('Please enter the year in this format: xxxx\n')
month = input('Please enter the month in this format: x\n')
day = input('Please enter the day in this format: x\n')
# Or, to run this as a daily script that grabs the previous night's box scores:
# from datetime import date, timedelta
# yesterday = date.today() - timedelta(1)
# yesterdaytuple = yesterday.timetuple()
# year = yesterdaytuple[0]
# month = yesterdaytuple[1]
# day = yesterdaytuple[2]
links = get_links(year, month, day)
scrape_box_scores(links, year, month, day)
#!/usr/bin/env python3
import scraper
import downloader
from sys import argv, exit

if __name__ == "__main__":
    links = scraper.get_links(argv[1])
    # Each entry is (quality label, download URL)
    for i, link in enumerate(links, start=1):
        print(i, link[0])
        print(link[1])
    inp = int(input("Select Quality: "))
    if inp < 1 or inp > len(links):
        print("Invalid selection!")
        exit(1)
    link = links[inp - 1][1]
    downloader.download_one(link)