import scraper  # project-local module that provides get_links (not shown here)


# RelationTable and VisitTable are defined elsewhere in the source project;
# string annotations keep this excerpt importable without them in scope.
def process_path(start_url: str, target: str, relation_table: "RelationTable",
                 visit_table: "VisitTable", path: list):
    new_path = path.copy()
    new_path.append(start_url)
    if start_url == target:
        return new_path
    if len(new_path) >= 5:
        return None
    links = scraper.get_links(start_url)
    if len(links) == 0:
        return None
    print("\t" * (len(new_path) - 1) + start_url)
    for link in links:
        relation_table.add_relation(start_url, link)
    visit_table.visit(start_url)
    neighbors = relation_table.get_unvisited_neighbors(start_url, visit_table)
    for neighbor in neighbors:
        if neighbor == target:
            new_path.append(neighbor)
            return new_path
    if len(neighbors) == 0:
        return None
    if len(new_path) == 4:
        return None
    for neighbor in neighbors:
        next_path = process_path(neighbor, target, relation_table,
                                 visit_table, new_path)
        if next_path is not None:
            return next_path
    return None  # every neighbor was explored without reaching the target
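
A minimal sketch of the RelationTable and VisitTable interface that
process_path appears to assume. These stubs are inferred from the calls
above, not the project's real classes; the visited() method name in
particular is a guess, since only visit() ever appears in the excerpt:

class RelationTable:
    def __init__(self):
        self._relations = {}

    def add_relation(self, src, dst):
        self._relations.setdefault(src, set()).add(dst)

    def get_unvisited_neighbors(self, src, visit_table):
        return [n for n in self._relations.get(src, set())
                if not visit_table.visited(n)]


class VisitTable:
    def __init__(self):
        self._seen = set()

    def visit(self, url):
        self._seen.add(url)

    def visited(self, url):
        return url in self._seen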
Example #2
import random
import time

from redis import Redis

import config  # project-local settings module

# get_links, get_story, and publish_story are project-local helpers whose
# module is not shown in this excerpt.

def main():
    browser = config.connect_browser()
    db = Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)

    # To look less like a bot, collect every candidate link up front and then
    # shuffle them; the randomized request order reads much less like a crawler.
    links = []
    for name, job in config.WORK.items():
        new_links = []

        try:
            print(job['url'])
            new_links = get_links(browser, job['url'], job['link_regex'])
        except Exception as e:
            # Browser sessions get a little funky; in that case, refresh the connection
            browser = config.connect_browser()
            print(e)

        links += [(name, link) for link in new_links]

    random.shuffle(links)

    channel = config.setup_mq(config.QUEUE_NAME)
    for link in set(links):
        # Avoid doing unnecessary duplicate work. The URL is the dedup key;
        # redis-py can't use a (name, link) tuple as a key.
        if db.exists(link[1]):
            continue

        # Pause a random few seconds between requests, just in case
        time.sleep(random.randint(1, 8))

        try:
            print(link)
            story = get_story(browser, link[1],
                              config.WORK[link[0]]['story_xpath'])
        except Exception as e:
            print(e)
            browser = config.connect_browser()
            continue

        # Quick filtering to avoid invalid stories
        if len(story['story']) == 0:
            continue

        # Connections are often lost; reconnect when that happens
        try:
            publish_story(channel, config.QUEUE_NAME, story)
        except Exception:
            channel = config.setup_mq(config.QUEUE_NAME)  # refresh the connection
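
The reconnect-on-failure pattern above appears three times (twice for the
browser, once for the MQ channel). One way it could be factored out; this
helper is a sketch, not part of the original code, and make_resource stands
for whichever factory rebuilds the stale connection (config.connect_browser
or config.setup_mq here):

def call_with_reconnect(action, make_resource, resource, attempts=2):
    """Run action(resource); on failure, rebuild the resource and retry."""
    result = None
    for _ in range(attempts):
        try:
            result = action(resource)
            break
        except Exception as e:
            print(e)
            resource = make_resource()  # refresh the stale connection
    return resource, result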
Example #3
import scraper        # project-local scraping helpers (not shown)
import db_functions   # project-local persistence helpers (not shown)

def scrape(h_tag, link_tag, text_tag, target_url):
    """A Program to Scrape a website"""
    url = target_url
    target = scraper.create_target(target_url)
    page = scraper.get_page(target)
    soup = scraper.get_soup(page)
    headings = scraper.get_heading(soup, h_tag)
    texts = scraper.get_texts(soup, text_tag)
    db_functions.create_scrape(url, h_tag, text_tag, link_tag)
    # create_result slots that don't apply to a record are passed as ''.
    for h in headings:
        db_functions.create_result(url, h_tag, h, '', link_tag)
    for t in texts:
        db_functions.create_result(url, text_tag, '', t, link_tag)
    links = scraper.get_links(soup, link_tag)
    for link in links:
        db_functions.create_result(url, link_tag, '', '', link)
    db_functions.print_scrape(url)
    db_functions.print_records(url)
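
# A hypothetical invocation; the tag names and URL below are placeholders:
# scrape('h2', 'a', 'p', 'https://example.com')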
Example #4
import pickle
import numpy as np
from fabulous.color import highlight_green

from scraper import get_links, parse_page_data

links = get_links()
data = parse_page_data(links)

with open("./model/finalized_model.sav", 'rb') as f:  # close the file promptly
    model = pickle.load(f)
y = model.predict(data)

# argwhere collects every article tied for the highest predicted score.
indices = np.argwhere(y == np.max(y)).flatten()
print(highlight_green("\nArticles most likely to go viral:\n\n"))

for rank, idx in enumerate(indices, start=1):
    print('{}.'.format(rank), end='    ')
    print(links[idx])
Example #5
from scraper import scrape_box_scores, get_links

year = input('Please enter the year in this format: xxxx\n')
month = input('Please enter the month in this format: x\n')
day = input('Please enter the day in this format: x\n')

# Or, if you want to run this as a daily script that grabs the previous
# night's box scores:
# from datetime import date, timedelta
# yesterday = date.today() - timedelta(1)
# yesterdaytuple = yesterday.timetuple()
# year = yesterdaytuple[0]
# month = yesterdaytuple[1]
# day = yesterdaytuple[2]

links = get_links(year, month, day)
scrape_box_scores(links, year, month, day)
Example #6
#!/usr/bin/env python3
import scraper
import downloader
from sys import argv, exit


if __name__ == "__main__":
    links = scraper.get_links(argv[1])
    for i, link in enumerate(links, start=1):
        print(i, link[0])
        print(link[1])
    inp = int(input("Select Quality: "))
    if inp < 1 or inp > len(links):
        print("Invalid selection!")
        exit(1)
    link = links[inp-1][1]
    downloader.download_one(link)
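
# Hypothetical usage; this script's real filename is not shown in the excerpt:
#     python3 script.py <page-url>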