def execute(self):
     myScraper = Scraper(self.url,self.matchingDict)
     result = myScraper.scrape()
     if self.target is None:
         return result
     else:
         self.target(result, self.url)
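The optional target argument is invoked as target(result, self.url) when it is set. A minimal, hypothetical callback that satisfies that call signature (the name save_result is not from the original project):

def save_result(result, url):
    # hypothetical sink for the scraped data; matches the target(result, url) call above
    print("scraped {0}: {1}".format(url, result))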
Example #2
def create_job():
    worker = Scraper(FREQUENCY, TAB, UNIT, FINAL_YEAR)
    while True:
        item = q.get()
        worker.do_work(item)
        print(item + ' is downloaded | ' + str(q.qsize()) + ' item(s) left')
        q.task_done()
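create_job assumes a shared work queue q and the constants FREQUENCY, TAB, UNIT and FINAL_YEAR defined elsewhere in the original module. A minimal sketch of how such a worker is typically wired up with threads, under those assumptions (the thread count and item names are illustrative):

import queue
import threading

q = queue.Queue()               # assumed shared queue used by create_job()

for _ in range(4):              # illustrative number of worker threads
    t = threading.Thread(target=create_job)
    t.daemon = True             # let the process exit once the queue is drained
    t.start()

for item in ['series-a', 'series-b']:   # hypothetical work items
    q.put(item)

q.join()                        # block until every item has been marked task_done()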
Example #3
def save_info_from(href, data_dir):

    # initialize child destination
    scrap = Scraper(href)
    dest = scrap.create_destination()
    dest.children_href = scrap.get_children()

    # check if we have already crawled this area
    OBJECT_OUTFILE = data_dir + dest.nickname + '.pickle'
    if os.path.exists(OBJECT_OUTFILE):
        print dest.nickname + ' has already been crawled'
        pass
    else:
        if not os.path.isdir(os.path.dirname(OBJECT_OUTFILE)):
            os.makedirs(os.path.dirname(OBJECT_OUTFILE))

        # traverse tree of areas-->routes
        all_dest = traverse(dest)
        # returns destination object

        # write out to file.. for viz??
        BIG_JSON = data_dir + dest.nickname + '.json'
        with open(BIG_JSON, 'w+') as dump:
            flat = json.dumps(all_dest, default=lambda o: o.__dict__)
            dump.write(flat)

        # save destination object as pickle
        BIG_PICKLE = data_dir + dest.nickname + '.pickle'
        with open(BIG_PICKLE, 'wb') as handle:
            pickle.dump(all_dest, handle)

        flourish = '<<<'+'-'*25
        print flourish + dest.nickname + flourish[::-1]
        print
Example #4
def main():
    uid = str(uuid.uuid4())

    print "Creating Scraper() instance ..."

    scraper = Scraper(uid)
    scraper.run()

    print "Running tests ..."

    # typelink()
    test_typelink(scraper)    

    # checkmatch()
    test_checkmatch(scraper)

    # getpagelinks
    test_getpagelinks(scraper)

    # followlinks()
    test_followlinks(scraper)

    # get scraper status
    test_getstatus(scraper)

    scraper.stop()

    print "Done."
Example #5
def scrape():
    scraper = Scraper(**get_creds())

    #   Fetch usage info re: boosters.
    le = UsageDataPoint(
        time=datetime.datetime.utcnow(),
        **scraper.fetch_booster_usage()
    )

    db_session.add(le)
    yield le

    #   Fetch latest transactions and put these in the DB,
    #   but only if we don't already have them.
    for transaction in scraper.fetch_most_recent_transactions():
        existing = KoodoTransaction \
            .query \
            .filter_by(koodo_id=transaction['koodo_id']) \
            .first()
        if not existing:
            kt = KoodoTransaction(**transaction)
            db_session.add(kt)
            yield kt

    db_session.commit()
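Since scrape() is a generator, the trailing db_session.commit() only runs after a caller has consumed the whole iterator; abandoning it early leaves the session uncommitted. A small usage sketch, assuming the module-level objects used above:

for record in scrape():
    # each yielded object has already been added to db_session
    print(record)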
Example #6
def scrape(request, tvdb_id):
    """
    Takes a scrape request, constructs a Scraper object and performs a scrape for the show if it hasn't been scraped
    before or hasn't been scraped within the last :math:`x` days (where :math:`x` is the number of days specified by
    RESCRAPE_AFTER). Otherwise if the show exists and has been scraped within the last :math:`x` days redirect to the
    appropriate show page

    :param request: A Scrape request object.
    :param tvdb_id: The id of the tv show to be scraped (or shown)
    :return: A HttpResponse Object containing the page of the show requested.
    """

    # Determine if the show already exists in the datastore
    q = TVShow.get_by_key_name(tvdb_id)

    if users.is_current_user_admin() and 'force' in request.GET and request.GET[
        'force'] == '1':
        Scraper(tvdb_id, rescrape=True, options=q.options)
        return HttpResponseRedirect('/show/{0}'.format(q.url_string))

    # Check if the show has been scraped before and if that scrape was in the last x days specified by RESCRAPE_AFTER
    if q and q.last_scraped > datetime.now() - timedelta(days=RESCRAPE_AFTER):
        url_slug = q.url_string
    else:
        # If scraping is switched on then scrape the show
        if settings.SCRAPING:
            s = Scraper(tvdb_id)
            url_slug = s.get_url_slug()
        else:
            url_slug = tvdb_id

    return HttpResponseRedirect('/show/{0}'.format(url_slug))
Example #7
def test_find_docs():

    declare_test_start( 'follow_link' ) 

    url_data = {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs( )

    print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    print '[ TEST ] {0}'.format(json.dumps(docs))

    passed = False
    if len(docs) > 0:
        passed = True

    declare_test_end( passed )
Example #8
def main(request):
	username = request.POST['username']
	password = request.POST['password']
	scraper = Scraper(username, password)
	data = scraper.scrap(True)

	return HttpResponse(obj2json(data), mimetype='application/json')
Example #9
	def __init__(self, base_url = awards_base_url, search_url = ""):
		Scraper.__init__(self, base_url, search_url)
		self.file = open('academy_awards.csv', 'wb')
		self.writer = csv.writer(self.file, delimiter='\t')
		self.writer.writerow(['Year', 'Category', 'Won', 'FilmName', 'ActorDirectorName'])
		self.soup = self.connect(base_url)
		self.next_record = '1'
Example #10
def traverse(node):
    """ Pre-order depth-first search of Mountain Project tree """

    children = []
    for href in node.children_href:
        # initialize Scraper for this page
        scrap = Scraper(href)
        if scrap.soup is None:
            pass
        else:
            # grab features from the soup
            dest = scrap.create_destination()
            # find children in the soup if any
            dest.children_href = scrap.get_children()
            # recursively deeper down the tree if this is an area
            if dest.children_href != None:
                print
                print '**'+dest.nickname+'**'
                traverse(dest)
            # inner traverse function has returned with destination object
            print dest.nickname + ' | ' + dest.href
            children.append(dest)

    node.children = children
    return node
Example #11
 def testExtractTag(self):
     pattern = "<a name='$name'></a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)
     
     # one attribute
     actual = BeautifulSoup("<a name='abc'></a>")
     self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name'])
     
     # one attribute
     actual = BeautifulSoup("<a name='abc' age='27'></a>")
     self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name'])
     
     # two attributes
     pattern = "<a name='$name' age='$age'></a>"
     exp = BeautifulSoup(pattern)
     actual = BeautifulSoup("<a name='abc' age='27'></a>")
     ret =  _scraper.extractTag(exp.contents[0], actual.contents[0])
     self.assertEqual(2, len(ret))
     self.assertEqual('abc', ret['name'])
     self.assertEqual('27', ret['age'])
     
     # get attribute from sub tag
     pattern = "<a><b name='$name'></b></a>"
     exp = BeautifulSoup(pattern)
     
     # one attribute
     actual = BeautifulSoup("<a><b name='abc'></b></a>")
     self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name'])
Example #12
 def testExtractText(self):
     pattern = "<a>$text</a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)
     
     # one text
     actual = BeautifulSoup("<a>hello world</a>")
     self.assertEqual('hello world', _scraper.extractText(exp.contents[0], actual.contents[0])['text'])
Example #13
def scrape_all(root_href, data_dir):
    """ Scrape Mountain Project and save Destination objects """
    
    scrap = Scraper(root_href)

    # iterate over children of the root (e.g. states in the US)
    for href in scrap.get_children():
        save_info_from(href, data_dir)
Example #14
 def run(self):
     try:
         print "y"
         s = Scraper('en')
         #s.getCategory(self.cat)
         s.getGameList(self.cat, endPage=0)
     except:
         print "n"
Example #15
 def testExtract(self):
     pattern = "<a name='$name'>$text</a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)
     
     # text in sub tag        
     actual = BeautifulSoup("<a name='abc'>hello world</a>")
     ret = _scraper.extract(actual.contents[0])
     self.assertEqual('hello world', ret['text'])
Example #16
def main():

    print "ENTER THE YOSPOS"
    print "Using " + sys.argv[1] + " @ threadid: " + sys.argv[3]
    new_scraper = Scraper(sys.argv[1], sys.argv[2], sys.argv[3], [])
    new_scraper.scrape_thread()

    new_message = raw_input("Reply: ")
    new_poster = Poster(sys.argv[1], sys.argv[2], sys.argv[3])
    new_poster.make_post(new_message)
Example #17
 def __init__(self):
     Scraper.__init__(self)
     api_key = self.config["youtube"]["api_key"]
     self.url = "https://www.googleapis.com/youtube/v3/search"
     self.params = {
       "order": "date",
       "maxResults": 10,
       "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
       "key": api_key,
       "type": "upload",
       "part": "snippet"
     }
Example #18
def doador_2004(cnpj_ou_cpf):
    u'''
    Returns a table with the donations made by this person (cnpj_ou_cpf). The
    table is a list of lists, each one containing the fields in
    "doador_2004.campos".

    >>> tabela = doador_2004('85.907.012/0001-57')
    >>> tabela is not None
    True
    >>> len(tabela)
    16
    >>> len(tabela[0]) == len(doador_2004.campos)
    True

    URL: http://www.tse.gov.br/internet/eleicoes/2004/prest_blank.htm
    '''

    pessoa = pessoa_or_valueerror(cnpj_ou_cpf)
    scraper = Scraper()

    url = 'http://www.tse.gov.br/sadEleicao2004Prestacao/spce/index.jsp'
    scraper.open(url)

    scraper.browser.select_form(name='formDoador')
    scraper.browser.form.find_control(name='nome').readonly = False
    scraper.browser.form.find_control(name='numero').readonly = False
    scraper.browser.form['numero'] = pessoa.plain()
    scraper.browser.form['nome'] = '%'

    try:
        scraper.submit()
    except:
        return None

    if not scraper.html.find(text=regexp('Valor Total de Fornecimento')):
        return None

    table = scraper.html.findAll('table')[-1]

    lines = []
    for tr in table.findAll('tr')[1:-1]:
        columns = []
        for td in tr.findAll('td'):
            try:
                contents = td.b.contents
            except:
                contents = td.contents
            content = ' '.join(contents).strip()
            text = html2unicode(content)
            columns.append(text)
        lines.append(columns)

    return lines
Example #19
    def scrapestruct(self, context):
        pc = context['pc']
        rowscrape = pc['dom_row_pattern']
        blockstr = context['blockstr']
        soupdoc = CustomizedSoup(blockstr)
        scraper = Scraper(rowscrape)
        results = scraper.match(soupdoc)
        if len(results) == 0:  # TBD: the scraper needs to be improved
            raise Exception("0 ITEMS SCRAPED WARNING")
        count = min(len(results), 10)
        items = results[0:count]
        eitems = map(lambda i: scraper.extract(i), items)
        context['items'] = eitems
Example #20
def performUpdate():
    scr = Scraper()
    try: 
        for competitorListChunk in scr.scrape():
            for competitor in competitorListChunk:
                try:
                    Contestant.update_completed_count(competitor.username.lower(),
                        competitor.completedCount)
                except Exception as e:
                    print("ERR: Username most likely not found in spreadsheet {}. {}".format(
                        competitor.username, str(e)))
    except Exception:
        return
Example #21
 def testMatchByType(self):
     # test simple tag
     pattern = "<a></a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)
     
     # same type
     actual = BeautifulSoup("<a></a>")
     self.assertTrue(_scraper.matchByType(exp.contents[0], actual.contents[0]))
     
     # different type
     actual = BeautifulSoup("text")
     self.assertFalse(_scraper.matchByType(exp.contents[0], actual.contents[0]))        
Example #22
    def get(self):
        global total_data, crawl_count, crawled

        if crawl_count >= DEPTH_LIMIT:
            return False

        crawled.add(self.url)
        data = self.fetch()

        if data and data != bytearray(b' '):
            if total_data > CONTENT_LIMIT:
                return False

            total_data += len(data)
            crawl_count += 1
            webserver.save(self.url, self.root, self.type, data)

            s = Scraper(data, self.console)
            if self.type not in ["JS", "CSS"]:
                #css
                css_links = s.get_css()
                for link in css_links:
                    if link:
                        c = Crawler(link, self, "CSS", self.console)
                        if c.url not in crawled:
                            c.get()
                    else:
                        pass
            if self.type not in ["JS", "CSS"]:
                #js
                js_links = s.get_script()
                self.console.print(js_links)

                for link in js_links:
                    if link:
                        c = Crawler(link, self, "JS", self.console)
                        if c.url not in crawled:
                            c.get()
                    else:
                        pass
            #  hrefs
            if self.type == "HTML":
                links = s.get_links()

                for link in links:
                    if link:
                        c = Crawler(link, self, "HTML", self.console)
                        if c.url not in crawled:
                            c.get()
                    else:
                        pass
Example #23
def analyze():
    """ Analyze text from a given URL """

    url = request.form.get("url", "").strip()
    use_reducer = not ("noreduce" in request.form)
    dump_forest = "dump" in request.form
    metadata = None
    # Single sentence (True) or contiguous text from URL (False)?
    single = False
    keep_trees = False

    t0 = time.time()

    if url.startswith("http:") or url.startswith("https:"):
        # Scrape the URL, tokenize the text content and return the token list
        metadata, generator = process_url(url)
        toklist = list(generator)
        # If this is an already scraped URL, keep the parse trees and update
        # the database with the new parse
        keep_trees = Scraper.is_known_url(url)
    else:
        # Tokenize the text entered as-is and return the token list
        # In this case, there's no metadata
        toklist = list(tokenize(url))
        single = True

    tok_time = time.time() - t0

    t0 = time.time()

    # result = profile(parse, toklist, single, use_reducer, dump_forest)
    result, trees = parse(toklist, single, use_reducer, dump_forest, keep_trees)

    # Add a name register to the result
    create_name_register(result)

    parse_time = time.time() - t0

    if keep_trees:
        # Save a new parse result
        if Settings.DEBUG:
            print("Storing a new parse tree for url {0}".format(url))
        Scraper.store_parse(url, result, trees)

    result["metadata"] = metadata
    result["tok_time"] = tok_time
    result["parse_time"] = parse_time

    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)
Example #24
def scrape():
    """
    Sets up the scraper to scrape HN and Reddit.
    """
    app.logger.info("Scraping Reddit")

    scrape_reddit = Scraper(app.logger)
    scrape_reddit.gather_reddit_data()

    app.logger.info("Finished gathering data, inserting into DB")
    scrape_reddit.insert_into_db()

    app.logger.info("Finished inserting into DB, sleeping for %d minutes..." % (SCRAPING_INTERVAL / 60.0))
    threading.Timer(SCRAPING_INTERVAL, scrape).start()
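Because the function re-arms itself with threading.Timer at the end, one initial call is enough to start the periodic cycle; a sketch of that bootstrap, assuming the Flask app context of the snippet:

# start the scrape/sleep loop once; each run schedules the next one
scrape()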
Example #25
def start_threads_for_letter(startLetter):
  outFn = "voters_"+str(startLetter)+".txt"
  outFile = open(outFn, 'w')
  
  print("Getting records starting with " + startLetter)

  scp = Scraper(conf.sessionHeaders, conf.searchHeaders)
  scp.setup_session([conf.baseUrl, conf.rollSearchUrl])
 
  url = conf.searchUrl
  params = conf.searchParams

  params['electorName'] = startLetter

  scp.get_and_write_records(url, 0, params, outFile)
Example #26
 def __init__(self):
     Scraper.__init__(self)
     self.url = "https://go.berniesanders.com/page/event/search_results"
     self.html = HTMLParser()
     self.params = {
         'orderby': 'zip_radius',
         'zip_radius[1]': '6000',
         'zip_radius[0]': '78218',
         'radius_unit': 'mi',
         'country': 'US',
         'format': 'json'
     }
     self.map = {
         "id": "original_id",
         "start_dt": "start_time"
     }
Example #27
 def writeindexhtm(self):
     text = Scraper.getIndexhtm()
     if not os.path.isfile(self.path() + 'index.htm'):
         with open(self.path() + 'index.htm', 'w') as f:
             f.write(Patcher.patchindexhtm(text))
     else:
         print 'index.htm already saved'
Example #28
    def getacclist(self, facade, accountList, token, step, allofit):
        logging.info(" --> STEP-" + str(step))
        Scraper.getacclist(self, facade, accountList, token)
        self.wipe_pages()

        self.prepParse(allofit, facade, token)

        page = self.HexToByte( allofit['body'])

        scrape_result = self.gettheAccounts(facade, accountList, token, step, allofit, page)

        if scrape_result != 'good' and scrape_result != "got list":
            logging.warning("Got result - " + scrape_result)
            self.flush_pages()

        return scrape_result
Example #29
 def write_Slist(self):
     text2 = Scraper.getSlist()
     if not os.path.isfile(self.path() + 'resources\slist.txt'):
         with open(self.path() + 'resources\slist.txt', 'w') as f:
             f.write(text2)
     else:
         print 'slist.txt already downloaded'
Example #30
 def writeddixml(self):
     text = Scraper.getddixml()
     if not os.path.isfile(self.path() + 'ddi.xml'):
         with open(self.path() + 'ddi.xml', 'w') as f:
             f.write(text)
     else:
         print 'ddi.xml already saved'
Example #31
class CachedTweets:
    scraper = Scraper(
        twitter_authentication=TwitterAuthentication.autodetect_twitter_auth())

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_small():
        # type: () -> Dict[str, List[TweetResult]]
        """Return a static list of 9 tweets that is generated once and re-used throughout the module's lifetime."""
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"}, count=3)

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_small_no_retweets():
        # type: () -> Dict[str, List[TweetResult]]
        """Return a static list of 9 non-retweet tweets that is generated once and re-used throughout the module's lifetime."""
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"}, count=3, include_retweets=False)

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_small_geo():
        # type: () -> Dict[str, List[TweetResult]]
        """
        Return a static list of 9 tweets geotagged 20 miles from Chicago's center that is generated once and re-used
        throughout the module's lifetime.
        """
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"},
            count=3,
            geocode=geobox_to_geocode(GEOBOX_CHICAGO, "20mi"))

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_medium():
        # type: () -> Dict[str, List[TweetResult]]
        """Return a static list of 60 tweets that is generated once and re-used throughout the module's lifetime."""
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"}, count=20)

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_medium_no_retweets():
        # type: () -> Dict[str, List[TweetResult]]
        """Return a static list of 60 non-retweet tweets that is generated once and re-used throughout the module's lifetime."""
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"}, count=20, include_retweets=False)

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_medium_geo():
        # type: () -> Dict[str, List[TweetResult]]
        """
        Return a static list of 60 tweets geotagged 20 miles from Chicago's center that is generated once and re-used
        throughout the module's lifetime.
        """
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"},
            count=20,
            geocode=geobox_to_geocode(GEOBOX_CHICAGO, "20mi"))

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_large():
        # type: () -> Dict[str, List[TweetResult]]
        """Return a static list of 300 tweets that is generated once and re-used throughout the module's lifetime."""
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"}, count=100)

    @staticmethod
    @lru_cache(maxsize=None)
    def tweets_large_geo():
        # type: () -> Dict[str, List[TweetResult]]
        """Return a static list of 300 tweets geotagged 20 miles from Chicago's center that is generated once and
        re-used throughout the module's lifetime."""
        return CachedTweets.scraper.scrape_terms(
            {"flood", "fire", "house fire"},
            count=100,
            geocode=geobox_to_geocode(GEOBOX_CHICAGO, "20mi"))
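Because every accessor is wrapped in lru_cache(maxsize=None), the underlying scrape_terms call happens once per method and later calls return the cached dictionary. A small usage sketch using only what the class above defines:

small = CachedTweets.tweets_small()        # performs the scrape on first use
small_again = CachedTweets.tweets_small()  # returned from the lru_cache, no new request
assert small is small_again

for term, tweets in small.items():         # Dict[str, List[TweetResult]]
    print(term, len(tweets))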
Example #32
def callback(ch, method, properties, body):
    url = json.loads(body)['url']
    scraper = Scraper()
    result = scraper.scrape(url)
    publish_result(result)
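The (ch, method, properties, body) signature matches a pika-style message callback; a sketch of the consumer wiring under that assumption (the queue name and connection parameters are illustrative, not from the original project):

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue='scrape_urls')             # hypothetical queue name
channel.basic_consume(queue='scrape_urls',
                      on_message_callback=callback,
                      auto_ack=True)
channel.start_consuming()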
Example #33
def salva(url_veiculo):
    xpath_categoria = "//*[contains(@class, 'categoria')]/a//text()"
    xpath_marca = "//*[contains(@class, 'marca')]/text()"
    xpath_modelo = "//*[contains(@class, 'modelo')]/text()"
    http = urllib3.PoolManager()
    xpath_especs = "//*[@id=\"dados-veiculo\"]/ul[1]/li/span/text()"
    xpath_preco = "//*[@id=\"dados-veiculo\"]/div/span/text()"
    xpath_opcionais = "//*[contains(@class, 'opcionais')]//li/text()"
    scraper = Scraper()
    print("Xpath Categoria...")
    resultados = scraper.pega_lista(url_veiculo, xpath_categoria)
    v = Veiculo
    o = Opcionais
    if (len(resultados) > 0):  # if it has no category, the else branch assigns a default
        v.categoria = resultados[0]
    else:
        v.categoria = "Sem Categoria"
    print("Xpath Marca...")
    resultados = scraper.pega_lista(url_veiculo, xpath_marca)
    try:
        m = Marca.objects.create(nome=resultados[0])
        m.save()
    except:
        m = Marca.objects.get(nome=resultados[0])
    print("Xpath modelo...")
    resultados = scraper.pega_lista(url_veiculo, xpath_modelo)
    v.modelo = resultados[0]
    print("Xpath Especificos...")
    resultados = scraper.pega_lista(url_veiculo, xpath_especs)
    if (len(resultados) > 3):
        v.ano_modelo = formata_ano_modelo(resultados[0])
        v.cor = resultados[1]
        v.km = resultados[2]
        v.combustivel = resultados[3]
    elif (len(resultados) == 3):
        v.ano_modelo = formata_ano_modelo(resultados[0])
        v.cor = resultados[1]
        v.km = -1
        v.combustivel = resultados[2]
    print("Xpath Preco...")
    resultados = scraper.pega_lista(url_veiculo, xpath_preco)
    r = resultados[0].replace("R$ ", "")
    r = r.replace(",00", "")
    print("preco: ", r)
    if "Consulte" in r:
        r = None
        v.preco = r
    else:
        v.preco = r
    marca = Marca.objects.get(nome=m.nome)
    print("Xpath opcionais...")
    resultados = scraper.pega_lista(url_veiculo, xpath_opcionais)
    for resultado in resultados:
        resultado = resultado.encode('utf-8')
        try:
            o = Opcionais.objects.create(nome=resultado)
            o.save()
        except:
            print("")
    v_hash = marca.nome + v.categoria + v.modelo + v.cor + v.ano_modelo + str(
        v.km) + v.combustivel + str(v.preco) + ''.join(resultados)
    v_hash = hashlib.md5(v_hash.encode('utf-8'))
    obj_hash = str(v_hash.hexdigest())
    try:
        v = Veiculo.objects.create(vei_pk=obj_hash,
                                   marca=marca,
                                   categoria=v.categoria,
                                   modelo=v.modelo,
                                   cor=v.cor,
                                   ano_modelo=v.ano_modelo,
                                   km=v.km,
                                   combustivel=v.combustivel,
                                   preco=v.preco)
        v.save()
    except:
        print("Veiculo ja existente no BD")
    for resultado in resultados:
        try:
            vop = VeiculoOpcionais.objects.create(
                veiculo=v, opcionais=Opcionais.objects.get(nome=resultado))
            vop.save()
        except:
            pass
    print("Terminado.")
Example #34
 def setUp(self) -> None:
     self.s = Scraper(database="testscraperdb.cnf", logger=self.logger)
     url = 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332&page=1'
     self.s._enter_update_time_and_count(self.s._get_total_food_count(url))
Example #35
from scraper import Scraper

stats_scraper = Scraper()
games = stats_scraper.get_games()
print games[0].home()
print games[0].away()
print stats_scraper.get_teams_rest(0)
Example #36
def test_credentials_valid():
    s = Scraper()
    assert s.verify_chinesepod_credentials()
Example #37
class Quiz:
  colors = {
    "normal": "\033[1;37;40m",
    "fire": "\033[1;31;40m",
    "water": "\033[1;34;40m",
    "grass": "\033[1;32;40m",
    "flying": "\033[1;37;40m",
    "fighting": "\033[1;31;40m",
    "poison": "\033[1;35;40m",
    "electric": "\033[1;33;40m",
    "ground": "\033[1;33;40m",
    "rock": "\033[1;32;40m",
    "psychic": "\033[1;35;40m",
    "bug": "\033[1;32;40m",
    "ghost": "\033[1;35;40m",
    "dark": "\033[1;37;40m",
    "steel": "\033[1;37;40m",
    "ice": "\033[1;36;40m",
    "dragon": "\033[1;34;40m",
    "fairy": "\033[1;33;40m",
    "correct": "\033[1;32;40m",
    "incorrect": "\033[1;31;40m",
    "reset": "\033[0m"
  }

  def __init__(self, show_types, all_answers):
    self.scraped_pokemon = []
    self.show_types = show_types
    self.all_answers = all_answers
    self.scraper = Scraper()
    self.national_dex = self.scraper.set_national_dex()
  
  def run(self):
    self.rules()
    self.setup()
    correct_answer = self.question()
    self.ending(correct_answer)

  def setup(self):
    # TODO: currently not using the most recent generation until pokemon.com/pokedex is updated 
    # replace the following two lines when it is
    # poke_id = random.randint(1,len(self.national_dex))
    poke_id = random.randint(1,809)
    self.answer_pokemon = self.find_from_scraped(poke_id)
    if not self.answer_pokemon:
      self.answer_pokemon = self.scraper.fetch_pokemon_data(self.national_dex[poke_id-1])
      self.scraped_pokemon.append(self.answer_pokemon)
  
  def find_from_scraped(self, id):
    for poke in self.scraped_pokemon:
      if poke.id == id:
        return poke
    return None
    
  def rules(self):
    print("")
    print("{}Welcome trainer, to the Pokemon Types Quiz!{}".format(self.colors["normal"], self.colors["reset"]))
    print("You will be presented with a pokemon's name{}, and must guess what type{} effective against it".format((" and types" if self.show_types else ""), ("s are" if self.all_answers else " is")))
  
  def question(self):
    print("")
    print("Pokemon's name: {}{}{}".format(self.colors["normal"], self.answer_pokemon.name.title(), self.colors["reset"]))
    if self.show_types:
      types = []
      for type in self.answer_pokemon.types:
        types.append(self.colors[type] + type + self.colors["reset"])
      print("Pokemon's types(s): {}".format(" & ".join(types))) 

    print("What super effective against {}{}{}".format(self.colors["normal"], self.answer_pokemon.name.title(), self.colors["reset"]))
    if self.all_answers:
      print("Enter all possible answers, seperated by a comma and a space (e.g. \"fire, water, bug\")")
      answer = self.validate_answer()
    else:
      answer = self.validate_answer("Enter a single pokemon type: ")

    return self.check_answer(answer)

  def validate_answer(self, msg = ""):
    answer = ""
    print("(for a list of types, enter \"help\")")
    while answer is "":
      answer = input(msg).lower()
      if answer == "help":
        self.list_types()
        answer = ""
        print("What super effective against {}{}{}".format(self.colors["normal"], self.answer_pokemon.name.title(), self.colors["reset"]))
    return answer

  def check_answer(self, answer):
    if self.all_answers:
      return self.answer_pokemon.matches_all_weaknesses(answer.split(", "))
    else:
      return self.answer_pokemon.is_weak_to(answer)
    
  def list_types(self):
    types = []
    for type in Pokemon.all_types:
    
      types.append(self.colors[type] + type + self.colors["reset"])
    print("All types: {}".format(", ".join(types))) 
  
  def ending(self, correct_answer):
    print()
    if correct_answer:
      print(self.colors["correct"] + "Congratulations, you are correct!" + self.colors["reset"])
    else:
      if self.all_answers:
        print(self.colors["incorrect"] + "Sorry, that was not all of the pokemon's weaknesses. They were:" + self.colors["reset"])
      else:
        print(self.colors["incorrect"] + "Sorry, that is not in the list of the pokemon's weaknesses. They were:" + self.colors["reset"])
      weaknesses = []
      for weakness in self.answer_pokemon.weaknesses:
        weaknesses.append(self.colors[weakness] + weakness + self.colors["reset"])
      print("Pokemon's weaknesses: {}".format(", ".join(weaknesses))) 
    
    replay = input("Would you like to play again? (y/n): ").lower()
    while not replay in ["yes", "y", "no", "n"]:
      print("I don't understand that input")
      replay = input("Would you like to play again? (y/n): ").lower()
    
    if replay in ["yes", "y"]:
      self.run()
    else:
      print("Thank you for playing, goodbye!")
Example #38
 def __init__(self, show_types, all_answers):
   self.scraped_pokemon = []
   self.show_types = show_types
   self.all_answers = all_answers
   self.scraper = Scraper()
   self.national_dex = self.scraper.set_national_dex()
Example #39
from scraper import Scraper


if __name__ == '__main__':
    scraper = Scraper()
    proxies = scraper.scrape(protocol='SSL')

    while proxies.qsize():
        proxy = proxies.get()
        print(proxy)
Example #40
from flask_restful import Resource, Api
from app import app, db
from scraper import Scraper

api = Api(app)

scraper = Scraper(db)
scraper.activate()  # Scraping on diff thread for faster launchtime

import models, resources


class Home(Resource):
    def get(self):
        jobs = models.Job.query.all()
        job_schema = models.JobSchema(many=True)
        output = job_schema.dump(jobs).data
        return output


# @app.route('/path/<path:subpath>')
# def show_subpath(subpath):
#     # show the subpath after /path/
# return 'Subpath %s' % escape(subpath)

api.add_resource(Home, '/')
api.add_resource(resources.UserRegistration, '/register')
api.add_resource(resources.UserLogin, '/login')
api.add_resource(resources.ValidateToken, '/validate')

if __name__ == '__main__':
    app.run()  # the original snippet is truncated here; app.run() is an assumed entry point
Example #41
def update_data_redis():
    s = Scraper()
    countries, continent = s.scraping()
    r.set('countries', json.dumps(countries))
    r.set('continents', json.dumps(continent))
    r.set('total', json.dumps(len(countries) + len(continent)))
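The snippet relies on module-level objects r (a Redis client) and json that are not shown; a minimal sketch of that assumed setup (host, port and db are illustrative):

import json
import redis

r = redis.Redis(host='localhost', port=6379, db=0)   # assumed client used by update_data_redis()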
Example #42
class Gui:
    def __init__(self, root, address):
        # initialize root and title
        self.root = root
        self.root.resizable(0, 0)
        self.root.title('Weather')

        # initialize base canvas
        self.canvas = tk.Canvas(self.root,
                                width=1000,
                                height=500,
                                bg='light blue')
        self.canvas.pack()

        # initialize address
        self.address = address

        # determine if address was loaded from file
        if self.address is None:
            # if none was loaded initialize entry UI
            self.init_entry_ui()
        else:
            # otherwise load weather UI
            self.init_weather_ui()

    # initialize address entry UI
    def init_entry_ui(self):
        # clear canvas
        self.clear_canvas()

        # reset address
        self.address = None

        # create prompt text
        self.prompt_text = self.canvas.create_text(500,
                                                   35,
                                                   text='Enter Address Below:')

        # create address entry text box
        self.address_entry_box = tk.Text(self.canvas, bg='white')
        self.address_entry_box.place(width=600, height=200, x=200, y=50)

        # create find weather button
        self.init_weather_button = tk.Button(self.canvas,
                                             text='Get Weather',
                                             bg='white',
                                             command=self.init_weather_ui)
        self.init_weather_button.place(width=300, height=100, x=350, y=300)

    # initialize weather UI
    def init_weather_ui(self):
        # get address from text entry box
        self.address = self.address_entry_box.get(
            '1.0', 'end-1c') if self.address is None else self.address

        # clear canvas
        self.clear_canvas()

        # set up scraper
        self.web_scraper = Scraper(self.address)

        # create text objects to display location and weather
        self.location_text = self.canvas.create_text(
            500, 100, text=self.web_scraper.get_location())
        if self.web_scraper.get_location() != 'Error: Invalid Address':
            self.forecast_text = self.canvas.create_text(
                500, 115, text=self.web_scraper.get_forecast())
            self.temp_c_text = self.canvas.create_text(
                500, 130, text=self.web_scraper.get_temp_f())
            self.temp_f_text = self.canvas.create_text(
                500, 145, text=self.web_scraper.get_temp_c())

        # initialize reset button
        self.reset_button = tk.Button(self.canvas,
                                      text='Reset Slot',
                                      bg='white',
                                      command=self.init_entry_ui)
        self.reset_button.place(width=300, height=100, x=175, y=300)

        #initialize refresh button
        self.refresh_button = tk.Button(self.canvas,
                                        text='Refresh Forecast',
                                        bg='white',
                                        command=self.refresh_weather)
        self.refresh_button.place(width=300, height=100, x=525, y=300)

    # clear canvas
    def clear_canvas(self):
        # destroy all widgets
        for widget in self.canvas.winfo_children():
            widget.destroy()

        # destroy text created by weather UI state
        try:
            self.canvas.delete(self.location_text)
            self.canvas.delete(self.forecast_text)
            self.canvas.delete(self.temp_c_text)
            self.canvas.delete(self.temp_f_text)
        except AttributeError:
            pass

        # destroy text created by entry UI state
        try:
            self.canvas.delete(self.prompt_text)
        except AttributeError:
            pass

    #refresh weather screen
    def refresh_weather(self):
        self.clear_canvas()
        self.init_weather_ui()

    #get address function
    def get_address(self):
        return self.address
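A hedged sketch of how the class is presumably launched with tkinter, passing address=None so the entry UI is shown first:

import tkinter as tk

root = tk.Tk()
gui = Gui(root, None)   # None triggers the address-entry screen
root.mainloop()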
Example #43
class TestScraper(TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.logger = VerboseScraperLogger()

    def setUp(self) -> None:
        self.s = Scraper(database="testscraperdb.cnf", logger=self.logger)
        url = 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332&page=1'
        self.s._enter_update_time_and_count(self.s._get_total_food_count(url))

    def tearDown(self) -> None:
        self.s.engine.dispose()

    def test_scrape_search_results(self):
        # url for search results containing only 4 foods
        url = "https://www.chewy.com/s?rh=c%3A288%2Cc%3A332%2Cbrand_facet%3AAdirondack"

        # don't use the number after the final /dp/ - it corresponds to the size of the product and doesn't reliably
        #   return the same size; doesn't matter for the scraper since we're not looking at price per pound, etc.
        expected_jobs = {
            ("https://www.chewy.com/adirondack-30-high-fat-puppy/dp",
             self.s.scrape_food_if_new),
            ("https://www.chewy.com/adirondack-26-adult-active-recipe-dry/dp",
             self.s.scrape_food_if_new),
            ("https://www.chewy.com/adirondack-large-breed-recipe-dry-dog/dp",
             self.s.scrape_food_if_new),
            ("https://www.chewy.com/adirondack-21-adult-everyday-recipe/dp",
             self.s.scrape_food_if_new)
        }

        self.s.scrape_search_results(url)
        generated_jobs = set()
        while not self.s.scrape_queue.empty():
            job = self.s.scrape_queue.get()
            job = (job[0].rsplit('/', 1)[0], job[1])
            generated_jobs.add(job)
        self.assertEqual(expected_jobs, generated_jobs)

    def test__scrape_food_details(self):
        url1 = "https://www.chewy.com/earthborn-holistic-great-plains-feast/dp/36412"
        food1, diets1 = self.s._scrape_food_details(url1)
        self.assertEqual(51256, food1.item_num)
        self.assertEqual(
            "Earthborn Holistic Great Plains Feast Grain-Free Natural Dry Dog Food",
            food1.name)
        self.assertEqual(
            "https://www.chewy.com/earthborn-holistic-great-plains-feast/dp/36412",
            food1.url)
        test_ingredients1 = (
            "Bison Meal, Peas, Pea Protein, Tapioca, Dried Egg, Canola Oil (preserved with Mixed "
            "Tocopherols), Beef Meal, Pacific Whiting Meal, Pea Starch, Chickpeas, Flaxseed, "
            "Alaska Pollock Meal, Natural Flavors, Pea Fiber, Blueberries, Cranberries, Apples, "
            "Carrots, Spinach, Salt, Potassium Chloride, Choline Chloride, DL-Methionine, "
            "L-Lysine, Taurine, L-Carnitine, Beta-Carotene, Vitamin A Supplement, Vitamin D3 "
            "Supplement, Vitamin E Supplement, Zinc Sulfate, Ferrous Sulfate, Niacin, Folic Acid, "
            "Biotin, Manganese Sulfate, Copper Sulfate, Calcium Pantothenate, Thiamine Mononitrate, "
            "Pyridoxine Hydrochloride, Riboflavin Supplement, L-Ascorbyl-2-Polyphosphate (source of "
            "Vitamin C), Zinc Proteinate, Manganese Proteinate, Copper Proteinate, Calcium Iodate, "
            "Sodium Selenite, Cobalt Carbonate, Vitamin B12 Supplement, Yucca Schidigera Extract, "
            "Rosemary Extract, Dried Enterococcus Faecium Fermentation Product, Dried "
            "Lactobacillus Casei Fermentation Product, Dried Lactobacillus Acidophilus "
            "Fermentation Product.")
        self.assertEqual(test_ingredients1, food1.ingredients)
        self.assertEqual("Earthborn Holistic", food1.brand)
        self.assertEqual(None, food1.xsm_breed)
        self.assertEqual(True, food1.sm_breed)
        self.assertEqual(True, food1.md_breed)
        self.assertEqual(True, food1.lg_breed)
        self.assertEqual(None, food1.xlg_breed)
        self.assertEqual("Dry Food", food1.food_form)
        self.assertEqual("Adult", food1.lifestage)
        self.assertEqual(False, food1.fda_guidelines)
        test_diets1 = ["Grain-Free", "Gluten Free"]
        self.assertEqual(test_diets1, diets1)

        url2 = "https://www.chewy.com/natural-balance-lid-limited/dp/104666"
        food2, diets2 = self.s._scrape_food_details(url2)
        self.assertEqual(76793, food2.item_num)
        self.assertEqual(
            "Natural Balance L.I.D. Limited Ingredient Diets Sweet Potato & Bison Formula Grain-Free Dry Dog Food",
            food2.name)
        self.assertEqual(
            "https://www.chewy.com/natural-balance-lid-limited/dp/104666",
            food2.url)
        test_ingredients2 = (
            "Sweet Potatoes, Bison, Potato Protein, Pea Protein, Canola Oil (Preserved with "
            "Mixed Tocopherols), Dicalcium Phosphate, Natural Flavor, Salmon Oil (Preserved "
            "with Mixed Tocopherols), Potato Fiber, Salt, Calcium Carbonate, Flaxseed, "
            "DL-Methionine, Minerals (Zinc Amino Acid Chelate, Zinc Sulfate, Ferrous "
            "Sulfate, Iron Amino Acid Chelate, Copper Sulfate, Copper Amino Acid Chelate, "
            "Sodium Selenite, Manganese Sulfate, Manganese Amino Acid Chelate, Calcium "
            "Iodate), Vitamins (Vitamin E Supplement, Niacin, d-Calcium Pantothenate, "
            "Vitamin A Supplement, Riboflavin Supplement, Thiamine Mononitrate, Biotin, "
            "Vitamin B12 Supplement, Pyridoxine Hydrochloride, Vitamin D3 Supplement, "
            "Folic Acid), Choline Chloride, Taurine, Citric Acid (preservative), Mixed "
            "Tocopherols (preservative), Rosemary Extract.")
        self.assertEqual(test_ingredients2, food2.ingredients)
        self.assertEqual("Natural Balance", food2.brand)
        self.assertEqual(None, food2.xsm_breed)
        self.assertEqual(True, food2.sm_breed)
        self.assertEqual(True, food2.md_breed)
        self.assertEqual(True, food2.lg_breed)
        self.assertEqual(None, food2.xlg_breed)
        self.assertEqual("Dry Food", food2.food_form)
        self.assertEqual("Adult", food2.lifestage)
        self.assertEqual(False, food2.fda_guidelines)
        test_diets2 = [
            "Sensitive Digestion", "Limited Ingredient Diet",
            "No Corn No Wheat No Soy", "Grain-Free"
        ]
        self.assertEqual(test_diets2, diets2)

    def test__enter_in_db(self):
        import time

        test_key = hash(time.localtime)
        test_url = str(test_key) + "/12345"
        test_name = str(test_key)
        test_diets = [
            'Sensitive Digestion', 'Limited Ingredient Diet',
            'No Corn No Wheat No Soy', 'Grain-Free'
        ]
        test_food = Food(item_num=test_key,
                         name=test_name,
                         url=test_url,
                         ingredients="None",
                         brand="None",
                         xsm_breed=False,
                         sm_breed=False,
                         md_breed=False,
                         lg_breed=False,
                         xlg_breed=False,
                         food_form="None",
                         lifestage="None",
                         fda_guidelines=False)

        self.assertFalse(self.s._check_db_for_food(url=test_url))
        self.s._enter_in_db(test_food, test_diets)
        self.assertTrue(self.s._check_db_for_food(url=test_url))

    def test__enqueue_url(self):
        def dummy_function():
            pass

        self.s._enqueue_url("www.test.com", dummy_function)
        url, func = self.s.scrape_queue.get()
        self.assertEqual("www.test.com", url)
        self.assertEqual(dummy_function, func)

    def test__check_db_for_food(self):
        self.assertTrue(self.s._check_db_for_food(url="www.test.com/1/54321"))
        self.assertFalse(
            self.s._check_db_for_food(
                url="this entry is not in the database/12345"))

    def test__check_ingredients(self):
        food1 = Food(ingredients="chicken, lentils, potatoes - this one's bad")
        food2 = Food(ingredients="just chicken in this food - its good!")
        food3 = Food(
            ingredients=
            "this food has good ingredients, vitamins and minerals, then sweet potatoes - ok!"
        )

        food1 = self.s._check_ingredients(food1)
        food2 = self.s._check_ingredients(food2)
        food3 = self.s._check_ingredients(food3)

        self.assertEqual(False, food1.fda_guidelines)
        self.assertEqual(True, food2.fda_guidelines)
        self.assertEqual(True, food3.fda_guidelines)

    def test__make_request(self):
        r1 = self.s._make_request("https://www.google.com/")
        r2 = self.s._make_request("https://www.google.com/notarealsite")
        self.assertEqual(200, r1.status_code)
        self.assertEqual(404, r2.status_code)

    def test__pages_of_results(self):
        results = self.s._pages_of_results(
            'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332&page=1')
        self.assertEqual(101, results)
        results = self.s._pages_of_results(
            'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332%2Cc%3A294')
        self.assertEqual(43, results)

    def test__new_total_count_greaterthan_last(self):
        new_total = 3600
        self.assertFalse(self.s._new_total_count_greaterthan_last(new_total))
        new_total = 3603
        self.assertFalse(self.s._new_total_count_greaterthan_last(new_total))
        new_total = 3604
        self.assertTrue(self.s._new_total_count_greaterthan_last(new_total))
Example #44
logger.setLevel(logging.INFO)

file_handler = logging.FileHandler('app.log')
stream_handler = logging.StreamHandler()
file_handler.setFormatter(formatter)
stream_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

target_class_names = 'post-6 page type-page status-publish hentry'
target_id_name = 'post-6'
html_element = 'article'
target_url = 'http://bit.lk/index.php/category/announcement/page/1'

scraper = Scraper()
fetcher = DocumentFetcher()
datetime_updater = DateTimeUpdater(fetcher, scraper)
database = db_connection()
factory = AnnouncementFactory()

try:
    stored_collection = AnnouncementMapper(database).get_recent_announcements(factory)
    subscribers = SubscriberMapper(JsonAdapter('./subscribers.json')).get_all_subscribers(SubscriberFactory())

    web_page = fetcher.fetch_document(target_url)
    web_collection = scraper \
        .set_html_document(web_page) \
        .extract_html(html_element, target_id_name, target_class_names) \
        .get_announcements(AnnouncementFactory())
Example #45
import numpy as np
import pandas as pd
from yahooquery import Ticker
import multiprocessing as mp

## start the nameserver pyro4-ns [options]

from scraper import Scraper

if __name__ == '__main__':
    #############################################
    # 1. TO-DO: REPLACE THIS STATIC tickerName LIST WITH ONE THAT READS A CSV FILE FOR THE tickerNameS
    TickerNames = pd.read_csv('./src/tickerNames/TickerNames.csv')

    TickerNames = TickerNames.values
    ############################################

    # Multiprocessing
    # processes = []

    print("Fetching Data from Stocks")
    scraper = Scraper('ETH-USD')
    scraper.update()
    # for tickerName in TickerNames:
    #     scraper = Scraper(tickerName[0])
    #     process = mp.Process(target=scraper.update, args=())
    #     process.start()
    #     processes.append(process)

    # for process in processes:
    #     process.join()
Example #46
import datetime

from scraper import Scraper
from colorama import init, Fore, Back, Style

best_rooms = ["Aula Be", "Aula Ve", "Aula Oe"]
fine_rooms = ["Aula Ae", "Aula Pe", "Aula Ie", "Aula Qe", "Aula Ke"]
shitty_rooms = ["Aula Ge", "Aula Ce", "Aula De", "Aula Je", "Aula Le", "Aula Se"]



init()


now = datetime.datetime.now()
scraper = Scraper()

data = scraper.get_room_list()

free_room = []

for room in data:
    if room.is_free(now) :
        # print(room.name)
        free_room.append(room)


print("AULE LIBERE ALLE ORE:   " + str(now.time().hour) + ":" + str(now.time().minute))


for room in free_room:
    print(room.name)  # loop body truncated in the original snippet; printing the name assumed (cf. the commented-out print above)
Example #47
def test_credentials_invalid():
    s = Scraper(credentials=('a', 'b'))
    assert not s.verify_chinesepod_credentials()
Example #48
def get_query_info(url):
    try:
        scrape = Scraper(url)
    except (URLError, BadStatusLine) as err:
        print "url error: ", url, err
        return None

    scrape.move_to('<pre id="queryBodyText" class="cm-s-default">')
    sql_raw = scrape.pull_from_to('<code>', '</code>')
    sql = clean_sql(sql_raw)
    scrape.move_to('> created <span')
    ts = scrape.pull_from_to('title="', '" class="relativetime"')
    scrape.move_to('<div class="user-gravatar32">')
    if scrape.comes_before('<a href="/users', '</div>'):
        user_id = scrape.pull_from_to('<a href="/users/', '"><img ')
        scrape.move_to('<a href="/users/')
        user_name = scrape.pull_from_to('>', '</a>')
    else:
        user_id = None
        user_name = None
    return (ts, url, sql, user_id, user_name)
Example #49
import os

from flask import Flask, render_template

from flask_app.forms import ScrapeSingleTermTestForm
from config import FlaskConfig
from scraper import Scraper
from twitter import TwitterAuthentication

current_folder = os.path.abspath(os.path.dirname(__file__))
static_folder = os.path.join(current_folder, 'static')
template_folder = os.path.join(current_folder, 'templates')

app = Flask(__name__, static_url_path='/static', template_folder=template_folder, static_folder=static_folder)

app.debug = FlaskConfig.DEBUG

scraper = Scraper(twitter_authentication=TwitterAuthentication.autodetect_twitter_auth())


@app.route('/')
def index():
    return render_template('index.html')


@app.route("/success")
def success():
    return "Whatever you just tried worked. Congrats :)"


@app.route('/scrape_term', methods=("GET", "POST"))
def scrape_term():
    form = ScrapeSingleTermTestForm()
Example #50
def main(argv=sys.argv[1:]):
    """
    Main method. Reads the URL list and initializes the scraper and observers.
    """

    if sys.version_info < (3, 8):
        sys.exit(
            f'Python minimum version required is 3.8. Current version is {sys.version_info}'
        )

    # parse arguments
    parser = argparse.ArgumentParser(
        description=
        'URLs health monitor. If no parameters are passed, it runs with defaults.')
    parser.add_argument('-urlc',
                        '--urlconfig',
                        dest='urls',
                        metavar='path/to/config.json',
                        type=filePath,
                        help='Path to urls to be watched.')
    parser.add_argument('-osurls',
                        dest='osurls',
                        metavar='path/to/osa_urls.txt',
                        type=filePath,
                        help='Path to urls to execute OS command')
    parser.add_argument('-apiurls',
                        dest='apiurls',
                        metavar='path/to/apia_urls.txt',
                        type=filePath,
                        help='Path to urls to execute API command')
    args = parser.parse_args(argv)

    # determine the root folder based on the current file location
    rootFolder = pathlib.Path(__file__).parent.parent

    # get default file names, if no arguments
    urlsFile = args.urls if args.urls else rootFolder.joinpath(
        Config.WATCHER_URLS)
    osurlsFile = args.osurls if args.osurls else rootFolder.joinpath(
        Config.OSACTION_URLS)
    apiurlsFile = args.apiurls if args.apiurls else rootFolder.joinpath(
        Config.APIACTION_URLS)

    with open(urlsFile) as ufile:
        urls = json.load(ufile)

    # transform json to objects of type WatcherConfig
    urlsObjects = [
        WatcherConfig(
            url=url['url'],
            checkPeriodInSec=url['checkPeriodInSec'],
            requestParams=url.get('requestParams', {}),
            responseCodesHealthy=set(url['responseCodesHealthy']),
            responseBodyHealthyRegex=url.get('responseBodyHealthyRegex'),
            responseHeadersHealthyRegex=url.get('responseHeadersHealthyRegex'))
        for url in urls
    ]

    with open(osurlsFile) as ufile:
        # read all lines as set
        osurls = set(map(str.strip, ufile))

    with open(apiurlsFile) as ufile:
        # read all lines as set
        apiurls = set(map(str.strip, ufile))

    # init scraper for the URLs list
    scraper = Scraper(urlsObjects)

    # attach observers to receive notification on fail
    scraper.attach(
        OsAction(scope=osurls,
                 name='Test-OSPing',
                 cmd=('ping', '127.0.0.1', '-n', '1'),
                 waitToComplete=True))
    scraper.attach(ApiCall(scope=apiurls, name='Test-Sendemail'))

    # run scraper
    asyncio.run(scraper.run())
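Each entry of the urls config file is turned into a WatcherConfig, so it needs the fields read above; a hypothetical example of one entry, expressed as the Python equivalent of the JSON (all values are illustrative):

example_urls_config = [
    {
        "url": "https://example.com/health",
        "checkPeriodInSec": 30,
        "requestParams": {},                    # optional, defaults to {}
        "responseCodesHealthy": [200, 204],
        "responseBodyHealthyRegex": "OK",       # optional
        "responseHeadersHealthyRegex": None     # optional
    }
]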
Example #51
from scraper import Scraper
from telegram_bot import TelegramBot
import schedule
import time
import datetime

MINUTES = 5
MONTH = ['2020-01']
NUM_CONSECUTIVE = 1
scraper = Scraper(appointment_type="Researcher") # possible options are ["Researcher", "BlueCard"]
bot = TelegramBot()

def get_timestamp():
    return datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %X")

def get_message(appointments, consecutive=1):
    print(appointments)
    free_appointments = []
    message = ""
    send_message = False
    for date in appointments:
        value = appointments[date]
        if len(value) > 0:
            for i in range(0, len(value)):
                timestamp = time.mktime(time.strptime(value[i], '%H:%M'))
                j = i + 1
                while j < len(value):
                    timestamp_next = time.mktime(time.strptime(value[j], '%H:%M'))
                    diff = timestamp_next - timestamp
                    if diff/60 <= 30 * (j - i):
                        j = j+1
Example #52
    def getxactlist(self, facade, accountList, token, step, allofit):
        logging.info(" --> STEP-" + str(step))
        Scraper.doscrape(self, facade, accountList, token)
        self.wipe_pages()

        scrape_result = "good"

        self.response = {}

        self.prepParse(allofit, facade, token)

        page = self.HexToByte(allofit['body'])

        # if it's an early step, log in and get the accounts
        if step < 7:
            scrape_result = self.gettheAccounts(facade, accountList, token,
                                                step, allofit, page)

        if step == 4 or step == 5 or step == 6:
            # if that went well and we now have a list
            if scrape_result == "got list":
                # test that we have some account links
                acCount = 0
                accounts = []
                for ac in self.accountLinks:
                    acName = self.accountLinks[acCount].string
                    acLink = self.accountLinks[acCount]['href']
                    # simple lookup here as login sets up myAccounts in NatWest
                    acpair = self.myAccounts[acCount]

                    # the unique name defining this account to link on (e.g. account number)
                    acID = None

                    # if we did detect an account number
                    if acpair != None:
                        logging.debug('got a pair')
                        acID = acpair['num']  # then use this

                    # now see if we have this id in our list of accounts to get the correct path to add the xactions to
                    theAccountPath = []
                    if acID in accountList:
                        theAccountPath = accountList[acID]

                        account = {}
                        account['accountid'] = acID
                        account['href'] = acLink
                        account['path'] = theAccountPath
                        account['synched'] = False
                        account['type'] = acpair['type']
                        account['balance'] = acpair['bal']

                        accounts.append(account)

                    else:
                        logging.warn("N-B - account not in users list " +
                                     acName + ' - ' + acID)

                    logging.info('-------------->' + str(acID))
                    logging.info('-------------->' + str(theAccountPath))
                    logging.info('-------------->' + str(acLink))
                    logging.info('-------------->' + str(acName))

                    acCount = acCount + 1

                accounts[0]['synched'] = True
                self.response['accountlist'] = accounts

                # TODO - choose the right url - the first on the list
                self.response['url'] = self.ByteToHex(accounts[0]['href'])
                self.response['data'] = ""
                self.response['method'] = 'GET'

                # step 12 for credit-card accounts, step 7 for normal accounts
                if accounts[0]['type'] == 'Credit':
                    self.response['step'] = 12
                else:
                    self.response['step'] = 7

                self.response['accountid'] = accounts[0]['accountid']
                self.response['accountpath'] = accounts[0]['path']

                scrape_result = "account list"

            elif scrape_result != "good":
                self.flush_pages()

            return scrape_result

        if step > 6:
            accounts = allofit['accountlist']

            # details of the account currently being synched
            # account number, id etc.
            acName = allofit['accountid']
            theAccountPath = allofit['accountpath'][:]

            bankurl = self.HexToByte(allofit['bankurl'])

            logging.debug("PROCESSING - -- -> " + bankurl)
            logging.debug(" acName - " + acName + " acPath - " +
                          str(theAccountPath))

            actype = None
            acbal = 0

        # first received the next link towards the account list for normal accounts
        if step == 7:

            logging.info('STEP 7')

            scrape_result = self.doStep7(page)

        # first received the next link towards the account list for credit-card accounts
        if step == 12:
            logging.info('STEP 12')

            scrape_result = self.doStep12(page)

        # actually got a transaction list page - so check for an all link
        if step == 20:
            # now find this account to get the extra bits of info

            result = self.doAllLink(page)

            # if we already have all transactions, as usual, then trigger step 30 with this page
            if result == "allxacts":
                step = 30
            else:
                logging.debug("need to get the page of all accounts")

        # copy all the response stuff
        if step > 5 and step < 30:
            self.response['accountid'] = acName
            self.response['accountpath'] = theAccountPath
            self.response['accountlist'] = accounts

            scrape_result = "account list"

        # actually got a transaction list page
        if step == 30:

            # now find this account to get the extra bits of info
            for ac in accounts:
                if ac['accountid'] == acName:
                    acType = ac['type']
                    balance = ac['balance']

            if acType == 'Credit':
                result = self._processCCAccount(page, theAccountPath, balance)
            else:
                result = self._processNormAccount(page, theAccountPath,
                                                  balance)

        # good or bad, move on to the next account
        # if the account detail pages come back bad, move on as well
        # remember: all the pre-account-list steps return scrape_result = "account list" on success
        if step == 30 or (step > 5 and scrape_result != 'account list'):
            url = ""
            acind = 0
            acit = 0
            # now work out the next url
            for acc in accounts:
                if not acc['synched']:
                    acind = acit
                    url = acc['href']
                    break
                acit = acit + 1

            self.response = {}

            self.response['data'] = ""

            self.response['accountid'] = ""
            self.response['accountpath'] = ""

            if url == "":
                self.response['url'] = ""
                self.response['method'] = 'END'
            else:
                self.response['url'] = self.ByteToHex(url)
                accounts[acind]['synched'] = True
                self.response['method'] = 'GET'
                self.response['accountid'] = accounts[acind]['accountid']
                self.response['accountpath'] = accounts[acind]['path']

            # step 12 for credit-card accounts, step 7 for normal accounts
            if accounts[acind]['type'] == 'Credit':
                self.response['step'] = 12
            else:
                self.response['step'] = 7

            self.response['accountlist'] = accounts

            scrape_result = "got account"

        return scrape_result
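
# A condensed, standalone sketch (not the original class method) of the
# "advance to the next unsynched account" logic at the end of getxactlist
# above: the first account whose 'synched' flag is False becomes the next GET
# target, credit cards jump to step 12, other accounts to step 7, and an empty
# url with method 'END' signals that the scrape is finished. The byte_to_hex
# parameter stands in for the class's ByteToHex helper.
def next_account_response(accounts, byte_to_hex=lambda b: b):
    response = {'data': '', 'accountid': '', 'accountpath': ''}
    pending = [(i, acc) for i, acc in enumerate(accounts) if not acc['synched']]
    if not pending:
        response['url'] = ''
        response['method'] = 'END'
    else:
        index, account = pending[0]
        account['synched'] = True
        response['url'] = byte_to_hex(account['href'])
        response['method'] = 'GET'
        response['accountid'] = account['accountid']
        response['accountpath'] = account['path']
        response['step'] = 12 if account['type'] == 'Credit' else 7
    response['accountlist'] = accounts
    return response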
Example #53
0
    def test_Al2O3_scraper(self):
        """Tests whether all 4 records for "Al2O3" are scraped correctly."""

        # Initialize a `scraper.Scraper` instance.
        scraper = Scraper()
        scraper.get_landing_page()

        # Is the landing page correct?
        self.assertIn('NIST-JANAF Thermochemical Tables',
                      scraper.browser.title)

        # Enter "Al2O3" in the form, submit it.
        scraper.send_query('Al2O3')
        scraper.select_state()
        scraper.submit_query()

        # Get all records resulting from the above query
        query_records = scraper.all_query_records

        # Verify number of records scraped.
        self.assertEqual(len(query_records), 4)

        # Check if scraped data is OK.
        self.assertIn('aluminum_oxide__kappa', query_records)
        self.assertEqual(query_records['aluminum_oxide__alpha']['CAS'],
                         '1344-28-1')
        self.assertEqual(query_records['aluminum_oxide__delta']['formula'],
                         'Al2O3')
        self.assertEqual(query_records['aluminum_oxide__gamma']['link'],
                         'http://kinetics.nist.gov/janaf/html/Al-098.txt')

        # Terminate the session cleanly.
        scraper.terminate_session()
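
# The same NIST-JANAF query flow outside the test harness (a sketch; it
# assumes the Scraper shown above, a working browser session, and that every
# record carries the 'formula' and 'CAS' fields asserted in the test).
scraper = Scraper()
scraper.get_landing_page()
scraper.send_query('Al2O3')
scraper.select_state()
scraper.submit_query()
for name, record in scraper.all_query_records.items():
    print(name, record['formula'], record['CAS'])
scraper.terminate_session()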
Example #54
0
def lambda_handler(event, context):
    scrape = Scraper()
    scrape.scrape()  # get info
    scrape.verify()  # sort info
    scrape.store()  # store info
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
Example #55
0
 def __init__(self, url, quality):
     self.url = url
     self.quality = quality
     self.scraper = Scraper(url, quality)
Example #56
0
 def download_single(self, obj):
     images = Scraper.getChapter(obj)
     return self.download_manga(images[0], images[1]) 
Example #57
0
class LSASumarizer():
    def __init__(self, language):
        dir_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk_data/'
        nltk.data.path = [dir_path]

        self.stopwords = sw.words(language)
        if self.stopwords is None:
            self.stopwords = []
        self.scraper = Scraper(language)
        self.language = language
        if language == INDONESIAN:
            factory = StemmerFactory()
            self.stemmer = factory.create_stemmer()
        else:
            self.stemmer = PorterStemmer()

    def _build_feature_matrix(self, documents, feature_type='frequency'):
        def _tokenize(sentence):
            return [
                self.stemmer.stem(w.lower()) for w in word_tokenize(sentence)
            ]

        feature_type = feature_type.lower().strip()

        if feature_type == 'binary':
            vectorizer = CountVectorizer(binary=True,
                                         min_df=1,
                                         stop_words=self.stopwords,
                                         tokenizer=_tokenize,
                                         ngram_range=(1, 2))
        elif feature_type == 'frequency':
            vectorizer = CountVectorizer(binary=False,
                                         min_df=1,
                                         analyzer='word',
                                         stop_words=self.stopwords,
                                         tokenizer=_tokenize,
                                         ngram_range=(1, 2))
        elif feature_type == 'tfidf':
            vectorizer = TfidfVectorizer(min_df=1,
                                         stop_words=self.stopwords,
                                         tokenizer=_tokenize,
                                         ngram_range=(1, 2))
        else:
            raise ValueError(
                "Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'"
            )

        feature_matrix = vectorizer.fit_transform(documents).astype(float)

        return vectorizer, feature_matrix

    def _low_rank_svd(self, matrix, singular_count=2):
        u, s, vt = svds(matrix, k=singular_count)
        return u, s, vt

    def _summarize(self,
                   document,
                   num_sentences=2,
                   num_topics=1,
                   feature_type='frequency',
                   sv_threshold=0.5):
        sentences = sent_tokenize(document)
        vec, dt_matrix = self._build_feature_matrix(sentences,
                                                    feature_type=feature_type)

        td_matrix = dt_matrix.transpose()
        td_matrix = td_matrix.multiply(td_matrix > 0)

        u, s, vt = self._low_rank_svd(td_matrix, singular_count=num_topics)
        min_sigma_value = max(s) * sv_threshold
        s[s < min_sigma_value] = 0

        salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
        top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
        top_sentence_indices.sort()

        # join the selected sentences with spaces so they don't run together
        result = " ".join(sentences[index] for index in top_sentence_indices)

        return result

    def summarize(self, query=None, size=2, text=None):
        suggested_query = None
        lang = None
        status = 0
        if query:
            for ch in ['&', ':', '-', '+', '.', ',']:
                query = query.replace(ch, ' ')
            query = re.sub('[^ 0-9a-zA-Z]+', '', query)
            words = word_tokenize(query.lower())
            filtered_words = [
                word for word in words
                if word not in self.stopwords and word.isalnum()
            ]
            new_query = " ".join(filtered_words)

            suggested_query, status, lang = self.scraper.get_query(new_query)
            if status == -1:
                suggested_query, status, lang = self.scraper.get_query(
                    new_query, isInverse=True)
            if status == -1:
                suggested_query, status, lang = self.scraper.get_query(
                    new_query)

        text = text if text else self.scraper.get_intro_lang(
            suggested_query, lang)

        # remove formula notation and multiple spaces
        text = re.sub(r'{.+}', '', text)
        text = re.sub(r'\s+', ' ', text)

        if not text:
            if self.language == INDONESIAN:
                return "mohon maaf {q} tidak ditemukan".format(q=query)
            else:
                return "{q} not found".format(q=query)

        summary = self._summarize(text, size)

        if status == 0:
            return summary
        elif lang == self.language:
            if lang == INDONESIAN:
                return "mungkin maksud anda adalah {sq}\n{s}".format(
                    sq=suggested_query, s=summary)
            else:
                return "maybe this is what you want {sq}\n{s}".format(
                    sq=suggested_query, s=summary)
        else:
            return summary
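
# A minimal usage sketch (an addition, not part of the original example): it
# assumes NLTK data is available under ./nltk_data and that the project's
# Scraper can be constructed for 'english'. Passing `text` directly keeps the
# call offline, so only the tokenisation, term matrix and SVD salience ranking
# defined above are exercised.
summarizer = LSASumarizer('english')
article = ("Latent semantic analysis builds a term-document matrix. "
           "A truncated SVD projects it onto a small number of topics. "
           "Sentences with the highest salience scores form the summary.")
print(summarizer.summarize(text=article, size=2))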
Example #58
0
class TextRankSummarizer(object):
    def __init__(self, language):
        dir_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk_data/'
        nltk.data.path = [dir_path]

        self.stopwords = sw.words(language)
        self.scraper = Scraper(language)
        self.language = language
        if language == INDONESIAN:
            factory = StemmerFactory()
            self.stemmer = factory.create_stemmer()
        else:
            self.stemmer = PorterStemmer()

    def sentence_similarity(self, sentence1, sentence2):
        if self.stopwords is None:
            self.stopwords = []

        sentence1 = [
            self.stemmer.stem(w.lower()) for w in word_tokenize(sentence1)
        ]
        sentence2 = [
            self.stemmer.stem(w.lower()) for w in word_tokenize(sentence2)
        ]

        all_words = list(set(sentence1 + sentence2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        for w in sentence1:
            if w in self.stopwords:
                continue
            vector1[all_words.index(w)] += 1

        for w in sentence2:
            if w in self.stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    def build_similarity_matrix(self, sentences):
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue

                similarity_matrix[idx1][idx2] = self.sentence_similarity(
                    sentences[idx1], sentences[idx2])

        for idx in range(len(similarity_matrix)):
            if similarity_matrix[idx].sum() != 0:
                similarity_matrix[idx] /= similarity_matrix[idx].sum()

        return similarity_matrix

    def page_rank(self, similarity_matrix, eps=0.0001, d=0.85):
        probs = np.ones(len(similarity_matrix)) / len(similarity_matrix)
        while True:
            new_probs = np.ones(len(similarity_matrix)) * (1 - d) / len(
                similarity_matrix) + d * similarity_matrix.T.dot(probs)
            delta = abs((new_probs - probs).sum())
            if delta <= eps:
                return new_probs
            probs = new_probs

    def summarize(self, query=None, size=1, text=None):
        suggested_query = None
        lang = None
        status = 0
        if query:
            for ch in ['&', ':', '-', '+', '.', ',']:
                query = query.replace(ch, ' ')
            words = word_tokenize(query.lower())
            filtered_words = [
                word for word in words
                if word not in self.stopwords and word.isalnum()
            ]
            new_query = " ".join(filtered_words)

            suggested_query, status, lang = self.scraper.get_query(new_query)
            if status == -1:
                suggested_query, status, lang = self.scraper.get_query(
                    new_query, isInverse=True)
            if status == -1:
                suggested_query, status, lang = self.scraper.get_query(
                    new_query)

        text = text if text else self.scraper.get_intro_lang(
            suggested_query, lang)

        # remove formula notation and multiple spaces
        text = re.sub(r'{.+}', '', text)
        text = re.sub(r'\s+', ' ', text)

        if not text:
            if self.language == INDONESIAN:
                return "mohon maaf {q} tidak ditemukan".format(q=query)
            else:
                return "{q} not found".format(q=query)

        sentences = sent_tokenize(text)
        similarity_matrix = self.build_similarity_matrix(sentences)
        sentence_ranks = self.page_rank(similarity_matrix)
        ranked_sentence_indexes = [
            item[0] for item in sorted(enumerate(sentence_ranks),
                                       key=lambda item: -item[1])
        ]
        selected_sentences = sorted(ranked_sentence_indexes[:size])

        summary = itemgetter(*selected_sentences)(sentences)

        if isinstance(summary, tuple):
            if status == 0:
                return ' '.join(summary)
            elif lang == self.language:
                res = ' '.join(summary)
                if lang == INDONESIAN:
                    return "mungkin maksud anda adalah {sq}\n{s}".format(
                        sq=suggested_query, s=res)
                else:
                    return "maybe this is what you want {sq}\n{s}".format(
                        sq=suggested_query, s=res)
            else:
                return ' '.join(summary)

        if status == 0:
            return summary
        elif lang == self.language:
            if lang == INDONESIAN:
                return "mungkin maksud anda adalah {sq}\n{s}".format(
                    sq=suggested_query, s=summary)
            else:
                return "maybe this is what you want {sq}\n{s}".format(
                    sq=suggested_query, s=summary)
        else:
            return summary
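
# A small illustration (an addition; not part of the original class) of the
# power-iteration page_rank above, run on a hand-built, row-normalised
# similarity matrix for three sentences. page_rank uses no instance state, so
# __new__ is used to skip the NLTK/Scraper setup in __init__.
import numpy as np

toy_similarity = np.array([[0.0, 0.7, 0.3],
                           [0.6, 0.0, 0.4],
                           [0.5, 0.5, 0.0]])
ranker = TextRankSummarizer.__new__(TextRankSummarizer)
scores = ranker.page_rank(toy_similarity)
print(scores)        # highest score for sentence index 1 with this matrix
print(scores.sum())  # the scores stay normalised (sum is ~1.0)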
Example #59
0
from dbconnector import DBConnector, DBExplorer
from scraping_functions import extract_competition_information
from scraper import Scraper

import argparse

# ==========================================================================
#                     Functions
# ==========================================================================

# ==========================================================================
#                   Params and definitions
# ==========================================================================
db_object = DBExplorer(data_base='laliga_scraping', table='results')
game_ref_extractor = Scraper(
    url=
    'https://resultados.as.com/resultados/futbol/primera/{season}/jornada/regular_a_{rnd}'
)
results_extractor = Scraper(
    url="https://resultados.as.com{game_ref}/estadisticas/")
configuration = LaLigaConfig()  # LaLigaConfig's import is not shown in this snippet

# ==========================================================================
#                           Code
# ==========================================================================

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--season",
                        help="Season to extract the data",
                        default='2018_2019')
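
# A sketch (hypothetical values, plain str.format rather than any Scraper
# method) of how the round-fixture URL template defined above expands for one
# season and matchday.
round_template = ('https://resultados.as.com/resultados/futbol/primera/'
                  '{season}/jornada/regular_a_{rnd}')
print(round_template.format(season='2018_2019', rnd=1))
# https://resultados.as.com/resultados/futbol/primera/2018_2019/jornada/regular_a_1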
Example #60
0
 def setup_class(cls):
     url = 'http://books.toscrape.com/index.html'
     cls.site1 = Scraper(None)
     cls.site2 = Scraper(url)