def parse(self, response):
    # Log the page currently being crawled.
    print('\n######## Page ' + str(self._num_pagina) + ' ########')
    for a in response.xpath('/html/body/div[1]/a'):
        try:
            titu = a.css('::attr(title)').get()
            # Skip anchors whose title is a site/button label rather than a post.
            if titu is not None and titu not in ('Descargar', 'Ak47Full', 'iPauta', 'ElGenero', 'FlowHot'):
                ref = a.css('::attr(href)').get()
                hoy = date.today().strftime("%d %B, %Y")
                cantante, album = separa_titulo(titu, '–')
                yield scrapy.Request(ref, callback=self.parse_attr,
                                     meta={'referer': ref,
                                           'fecha': hoy,
                                           'titulo': titu,
                                           'cantante': cantante,
                                           'album': album})
        except Exception:
            pass
    self._num_pagina += 1
    try:
        next_page = response.css('a.nextpostslink ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
    except Exception:
        print('There was a problem opening the next page')

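# The parse() methods in this section depend on names that are not defined here:
# `scrapy`, `date` (from `datetime`) and the helper `separa_titulo`. The sketch
# below is an assumption inferred only from the call sites, not the project's
# actual implementation: it splits a post title such as 'Artist – Album' on the
# given separator and returns the (artist, album) pair, returning the separator
# itself as the first element when it is absent (mirroring the `can == '-'`
# fallback used further below).
import scrapy
from datetime import date

def separa_titulo(titulo, sep):
    # Hypothetical helper: split "Artist <sep> Album" into its two halves.
    if titulo is None or sep not in titulo:
        return sep, titulo
    cantante, album = titulo.split(sep, 1)
    return cantante.strip(), album.strip()
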
def parse(self, response):
    # Log the page currently being crawled.
    print('\n######## Page ' + str(self._num_pagina) + ' ########')
    for art in response.css('div.article-container > article'):
        href = art.css('div.featured-image > a ::attr(href)').get()
        titulo = art.css('div.featured-image > a ::attr(title)').get()
        cantante, album = separa_titulo(titulo, '–')
        fecha = art.css('div.below-entry-meta > span > a > time ::text').get()
        yield scrapy.Request(href, callback=self.parse_attr,
                             meta={'fecha': fecha,
                                   'titulo': titulo,
                                   'cantante': cantante,
                                   'album': album})
    self._num_pagina += 1
    try:
        next_page = response.css('li.previous > a ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
    except Exception:
        print('There was a problem opening the next page')

def parse(self, response):
    # Log the page currently being crawled.
    print('\n######## Page ' + str(self._num_pagina) + ' ########')
    for art in response.css('div#main-content > article'):
        titulo = art.css('a ::attr(title)').get()
        referer = art.css('a ::attr(href)').get()
        # The date sits on <span class="mh-meta-date updated">, so both class
        # names must be chained in the selector.
        fecha = art.css('span.mh-meta-date.updated ::text').get()
        cantante, album = separa_titulo(titulo, '–')
        # Follow the post itself to extract the download link.
        yield scrapy.Request(referer, callback=self.parse_attr,
                             meta={'fecha': fecha,
                                   'referer': referer,
                                   'titulo': titulo,
                                   'cantante': cantante,
                                   'album': album})
    self._num_pagina += 1
    try:
        next_page = response.css('div.nav-previous > a ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
    except Exception:
        print('There was a problem opening the next page')

def parse(self, response):
    # Log the page currently being crawled.
    print('\n######## Page ' + str(self._num_pagina) + ' ########')
    for h3 in response.css('h3.post-title') or response.css('h3.post-title.entry-title'):
        # Collect the post data.
        referer = h3.css('a ::attr(href)').get()
        titulo = h3.css('a ::text').get()
        # Split the title into artist and album.
        cantante, album = separa_titulo(titulo, '-')
        # Follow the post itself to extract the download link.
        yield scrapy.Request(referer, callback=self.parse_attr,
                             meta={'referer': referer,
                                   'titulo': titulo,
                                   'cantante': cantante,
                                   'album': album})
    self._num_pagina += 1
    try:
        next_page = response.css('a.blog-pager-older-link.flat-button.ripple ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
    except Exception:
        print('There was a problem opening the next page')

def parse(self, response):
    # Log the page currently being crawled.
    print('\n######## Page ' + str(self._num_pagina) + ' ########')
    # Collect the post data from the page.
    titulo = response.css('h1.entry-title ::text').get()
    referer = response.css('figure > a ::attr(href)').get()
    fecha = response.css('time.entry-date.published.updated ::text').get()
    cantante, album = separa_titulo(titulo, '–')
    album = separa(album, ' ', 1)
    # Follow the post itself to extract the download link.
    yield scrapy.Request(referer, callback=self.parse_attr,
                         meta={'fecha': fecha,
                               'referer': referer,
                               'titulo': titulo,
                               'cantante': cantante,
                               'album': album})

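# `separa(texto, sep, indice)` is another helper that is not defined in this
# section; it is used above as separa(album, ' ', 1) and later as
# separa(fecha, '-', 0). Judging from those calls it appears to split a string
# on a separator and return one of the pieces. A hypothetical sketch only, not
# the project's real code:
def separa(texto, sep, indice):
    # Split `texto` on `sep` and return the piece at position `indice`,
    # falling back to the original string when there are not enough pieces.
    if texto is None:
        return texto
    partes = texto.split(sep)
    return partes[indice] if len(partes) > indice else texto
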
def parse(self, response):
    titulo = []
    cantante = []
    album = []
    referer = []
    infringing = []
    fecha = date.today().strftime("%B %d, %Y")
    # Log the page currently being crawled.
    print('\n######## Page ' + str(self._num_pagina) + ' ########')
    for span in response.css('div#post-body-5669324029259817671 > div > div'):
        text = span.css('span ::text').get()
        # Text that is neither the 'DESCARGAR' button nor a numbered track
        # line is treated as a post title.
        if text is not None and text != 'DESCARGAR' and text[0] != '0':
            titulo.append(text)
            can, alb = separa_titulo(titulo[-1], '-')
            if can == '-':
                # Fall back to the en dash when the title uses '–' instead of '-'.
                can, alb = separa_titulo(titulo[-1], '–')
            cantante.append(can)
            album.append(alb)
        # The 'DESCARGAR' anchor carries the download referer.
        text_des = span.css('span > a ::text').get()
        text_des1 = span.css('a ::text').get()
        if text_des == 'DESCARGAR' or text_des1 == 'DESCARGAR':
            if span.css('span > a ::attr(href)').get() is not None:
                referer.append(span.css('span > a ::attr(href)').get())
            elif span.css('a ::attr(href)').get() is not None:
                referer.append(span.css('a ::attr(href)').get())
            if referer:
                infringing.append(self.get_inf(referer[-1]))
    self.get_datos(titulo, fecha, cantante, album, referer, infringing)

def parse_attr(self, response):
    for li in response.css('li.mp3Play'):
        Titulo = li.css('b ::text').get()
        Cantante, Album = separa_titulo(Titulo, '-')
        Fecha = date.today().strftime("%d %B, %Y")
        id = li.css('a.b_down ::attr(data-url)').get()
        if id is not None:
            # Build the player API URL from the track's data-url id.
            url = ('https://www.internet-dvr.com/api-private.js?vidID={}'
                   '&token=37fb468a1118f225202b8f6be914f4406c93954f1b460d36afb40d540581508b74fefaa886db99f2fb8d3ac3bcd481a77a658c4432f801bd8df0c4da26916588').format(id)
            yield Request(url,
                          callback=self.parse_attr2,
                          meta={'referer': response.meta['referer'],
                                'Titulo': Titulo,
                                'Cantante': Cantante,
                                'Album': Album,
                                'Fecha': Fecha})

def extrae_categoria(driver):
    global pag
    next_page = 1
    while next_page:
        print('#################################### Page {} ####################################'.format(pag))
        for a in driver.find_elements_by_css_selector('div#blog-entries > article h2 > a'):
            titulo = a.get_attribute('title').replace('(FLAC)', '').replace('(Mp3)', '')
            referer = a.get_attribute('href')
            cantante, album = separa_titulo(titulo, '–')
            time.sleep(2)
            # Open the post in a new tab and work inside it.
            driver.execute_script("window.open(arguments[0]);", referer)
            driver.switch_to.window(driver.window_handles[2])
            ref_inf = extrae_infringing(driver)
            if ref_inf:
                for inf in ref_inf:
                    mega_link = get_mega(inf)
                    # Only store links that were resolved and point to mega.
                    if mega_link and mega_link.find('mega') != -1:
                        if not c.existe_inf(mega_link):
                            imprime_datos(titulo, fecha, cantante, album, referer, mega_link)
                            c.inserta_item(titulo, cantante, album, referer, mega_link, fecha, id_domin)
                            v.muestra_item_guardado(titulo)
            # Close the post tab and return to the category tab.
            close_taps(driver, 1)
        try:
            next_page = driver.find_element_by_css_selector('a.next.page-numbers')
            next_page.click()
            pag += 1
        except Exception:
            print('An error occurred while changing pages')
            break
    close_taps(driver, 0)

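# extrae_categoria() also depends on helpers defined elsewhere in the project
# (extrae_infringing, get_mega, imprime_datos, the DB wrapper `c`, the view `v`,
# and close_taps). The sketch below is only an assumption based on how
# close_taps is called: close every tab above index `keep` and switch focus
# back to the tab at that index.
def close_taps(driver, keep):
    # Close all browser tabs after position `keep`, then focus the kept tab.
    while len(driver.window_handles) > keep + 1:
        driver.switch_to.window(driver.window_handles[-1])
        driver.close()
    driver.switch_to.window(driver.window_handles[keep])
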
def parse_attr(self, response):
    titulo = response.css('h1 > a ::text').get()
    cantante, album = separa_titulo(titulo, '-')
    fecha = response.css('time > a ::text').get()
    fecha = strip_spaces(fecha)
    fecha = separa(fecha, '-', 0)
    infringing = response.css('div.post__content > p > a ::attr(href)').get()
    try:
        # If the first anchor is just an image link, take the second anchor instead.
        if infringing.find('images') > 0:
            infringing = response.xpath('//*[@id="post"]/div[2]/p/a[2]/@href').get()
        if infringing is not None:
            if veri(infringing):
                imprime_datos(titulo, fecha, cantante, album,
                              response.meta['referer'], infringing)
                if not self.c.existe_inf(infringing, self.id_domin):
                    self.c.inserta_item(titulo, cantante, album,
                                        response.meta['referer'], infringing,
                                        fecha, self.id_domin)
    except Exception:
        pass

##### OPEN THE SELENIUM BROWSER #####
driver = webdriver.Chrome('C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
driver.get(start_urls[0])
##### WAIT FOR THE PAGE TO LOAD #####
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h2 > a")))
##### REMEMBER THE MAIN WINDOW #####
main_window = driver.current_window_handle
##### GRAB THE HREF OF THE NEXT PAGE #####
next_page = driver.find_element_by_css_selector('a.sa.sa-nextpage.tip').get_attribute('href')
##### WALK THROUGH EVERY PAGE #####
while next_page is not None:
    ##### COLLECT THE DATA #####
    for a in driver.find_elements_by_css_selector("h2 > a"):
        referer = a.get_attribute('href')
        titulo = a.find_element_by_css_selector('span').text
        cantante, album = separa_titulo(titulo, '-')
        fecha = date.today().strftime("%B %d, %Y")
        ##### OPEN THE POST IN A NEW TAB #####
        driver.execute_script("window.open(arguments[0]);", referer)
        driver.switch_to.window(driver.window_handles[1])
        infringing = driver.find_element_by_xpath('//*[@id="shell"]/section/div[1]/div[2]/article/section/center/a').get_attribute('href')
        imprime_datos(titulo, fecha, cantante, album, referer, infringing)
        if not c.existe_inf(infringing, id_domin) and veri(infringing):
            c.inserta_item(titulo, cantante, album, referer, infringing, fecha, id_domin)
        ##### CLOSE THE TAB #####
        driver.close()
        ##### SWITCH BACK TO THE MAIN WINDOW #####
        driver.switch_to.window(main_window)
    ##### OPEN THE NEXT PAGE #####
    driver.get(next_page)
    # Refresh the next-page link from the page just loaded; without this the
    # loop would keep reloading the same URL forever.
    try:
        next_page = driver.find_element_by_css_selector('a.sa.sa-nextpage.tip').get_attribute('href')
    except Exception:
        next_page = None

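# Both this Selenium script and several spiders above call `imprime_datos` and
# `veri`, which are defined elsewhere in the project. The versions below are
# purely illustrative sketches inferred from usage; the real implementations
# may differ.
def imprime_datos(titulo, fecha, cantante, album, referer, infringing):
    # Log the fields collected for one post before it is stored.
    print('Titulo: {}\nFecha: {}\nCantante: {}\nAlbum: {}\nReferer: {}\nInfringing: {}\n'
          .format(titulo, fecha, cantante, album, referer, infringing))

def veri(infringing):
    # Assumed sanity check on the infringing link: non-empty and URL-shaped.
    return bool(infringing) and infringing.startswith('http')
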
def parse(self, response):
    id = '0'
    # Log the page currently being crawled.
    print('\n######## Page ' + str(self._num_pagina) + ' ########')
    for art in response.xpath('//*[@id="content"]/div[1]/article'):
        band = True
        titulo = art.css('h2 > a ::text').get()
        fecha = art.css('p > span > a ::text').get()
        album, cantante = separa_titulo(titulo, '-')
        referer = art.css('div > div.post-entry-content > a ::attr(href)').get()
        id = self.get_id(referer, '/', 3)
        infringing = self.get_infr(id)
        if referer is None:
            referer = art.css('div > div > p > strong > a ::attr(href)').get()
        if referer is None:
            # No direct link found: walk the anchors inside the post body instead.
            for p in (art.css('div.post-entry-content > strong > span > span > a')
                      or art.css('div.post-entry-content > p > a')):
                band = False
                referer = p.css('::attr(href)').get()
                id = self.get_id(referer, '/', 3)
                infringing = self.get_infr(id)
                if self.comprueba_refer(referer, 'open'):
                    r = art.css('div.post-entry-content > p')
                    if r.css('strong'):
                        r = r.css('strong > span')
                        referer = r.css('a ::attr(href)').get()
                        id = self.get_id(referer, '/', 3)
                        infringing = self.get_infr(id)
                        # Store the item.
                        if veri(infringing):
                            if not self.c.existe_inf(infringing, self.id_domin):
                                self.c.inserta_item(titulo, cantante, album, referer,
                                                    infringing, fecha, self.id_domin)
                    else:
                        id = self.get_id(referer, '/', 3)
                        infringing = self.get_infr(id)
                    self.imprime_datos(titulo, fecha, cantante, album, referer, infringing)
                else:
                    id = self.get_id(referer, '/', 3)
                    infringing = self.get_infr(id)
        else:
            if self.comprueba_refer(referer, '?'):
                infringing = self.get_Mega(referer)
        if band:
            # Store the item.
            if veri(infringing):
                if not self.c.existe_inf(infringing, self.id_domin):
                    self.c.inserta_item(titulo, cantante, album, referer,
                                        infringing, fecha, self.id_domin)
            self.imprime_datos(titulo, fecha, cantante, album, referer, infringing)
    self._num_pagina += 1
    try:
        next_page = response.css('a.next.page-numbers ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
    except Exception:
        print('There was a problem opening the next page')

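# This last spider also leans on project helpers that are not shown here
# (get_id, get_infr, get_Mega, comprueba_refer); in the spider they are methods
# on the class. The two module-level sketches below are assumptions inferred
# from the call sites, not the real implementations: get_id(url, '/', 3) reads
# like "take the n-th piece of the URL path", and comprueba_refer(referer, texto)
# reads like a simple substring test.
def get_id(url, sep, indice):
    # Return the piece of `url` at position `indice` after splitting on `sep`.
    if url is None:
        return None
    partes = url.split(sep)
    return partes[indice] if len(partes) > indice else None

def comprueba_refer(referer, texto):
    # True when `texto` occurs somewhere in the referer URL.
    return referer is not None and texto in referer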