class Discografiasmega(scrapy.Spider):
    """Spider for discografiasmega.com.

    Walks the article listing, opens each post, extracts the MEGA
    download links and stores them in the database via ``Inserta_Datos``.
    """
    name = 'Discografiasmega'
    _num_pagina = 1
    id_domin = 0
    start_urls = ['https://www.discografiasmega.com/']
    # Returns the id of the most recent domain row for this start URL.
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Iterate the article cards, follow each post, then paginate."""
        ##### WALK THE "ARTICLES" #####
        for art in response.css('div.archive-main.archive-masonry article'):
            referer = art.css('h2 > a ::attr(href)').get()
            yield Request(referer, meta={'referer': referer},
                          callback=self.parse_attr)
        next_page = response.css('a.next.page-numbers ::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_attr(self, response):
        """Extract every 'MEGA' entry of a post and record its links."""
        i = 0
        Fecha = date.today().strftime("%d %B, %Y")
        for T in response.xpath(".//strong[contains(text(), 'MEGA')]"):
            i += 1
            Titulo = self.Limpia_titulo(T.css('::text').get())
            Cantante, Album = separa_titulo(Titulo, '–')
            if i == 1:
                # The first MEGA <strong> carries the link in its parent node.
                padre = T.xpath('..')
                Infringing = padre.css('a ::attr(href)').get()
            imprime_datos(Titulo, '', Cantante, Album,
                          response.meta['referer'], Infringing)
            # Insert the infringing link into the DB.
            Inserta_Datos(Titulo, Cantante, Album, response.meta['referer'],
                          Infringing, Fecha, self.id_domin)
            # Follow the infringing link with Selenium.
            self.Datos_Selenium(Titulo, Cantante, Album,
                                response.meta['referer'], Infringing, Fecha, i)

    def Datos_Selenium(self, Titulo, Cantante, Album, Referer, Infringing,
                       Fecha, i):
        """Open the i-th short link of the post with Selenium, follow it,
        and store both the short link and the resolved MEGA link.

        The driver is always quit, even when a Selenium call raises
        (the original code leaked the browser process on any exception).
        """
        driver = webdriver.Chrome('C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
        try:
            driver.get(Infringing)
            Inf_short = driver.find_element_by_xpath(
                "//div[@class='link-container']/a[" + str(i) + "]").text
            # Follow the shortened infringing link.
            driver.get(Inf_short)
            imprime_datos(Titulo, '', Cantante, Album, Referer, Inf_short)
            # Insert the short infringing link into the DB.
            Inserta_Datos(Titulo, Cantante, Album, Referer, Inf_short,
                          Fecha, self.id_domin)
            Infringing_mega = Get_megaLink(driver)
            if Infringing_mega != False:
                imprime_datos(Titulo, '', Cantante, Album, Referer,
                              Infringing_mega)
                # Insert the resolved MEGA link into the DB.
                Inserta_Datos(Titulo, Cantante, Album, Referer,
                              Infringing_mega, Fecha, self.id_domin)
        finally:
            # Guarantee the Chrome process is released.
            driver.quit()

    def Limpia_titulo(self, Titulo):
        """Strip boilerplate words and trailing '[...]' from a post title."""
        Titulo = Titulo.replace('Descargar', '').replace('MEGA', '')
        Titulo = Titulo.split('[')[0]
        return Titulo
class elmanualnlhc(scrapy.Spider):
    """Spider for elmanualnlhc.wordpress.com.

    Scans each post for a 'Download' link, validates it and inserts it
    into the database when it is not already known.
    """
    name = 'elmanualnlhc'
    _num_pagina = 1
    id_domin = 0
    start_urls = ['https://elmanualnlhc.wordpress.com/']
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Scan the post divs of one page, then request the next page."""
        ##### WALK THE DIVS #####
        for div in response.css('div.narrowcolumn > div'):
            infringing = div.css('div >p > a ::text').extract_first()
            ##### CHECK THAT INFRINGING HAS TEXT #####
            if infringing is not None:
                ##### IF IT CONTAINS THE WORD DOWNLOAD #####
                if infringing.find('Download') != -1 or infringing.find(
                        'download') != -1:
                    ##### GRAB THE REMAINING DATA #####
                    Titulo = div.css('h2 ::attr(title)').get()
                    referer = div.css('h2 ::attr(href)').get()
                    Fecha = div.css('p.postmetadata ::text').get()
                    Fecha = Fecha.replace('\n\t\t\t', '')
                    ##### GRAB THE INFRINGING LINK #####
                    infringing = div.css(
                        'div >p > a ::attr(href)').extract_first()
                    ##### SKIP IMAGES #####
                    # BUGFIX: the original used `or`, which is true for every
                    # URL (a '.png' link still lacks '.jpg'), so images were
                    # never filtered. Both extensions must be absent.
                    if infringing.find('.png') == -1 and infringing.find(
                            '.jpg') == -1:
                        ##### CHECK THE LINK IS VALID #####
                        if veri(infringing) == True:
                            ##### CLEAN THE TITLE TEXT #####
                            if Titulo is not None:
                                Titulo = Titulo.replace('\xa0', ' ')
                            ##### SPLIT ARTIST AND ALBUM FROM THE TITLE #####
                            Cantante, Album = separa_titulo(Titulo, '–')
                            print((Titulo, Cantante, Album, referer,
                                   infringing, Fecha))
                            ##### INSERT INTO THE DB IF NOT PRESENT #####
                            if c.existe_inf(infringing) == False:
                                c.inserta_item(Titulo, Cantante, Album,
                                               referer, infringing, Fecha,
                                               self.id_domin)
                                v.muestra_item_guardado(Titulo)
        ##### MOVE ON TO THE NEXT PAGE #####
        self._num_pagina += 1
        try:
            next_page = 'https://elmanualnlhc.wordpress.com/page/{}/'.format(
                self._num_pagina)
            yield response.follow(next_page, callback=self.parse)
        except Exception:
            # Best-effort pagination: a malformed URL just ends the crawl.
            pass
class playcorridos(scrapy.Spider):
    """Spider for playcorridos.com.

    Resolves zippyshare/userscloud links with Selenium and mediafire
    links with a follow-up Scrapy request, then stores them in the DB.
    """
    name = 'playcorridos'
    _num_pagina = 1
    id_domin = 0
    Fecha = date.today().strftime("%d %B, %Y")
    start_urls = ['http://playcorridos.com/']
    # Returns the id of the most recent domain row for this start URL.
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Walk the article cards of one page and dispatch per file host."""
        print('##### PÁGINA #{} #####'.format(self._num_pagina))
        ##### WALK THE "ARTICLES" #####
        for art in response.css('article.item-list'):
            Titulo = art.css('h2 > a ::text').get()
            Titulo = self.Limpia_titulo(Titulo)
            Cantante, Album = separa_titulo(Titulo, '–')
            Titulo, Cantante, Album = self.acentos(Titulo, Cantante, Album)
            referer = art.css('h2 > a ::attr(href)').get()
            for a in art.css('p > a'):
                try:
                    Inf = a.css('::attr(href)').get()
                    if Inf.find('zippyshare') != -1:
                        Infringing = self.zippy(Inf)
                        imprime_datos(Titulo, '', Cantante, Album, referer,
                                      Infringing)
                        # Insert the infringing link into the DB.
                        Inserta_Datos(Titulo, Cantante, Album, referer,
                                      Infringing, self.Fecha, self.id_domin)
                    elif Inf.find('mediafire') != -1:
                        yield Request(Inf,
                                      meta={
                                          'Titulo': Titulo,
                                          'Cantante': Cantante,
                                          'Album': Album,
                                          'Referer': referer
                                      },
                                      callback=self.mediaFire)
                    elif Inf.find('userscloud') != -1:
                        Infringing = self.userCloud(Inf)
                        imprime_datos(Titulo, '', Cantante, Album, referer,
                                      Infringing)
                        # Insert the infringing link into the DB.
                        Inserta_Datos(Titulo, Cantante, Album, referer,
                                      Infringing, self.Fecha, self.id_domin)
                except Exception:
                    # Best-effort per link: skip hosts that fail to resolve.
                    continue
        next_page = response.css('span#tie-next-page > a ::attr(href)').get()
        if next_page:
            self._num_pagina += 1
            yield response.follow(next_page, callback=self.parse)

    def userCloud(self, Inf):
        """Resolve a userscloud page to its direct link via Selenium.

        The driver is always quit, even on failure (the original leaked
        the browser on any exception between start and quit).
        """
        driver = webdriver.Chrome(
            'C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
        try:
            driver.get(Inf)
            time.sleep(2)
            try:
                # The interstitial button is not always present; best-effort.
                element = driver.find_element_by_css_selector(
                    "button.btn.btn-inverse.btn-icon-stacked")
                driver.execute_script("arguments[0].click();", element)
            except Exception:
                pass
            time.sleep(3)
            element = driver.find_element_by_css_selector(
                "button#btn_download")
            driver.execute_script("arguments[0].click();", element)
            time.sleep(1)
            sel = driver.execute_script(
                "return document.getElementsByClassName('ribbon-heading ribbon-default top-left-right')[0].innerHTML;"
            )
            Infringing = self.get_atr(sel).strip()
        finally:
            driver.quit()
        return Infringing

    def zippy(self, Inf):
        """Resolve a zippyshare page to the href of its download button."""
        driver = webdriver.Chrome(
            'C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
        try:
            driver.get(Inf)
            time.sleep(0.5)
            Infringing = driver.find_element_by_id(
                "dlbutton").get_attribute('href')
        finally:
            driver.quit()
        return Infringing

    def mediaFire(self, response):
        """Callback for mediafire pages: extract the direct download link."""
        Infringing = response.css(
            'a.input.popsok ::attr(href)').extract_first().strip()
        imprime_datos(response.meta['Titulo'], '', response.meta['Cantante'],
                      response.meta['Album'], response.meta['Referer'],
                      Infringing)
        # Insert the infringing link into the DB.
        Inserta_Datos(response.meta['Titulo'], response.meta['Cantante'],
                      response.meta['Album'], response.meta['Referer'],
                      Infringing, self.Fecha, self.id_domin)

    def get_atr(self, texto):
        """Pull the attribute value out of an HTML snippet like
        ``...="value" onclick...`` (used on the userscloud ribbon)."""
        if texto:
            texto = texto.split('onclick')[0]
            texto = texto.split('=')[1].replace('"', '')
        return texto

    def Limpia_titulo(self, Titulo):
        """Keep only the part of the title before the first '('."""
        if Titulo:
            Titulo = Titulo.split('(')[0]
        return Titulo

    def get_Album(self, Texto):
        """Return the album portion after '–', or the whole text."""
        Texto = Texto.strip()
        try:
            Album = Texto.split('–')[1]
            return Album
        except IndexError:
            return Texto

    def acentos(self, Titulo, Cantante, Album):
        """Strip accents from title, artist and album."""
        Titulo = strip_accents(Titulo)
        Cantante = strip_accents(Cantante)
        Album = strip_accents(Album)
        return Titulo, Cantante, Album
class discografiaspormega(scrapy.Spider):
    """Spider for discografiaspormega.com.

    Uses Selenium to pair each post's titles with its short links and
    resolve them to their final URLs.
    """
    name = 'discografiaspormega'
    _num_pagina = 1
    id_domin = 0
    start_urls = ['https://www.discografiaspormega.com/']
    # Returns the id of the most recent domain row for this start URL.
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Walk the article cards of one page, then paginate."""
        print('##### PÁGINA #{} #####'.format(self._num_pagina))
        ##### WALK THE "ARTICLES" #####
        for art in response.css('div#content article'):
            referer = art.css('a ::attr(href)').get()
            yield Request(referer, meta={'referer': referer},
                          callback=self.parse_attr)
        next_page = response.css('a.next.page-numbers ::attr(href)').get()
        if next_page:
            self._num_pagina += 1
            yield response.follow(next_page, callback=self.parse)

    def parse_attr(self, response):
        """Open the post with Selenium, collect every other <strong> as a
        title, then open each link in a new tab to capture its final URL.

        The driver is always quit (the original leaked the browser when
        any Selenium call raised).
        """
        i = 0
        Titulos = []
        Fecha = date.today().strftime("%d %B, %Y")
        driver = webdriver.Chrome(
            'C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
        try:
            driver.get(response.url)
            # Every odd-positioned <p><strong> holds a title
            # (renamed loop variable: the original shadowed builtin `str`).
            for strong in driver.find_elements_by_xpath('//p/strong'):
                i += 1
                if i % 2 != 0:
                    Titulos.append(strong.text)
                else:
                    continue
            i = 0
            tam = len(Titulos)
            for a in driver.find_elements_by_css_selector('p > a'):
                if i == 0:
                    # The first anchor is not a download link; skip it.
                    i += 1
                    continue
                if i == tam:
                    break
                inf = a.get_attribute('href')
                # Open the short link in a new tab and let it redirect.
                driver.execute_script("window.open(arguments[0]);", inf)
                driver.switch_to.window(driver.window_handles[1])
                time.sleep(1)
                Infringing_mega = driver.current_url
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                Titulo = self.Limpia_titulo(Titulos[i])
                Cantante, Album = separa_titulo(Titulo, '–')
                imprime_datos(Titulo, '', Cantante, Album, response.url,
                              Infringing_mega)
                # Insert the infringing link into the DB.
                Inserta_Datos(Titulo, Cantante, Album, response.url,
                              Infringing_mega, Fecha, self.id_domin)
                i += 1
        finally:
            driver.quit()

    def Limpia_titulo(self, Titulo):
        """Strip boilerplate words and trailing '[...]' from a post title."""
        Titulo = Titulo.replace('Descargar', '').replace('MEGA', '')
        Titulo = Titulo.split('[')[0]
        return Titulo
    # Tail of a function whose definition lies above this chunk:
    # release the browser and report failure.
    driver.quit()
    return False


def busca_todas_categorias(driver):
    # Walk every category link in the site's sub-menu, open each in a new
    # tab and scrape it with extrae_categoria.
    for a in driver.find_elements_by_css_selector('ul.sub-menu > li > a'):
        cate_ref = a.get_attribute('href')
        driver.execute_script("window.open(arguments[0]);", cate_ref)
        driver.switch_to.window(driver.window_handles[1])
        extrae_categoria(driver)
        # NOTE(review): quit() inside the loop kills the whole browser after
        # the first category — confirm whether close() was intended.
        driver.quit()


def busca_por_categoria(driver, cate_ref):
    # Open a single category URL in a new tab, scrape it, then shut the
    # browser down.
    driver.execute_script("window.open(arguments[0]);", cate_ref)
    driver.switch_to.window(driver.window_handles[1])
    extrae_categoria(driver)
    driver.quit()


# --- Script entry: scrape barboflacmusic.com ---
fecha = date.today().strftime("%d %B, %Y")
url = 'https://www.barboflacmusic.com/'
##### TAKE THE LATEST ID FROM THE DOMAINS TABLE IN THE DB #####
id_domin = retorna_dominio(url)
##### OPEN THE BROWSER #####
driver = webdriver.Chrome('C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
driver.get(url)
#print(get_mega('https://ouo.io/VlrKab'))
#cate_ref = 'https://www.barboflacmusic.com/category/salsa/page/{}/'.format(pag)
busca_por_categoria(driver, 'https://www.barboflacmusic.com/category/vallenato/')
class mp3teca(scrapy.Spider):
    """Spider for mp3teca.com.

    Maps each track page to its yyy-music.com download id, follows it,
    and stores the resulting infringing link in the DB.
    """
    name = 'mp3teca'
    _num_pagina = 2
    id_domin = 0
    start_urls = ['https://mp3teca.com/mp3s/']
    inf_url = 'http://yyy-music.com/d/'
    hoy = date.today().strftime("%d %B, %Y")
    custom_settings = {'CONCURRENT_REQUESTS': 10, 'DOWNLOAD_DELAY': 0.8}
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Walk the track list of one page, then request the next page."""
        for li in response.css('div#content > div ul > li'):
            referer = li.css('a ::attr(href)').get()
            Titulo = li.css('a ::text').get()
            Titulo = self.give_emoji_free_text(Titulo)
            Cantante, Cancion = separa_titulo(Titulo, '–')
            Cantante = self.give_emoji_free_text(Cantante)
            # Renamed from `id`, which shadowed the builtin.
            track_id = self.get_id(referer)
            url = self.inf_url + track_id
            yield Request(url,
                          meta={
                              'referer': referer,
                              'Titulo': Titulo,
                              'Cantante': Cantante,
                              'Cancion': Cancion,
                              'Fecha': self.hoy
                          },
                          callback=self.parse_attr)
        ##### MOVE ON TO THE NEXT PAGE #####
        self._num_pagina += 1
        try:
            next_page = 'https://mp3teca.com/mp3s/page/{}/'.format(
                self._num_pagina)
            yield response.follow(next_page, callback=self.parse)
        except Exception:
            # Best-effort pagination: a malformed URL just ends the crawl.
            pass

    def parse_attr(self, response):
        """Extract the download button href and insert it when valid/new."""
        infringing = response.css('a.btn-nwo ::attr(href)').get()
        ##### CHECK THE LINK IS VALID #####
        if veri(infringing) == True:
            if c.existe_ref(response.meta['referer']) == False:
                imprime_datos(response.meta['Titulo'],
                              response.meta['Fecha'],
                              response.meta['Cantante'],
                              response.meta['Cancion'],
                              response.meta['referer'], infringing)
                if c.inserta_item(response.meta['Titulo'],
                                  response.meta['Cantante'],
                                  response.meta['Cancion'],
                                  response.meta['referer'], infringing,
                                  response.meta['Fecha'],
                                  self.id_domin) == True:
                    v.muestra_item_guardado(response.meta['Titulo'])

    def give_emoji_free_text(self, text):
        """Drop every whitespace-separated word that contains an emoji.

        (Loop variables renamed: the original used `str`, shadowing the
        builtin, which would break any later use of str() in this scope.)
        """
        allchars = [ch for ch in text]
        emoji_list = [ch for ch in allchars if ch in emoji.UNICODE_EMOJI]
        clean_text = ' '.join([
            word for word in text.split()
            if not any(em in word for em in emoji_list)
        ])
        return clean_text

    def get_id(self, ref):
        """Return the track id: the 5th path segment of the referer URL."""
        ref = ref.split('/')
        return ref[4]
class musiconworldoffmx(scrapy.Spider):
    """Spider for musiconworldoffmx.com.

    Follows each post, records every album link, and resolves each one to
    its MEGA link with Selenium.
    """
    name = 'musiconworldoffmx'
    _num_pagina = 1
    id_domin = 0
    start_urls = ['http://musiconworldoffmx.com/']
    # Returns the id of the most recent domain row for this start URL.
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Walk the post links of one page, then paginate."""
        print('##### PÁGINA #{} #####'.format(self._num_pagina))
        ##### WALK THE "ARTICLES" #####
        for art in response.css('div.entry-content a'):
            referer = art.css('a ::attr(href)').get()
            yield Request(referer, callback=self.parse_attr)
        next_page = response.css('div.nav-previous > a ::attr(href)').get()
        if next_page:
            self._num_pagina += 1
            yield response.follow(next_page, callback=self.parse)

    def parse_attr(self, response):
        """Record each album link of a post and its resolved MEGA link.

        BUGFIX: the original created a new Chrome driver per album inside
        the loop and never called quit(), leaking one browser process per
        album; the driver is now always quit in a finally block.
        """
        try:
            Titulo = response.css('h2.entry-title > a ::text').get()
            Cantante = self.Limpia_titulo(Titulo)
            meta_text = response.css('div.entry-meta ::text').extract()
            Fecha = (meta_text[0].strip() + ' ' + meta_text[1].strip()).strip()
            for alb in response.css('div.entry-content > p a'):
                Album = alb.css('::text').get()
                Album = self.get_Album(Album)
                Inf = alb.css('::attr(href)').get()
                imprime_datos(Titulo, Fecha, Cantante, Album, response.url,
                              Inf)
                # Insert the infringing link into the DB.
                Inserta_Datos(Titulo, Cantante, Album, response.url, Inf,
                              Fecha, self.id_domin)
                driver = webdriver.Chrome(
                    'C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
                try:
                    driver.get(Inf)
                    Infringing_mega = Get_megaLink(driver)
                finally:
                    driver.quit()
                imprime_datos(Titulo, Fecha, Cantante, Album, response.url,
                              Infringing_mega)
                # Insert the resolved MEGA link into the DB.
                Inserta_Datos(Titulo, Cantante, Album, response.url,
                              Infringing_mega, Fecha, self.id_domin)
        except Exception:
            # Best-effort per post: keep crawling when one post fails.
            pass

    def Limpia_titulo(self, Titulo):
        """Strip boilerplate words and anything after '(' from a title."""
        if Titulo:
            Titulo = Titulo.replace('Discografia', '').replace(
                'Discograifa', '').replace('MEGA', '')
            # Single split+strip replaces the original's redundant
            # duplicated split('(') with identical results.
            Titulo = Titulo.split('(')[0].strip()
        return Titulo

    def get_Album(self, Texto):
        """Return the album portion after '–', or the whole text."""
        Texto = Texto.strip()
        try:
            return Texto.split('–')[1]
        except IndexError:
            return Texto