def serializar_ocorrencias(lista_urls, palavra):
    # Holds the list of occurrence records for every link
    ocorrencias = []
    # Run this block for each link in the list
    for link in lista_urls:
        # Fetch the page and parse it as HTML
        pag_url = str(uReq(link).read())
        pag_soup = soup(pag_url, "html.parser")
        # Number of <p> paragraphs in the HTML that mention the word
        ct = paragrafos(pag_soup, palavra)
        # Build a dictionary with the link and its occurrence count
        ocorrencia = {"url": link, "qtd_de_ocorrencias": str(ct)}
        # Append the dictionary to the occurrence list
        ocorrencias.append(ocorrencia)
        # Print a separator to the console to mark the switch to the next URL
        print("*************************************************************************")
    # Return the list of dictionaries (serialize to JSON with json.dumps if needed)
    # return json.dumps(ocorrencias, indent=4)
    return ocorrencias
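# The function above relies on a paragrafos(pag_soup, palavra) helper that is not shown in this
# excerpt. A minimal sketch of what such a helper might look like (hypothetical implementation,
# assuming it counts <p> tags whose text contains the word, case-insensitively):
def paragrafos(pag_soup, palavra):
    # Count <p> elements whose text mentions the word
    return sum(1 for p in pag_soup.find_all("p") if palavra.lower() in p.get_text().lower())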
def crawl(href, count):
    print(get_time() + ", Parsing Link: " + href)
    req = Request(href, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")
    heading = page_soup.find('center')
    content_container = page_soup.find(
        'table', attrs={'style': "background:transparent; text-align:justify;"}).prettify()
    table = soup(content_container, "html.parser")
    para = table.find_all('p')

    # name = str(count) + ".html"
    with io.open("para_hn.html", "a", encoding="utf-8") as fout:
        # fout.write("\n\n" + heading.text + "\n\n")
        # for i in para:
        #     print(para[i])
        fout.write(str(para))

    link = page_soup.find('img', attrs={'alt': 'Next.png'})
    next_link = link.findPrevious('a')['href']
    complete_link = "http://hi.krishnakosh.org" + quote(next_link, safe='%,/')
    return complete_link
# -*- coding: utf-8 -*-
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48'

# Grabs my URL in a variable. Opens a client, then closes it to avoid holding resources open.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# Holds my HTML parsing (keep the soup object itself; calling .encode("utf-8") here would
# return bytes and break the findAll call below)
page_soup = soup(page_html, "html.parser")

# Grabs each product. Used for graphics cards at the moment.
containers = page_soup.findAll("div", {"class": "item-container"})

# Export into a CSV file now.
filename = "GraphicsCards.csv"
f = open(filename, "w")
headers = "brand, product_name, shipping\n"
f.write(headers)

# Goes through the whole page, grabbing certain things I'm looking for.
# TO DO:
#   - Add image recognition for the title of the product
#   - Add in reviewed-only devices for this one
for container in containers:
    brand = container.div.div.a.img["title"]
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

# Steam search URL for Windows specials, paginated
STEAM_URL = 'https://store.steampowered.com/search/?os=win&specials=1&page='
CONTAINERS = []

# Grab and hold all the selected pages of Steam special sales
for i in range(10):
    print("grabbing page #" + str(i + 1))
    uClient = uReq(STEAM_URL + str(i + 1))
    page_html = uClient.read()
    uClient.close()
    parsed_html = soup(page_html, "html.parser")
    # Accumulate results across pages; assigning to a fresh local list here would
    # keep only the last page's entries.
    CONTAINERS += parsed_html.findAll("div", {"class": "responsive_search_name_combined"})

filename = "SteamSpecialSale.csv"
f = open(filename, "w")
headers = "Game_Title, Original_Price, Sale_Price\n"
f.write(headers)

# Go through every collected entry and write it into the CSV.
for container in CONTAINERS:
    game_title = container.findAll("span", {"class": "title"})[0].text.strip()
    game_price = container.findAll("div", {"class": "search_price"})[0].text.strip()
    original_price = game_price[:game_price.rfind("$")].strip()
# Downloads the MTG JSON database to a sister directory.
print("Calling function download_json_db()...\n")

# This function requires:
#   from io import BytesIO
#   import zipfile
#   import json
#   from urllib.request import urlopen as uReq
#   import os

# Fields
host_url = 'https://mtgjson.com/json/AllCards.json.zip'
internal_filename = 'AllCards.json'

# Download the file
print("Downloading JSON library from mtgjson.com...")
client = uReq(host_url)
zip_byte_file = client.read()
client.close()
print("Download... Done!\n")

# Extract the file
print("Extracting zip folder....")
zip_folder = zipfile.ZipFile(BytesIO(zip_byte_file))
json_file = zip_folder.read(internal_filename)

# Convert to JSON text: decode the bytes, parse, then re-serialize
json_file = json_file.decode("utf-8")
json_file = json.loads(json_file)
json_file = json.dumps(json_file)
print("Extraction... Complete!\n")
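# Nothing in the excerpt above actually writes the JSON to disk. A minimal sketch of the
# "sister directory" save step (the directory name and layout are assumptions, not part of
# the original script):
import os

out_dir = os.path.join(os.path.dirname(os.getcwd()), "mtg_json")  # assumed sister directory
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, internal_filename), "w", encoding="utf-8") as out_file:
    out_file.write(json_file)
print("Saved " + internal_filename + " to " + out_dir)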
# )
# ################# insert to DB code ############################

print("Scraping data from: Zappa.co.il")
myurl = "https://www.zappa-club.co.il/content/the-show-must-go-on/"

# Pick the write path depending on whether this runs on the Linux Azure VM or the Windows laptop
if path.exists("C:/Users/omerm/Desktop/Hackorona/Data-Scrapping"):
    the_path = "C:/Users/omerm/Desktop/Hackorona/Data-Scrapping"
else:
    the_path = "/root/bin/datascrape"

# Grab the page
req = Request(myurl, headers={'User-Agent': 'Mozilla/5.0'})
uClient = uReq(req)
page_html = uClient.read()
uClient.close()

# Parse the page
page_soup = soup(page_html, "html.parser")

# Create the CSV file (note: the encoding must be "utf-16", not "utf=16")
filename = the_path + "/data/Zappa.csv"
with open(filename, "w", encoding="utf-16") as f:
    # CSV headers
    headers = "Date., Time., Title., Categories., Url\n"
    f.write(headers)

    ### URL ###
from urllib.request import Request, urlopen as uReq
from bs4 import BeautifulSoup as soup

req = Request(
    'https://freelancehunt.com/projects/skill/parsing-dannyih/169.html?page=1',
    headers={'User-Agent': 'Mozilla/5.0'})
response = uReq(req)
page_html = response.read()
response.close()

page_soup = soup(page_html, 'html.parser')
tr_list = page_soup.findAll('tr', {'style': 'vertical-align: top'})
td_list = page_soup.findAll('td', {'class': 'text-center'})

titles = []
for tr in tr_list:
    title = tr.td.a.text
    titles.append(title)

price_list = []
# Every fourth centered cell holds a price; enumerate avoids the first-match lookup
# that td_list.index(td) would do when cells repeat.
for i, td in enumerate(td_list):
    if i % 4 == 0:
        price_list.append(td.span.text.strip())


def final(titles, prices):
    x = 0
    lines = []
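# The body of final() is cut off in the excerpt above. Separately, a hypothetical standalone
# helper (name and output format are assumptions) showing one way to pair each title with its
# scraped price:
def pair_titles_with_prices(titles, prices):
    # Zip the two lists into "title - price" lines
    return ["{} - {}".format(title, price) for title, price in zip(titles, prices)]

print("\n".join(pair_titles_with_prices(titles, price_list)))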
def redesextract(): from settings import my_url, name, doc, last, RH, COD_PRODUCTO import init, bs4, logging, sys, re from urllib.request import urlopen as uReq from bs4 import BeautifulSoup as soup global contredes uClient = uReq(my_url) page_html = uClient.read() uClient.close() all = 0 a = 0 x = 0 y = 0 auto = "" vincula = "" insti = "" vinculain = "" page_soup = soup(page_html, "html.parser") containers = page_soup.findAll("table") for a in range(0, len(containers)): buscaReds = containers[a].h3 #print(buscaReds) try: if buscaReds.text == "Redes de conocimiento especializado": all = a #print(all) break except AttributeError: pass if all != 0: containerb = containers[all] container = containerb.findAll("blockquote") for x in range(0, len(container)): cont = container[x] info_red = cont.text #Nombre de la red index1 = info_red.find("Nombre de la red ") + 17 index2 = info_red.find( "\xa0\r\n Tipo de red") Nombrered = info_red[index1:index2] # Tipo de Red index1 = info_red.find("Tipo de red") + 11 index2 = info_red.find( ",\xa0\r\n Creada el:") Tipored = info_red[index1:index2] # Lugar Red index1 = info_red.find( "\xa0\r\n en ") + 42 index2 = info_red.find(" \xa0 \r\n") LugarRed = info_red[index1:index2] #Fecha de Realización inicio y fin index1 = info_red.find("Creada el:") + 10 index2 = index1 + 4 AnoRedini = info_red[index1:index2] if AnoRedini == "," or AnoRedini == ",\xa0\r\n": MesRedini = "" AnoRedini = "" FechaRedini = "" MesRedfin = "" AnoRedfin = "" FechaRedfin = "" else: index1 = index1 + 5 index2 = index1 + 2 MesRedini = info_red[index1:index2] index1 = info_red.find("Creada el:") + 10 index2 = index1 + 10 FechaRedini = info_red[index1:index2] index1 = info_red.find(",", index1, index1 + 58) + 40 index2 = index1 + 4 AnoRedfin = info_red[index1:index2] if AnoRedfin == " " or AnoRedfin == ",": MesRedfin = "" AnoRedfin = "" FechaRedfin = "" else: index1 = index1 + 5 index2 = index1 + 2 MesRedfin = info_red[index1:index2] index1 = info_red.find("Creada el:") + 10 index1 = info_red.find(",", index1, index1 + 58) + 40 index2 = index1 + 10 FechaRedfin = info_red[index1:index2] init.rel_persona_producto_colciencias.append(str(RH) + ";"\ + str(COD_PRODUCTO) + ";"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',"1".replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Nombrered.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "0" + ","\ + "" + ";"\ + "" + ";"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarRed.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "" + ";"\ + "\n") init.inrel_personas_producto_colciencias.append( \ "REPLACE INTO `uapa_db`.`rel_personas_producto_colciencias`(`cod_rel_per_prod_col`,`cod_producto`,`cod_rh`,`cod_tipo_producto`,`nombre_producto`,`evento_asociado`,`datos_complementarios`,`lugar`,`ano`,`ambito`,`palabras_clave`,`areas`,`sectores`,`coautores`,`vincula_coautores`,`editorial`,`volumen`,`paginas`,`doi`,`finalidad`,`instituciones_asociadas`,`tipo_vinculacion_institucion`) VALUES" + "('"+ str(RH) + str(COD_PRODUCTO) + "'," + str(COD_PRODUCTO) + ","\ + "'" + str(RH) + "',"\ + 
re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',"7".replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Nombrered.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ","\ + "null" + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarRed.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ");\n") init.colciencias_apropiacion.append(str(RH) + str(COD_PRODUCTO) + ";"\ + str(RH) + ";"\ + str(COD_PRODUCTO) + ";"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "\n") init.incolciencias_apropiacion.append( \ "REPLACE INTO `uapa_db`.`colciencias_apropiacion`(`cod_colciencias_apropiacion`,`cod_rh`,`cod_rel_per_prod_col`,`fecha_ini`,`fecha_fin`,`cod_tipo_evento`) VALUES" + "('" + str(COD_PRODUCTO) + "',"\ + "'" + str(RH) + "',"\ + "'" + str(RH) + str(COD_PRODUCTO) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "0" + ");\n") COD_PRODUCTO = COD_PRODUCTO + 1 else: logging.info(' El Docente ' + name + ' ' + last + ' no tiene Redes Asociadas') contredes = [COD_PRODUCTO]
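# The extractors above (and the event extractor below) repeat the same sanitising chain for
# every field: strip quotes, collapse spaces, swap ";" for "|", drop newlines, and remove
# characters outside a whitelist. A small helper (hypothetical name `clean`, not part of the
# original code) would express that intent once; a minimal sketch:
import re

def clean(value):
    # Same steps as the inline chain used when building the CSV/SQL rows
    value = value.replace('"', "").replace("'", "").strip()
    value = value.replace(";", "|").replace("\r\n", "").replace("\n", "").replace("\r", "")
    value = re.sub(' +', ' ', value)
    return re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]', r'', value)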
def main(): database = r"C:\Users\Kenny\Documents\webscrape\Phish.db" #create a database connection conn = create_connection(database) with conn: cur = conn.cursor() #These are the variables responsible for tracking ID in each respective table. songCount = 1 venueCount = 1 showCount = 1 setCount = 0 setlistEntryCount = 1 #Set-up with the first URL url = "https://phish.net/setlists/phish-december-02-1983-harris-millis-cafeteria-university-of-vermont-burlington-vt-usa.html" while (url != "https://phish.net/setlist/jump/next?showdate=2020-02-23"): uClient = uReq(url) page_html = uClient.read() uClient.close() #Hold the parsed-HTML in a BeautifulSoup data structure page_soup = soup(page_html, "html.parser") #gets the URL for the next show urlDivContainer = page_soup.findAll("div", {"class": "well clearfix"}) urlDiv = urlDivContainer[1] aTag = urlDiv.find_all('a')[1] url = "https://phish.net" + aTag["href"] #Get the date from the site-header dateDiv = page_soup.find("div", {"class": "setlist-date-long"}) aTag = dateDiv.find_all('a')[1] dateString = str(aTag) date = dateString[len(dateString) - 14:len(dateString) - 4] #Formatted in "MM/DD/YYYY" #Reorganizes date-format to "YYYY-MM-DD" year = date[len(date) - 4:len(date)] month = date[0:2] day = date[3:5] date = year + "-" + month + "-" + day #Get the Venue venueDiv = page_soup.find("div", {"class": "setlist-venue"}) aTag = venueDiv.find('a').contents[0] venue = str(aTag.contents[0]).title() #Get the Location locDiv = page_soup.find("div", {"class": "setlist-location"}) aTag = locDiv.find_all("a") city = aTag[0].contents[0] state = aTag[1].contents[0] #Iterate through sets setlistBody = page_soup.find("div", {"class": "setlist-body"}) p = setlistBody.find("p") #Works through and gets the set or song information setlistSongCount = 1 setInfo = "" for tag in p.find_all(["a", "span"]): #If next tag is span tag, it will hold the setInfo instead of a song if (tag.name == "span"): setInfo = tag.contents[0] setlistSongCount = 1 setCount += 1 else: #Otherwise, the next tag will be an <a>, which holds a song. song = tag.contents[0] songID = 1 #Check whether or not to add song to Song table val = valueCheck.checkSong(conn, song) if (val == False): songToInsert = (songCount, song) SQLInsert.insert_song(conn, songToInsert) songID = songCount songCount += 1 else: cur.execute( "SELECT songID FROM Songs WHERE songName LIKE (?)", (song, )) songID = cur.fetchall()[0][0] #Handles whether songs are separated by ",", ">", or "->" sib = tag.next_sibling sibString = str(sib) while (sibString[0] == "<"): sib = sib.next_sibling sibString = str(sib) segue = False transition = False if ("->" in sibString): transition = True elif (">" in sibString): segue = True setlistEntryToInsert = (setlistEntryCount, showCount, setCount, setInfo, setlistSongCount, songID, segue, transition) SQLInsert.insert_setlist(conn, setlistEntryToInsert) setlistSongCount += 1 setlistEntryCount += 1 #Check the Venues table if this Venue already exists val = valueCheck.checkVenue(conn, venue) if (val == False): venueToInsert = (venueCount, venue, city, state) SQLInsert.insert_venue(conn, venueToInsert) venueID = venueCount venueCount += 1 #Otherwise, get the venueID from the Venues table else: cur.execute( "SELECT venueID FROM Venues WHERE venueName LIKE (?)", (venue, )) venueID = cur.fetchall()[0][0] #Add the Show to the Shows table showToInsert = (showCount, date, venueID) SQLInsert.insert_show(conn, showToInsert) showCount += 1
def index(request): this_user = User.objects.get(id=request.session['id']) site_links = [] site_headlines = [] user_list = this_user.news.all() first_item = this_user.news.first() if first_item != None: categories = [first_item.list_name] else: categories = [] links_dict = {} #Generate list of all of the users news categories for list_name in user_list: for category in categories: if categories == [] or category != list_name.list_name: print('in IF') categories.append(list_name.list_name) else: print('in ELSE') continue # print('////////////////////////////////////', categories) #Generate an object where there is a key for each category and it's value will be a list of valuable links for category in categories: content_list = [] for entry in user_list: site_list = [] site_links = [] site_headlines = [] if entry.list_name == category: #open connection to page, copy html as local variable, close connection uClient = uReq(f'{ entry.site }') page_html = uClient.read() uClient.close() site_list.append(entry.site) #BeautifulSoup Magic soup = bsoup(page_html, 'html.parser') for story in soup.find_all('a'): # print('//////////// entry.site = ', entry.site) # print(story.get('href')) if story.get('href') == None: continue if entry.site in story.get('href') and re.search( r'\b[0-9]{4}\b', story.get('href')) != None: site_links.append(story.get('href')) # print('{{{{{{{{{{{{', story.text) for h1 in soup.find_all('a'): for h1 in story.find_all('h1'): values = h1.text for h2 in soup.find_all('a'): for h2 in story.find_all('h2'): values = h2.text for h3 in soup.find_all('a'): for h3 in story.find_all('h3'): values = h3.text for h4 in soup.find_all('a'): for h4 in story.find_all('h4'): values = h4.text if values != '': site_headlines.append(values) #Write the links and headlines to the context. print('================', len(site_headlines), len(site_links)) for content in range(len(site_headlines)): content_list.append( [site_headlines[content], site_links[content]]) links_dict.update({f'{category}': content_list}) # context.update({ f'{i}_links': site_links }) # context.update({ f'{i}_headlines': site_headlines}) context = {'links_dict': links_dict, 'categories': categories} print('/////////////context = ', context) return render(request, 'tracker/dashboard.html', context)
def evenextract(): from settings import my_url, name, doc, last, RH, COD_PRODUCTO import init, bs4, logging, sys, re global conteventos LOG_FILENAME = './Logs/Registros.log' logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s") LEVELS = { 'debug': logging.DEBUG, 'info': logging.INFO, 'warning': logging.WARNING, 'error': logging.ERROR, 'critical': logging.CRITICAL } if len(sys.argv) > 1: level_name = sys.argv[1] level = LEVELS.get(level_name, logging.NOTSET) logging.basicConfig(level=level) from urllib.request import urlopen as uReq from bs4 import BeautifulSoup as soup uClient = uReq(my_url) page_html = uClient.read() uClient.close() all = 0 a = 0 x = 0 y = 0 conteventos = 0 auto = "" vincula = "" insti = "" vinculain = "" page_soup = soup(page_html, "html.parser") containers = page_soup.findAll("table") for a in range(0, len(containers)): buscaeventos = containers[a].h3 #print(buscaeventos) try: if buscaeventos.text == "Eventos científicos": all = a #print(all) break except AttributeError: pass if all != 0: containerb = containers[all] container = containerb.findAll("table") for x in range(0, len(container)): cont = container[x] info_evento = cont.td.text #Nombre del evento index1 = info_evento.find("Nombre del evento:") + 18 index2 = info_evento.find("Tipo de evento:") NombreEvento = info_evento[index1:index2] # Tipo de Evento index1 = info_evento.find("Tipo de evento:") + 15 index2 = info_evento.find(" Ámbito:") TipoEvento = info_evento[index1:index2] if TipoEvento.strip() == "Otro": TipoEvento = "1" elif TipoEvento.strip() == "Taller": TipoEvento = "2" elif TipoEvento.strip() == "Congreso": TipoEvento = "3" elif TipoEvento.strip() == "Encuentro": TipoEvento = "4" elif TipoEvento.strip() == "Seminario": TipoEvento = "5" elif TipoEvento.strip() == "Simposio": TipoEvento = "6" else: logging.critical('Añadir a Tipo_Evento: ' + TipoEvento) print("ALERTA: Revisar el archivo Registros.log") #Ambito index1 = info_evento.find( "\xa0\r\n Ámbito: " ) + 51 index2 = info_evento.find( "\xa0 \r\n Realizado el:" ) Ambito = info_evento[index1:index2] #Fecha de Realización inicio y fin index1 = info_evento.find("Realizado el:") + 13 index2 = index1 + 4 AnoEventoini = info_evento[index1:index2] if AnoEventoini == "," or AnoEventoini == ",\xa0\r\n": MesEventoini = "" AnoEventoini = "" FechaEventoini = "" MesEventofin = "" AnoEventofin = "" FechaEventofin = "" else: index1 = index1 + 5 index2 = index1 + 2 MesEventoini = info_evento[index1:index2] index1 = info_evento.find("Realizado el:") + 13 index2 = index1 + 10 FechaEventoini = info_evento[index1:index2] index1 = info_evento.find(",", index1, len(info_evento)) + 48 index2 = index1 + 4 AnoEventofin = info_evento[index1:index2] if AnoEventofin == " \xa0\r\n" or AnoEventofin == ",": MesEventofin = "" AnoEventofin = "" FechaEventofin = "" else: index1 = index1 + 5 index2 = index1 + 2 MesEventofin = info_evento[index1:index2] index1 = info_evento.find("Realizado el:") + 13 index1 = info_evento.find(",", index1, len(info_evento)) + 48 index2 = index1 + 10 FechaEventofin = info_evento[index1:index2] #Lugar Evento index1 = info_evento.find( " \xa0\r\n en " ) + 51 index2 = info_evento.find(" \xa0 - \xa0\r\n") LugarEvento = info_evento[index1:index2] b_eventos = cont.findAll("td") #Autores autores = b_eventos[3].findAll("li") if len(autores) == 0: auto = "" vincula = "" else: for z in range(0, len(autores)): autor = autores[z].text index1 = autor.find("Nombre:") + 8 index2 = autor.find( "\r\n Rol en el 
evento: " ) if len(auto) == 0: auto = autor[index1:index2] else: auto = auto + ", " + autor[index1:index2] index1 = autor.find("Rol en el evento: ") + 18 index2 = autor.find("\r\n ", index1, len(autor)) if len(vincula) == 0: vincula = autor[index1:index2] else: vincula = vincula + ", " + autor[index1:index2] #Instituciones Instituciones = b_eventos[2].findAll("li") if len(Instituciones) == 0: insti = "" vinculain = "" else: for z in range(0, len(Instituciones)): institu = Instituciones[z].text index1 = institu.find("Nombre de la institución:") + 25 index2 = institu.find( "\r\n Tipo de vinculación" ) if len(insti) == 0: insti = institu[index1:index2] else: insti = insti + ", " + institu[index1:index2] index1 = institu.find("Tipo de vinculación") + 19 index2 = institu.find("'", index1, len(institu)) if len(vinculain) == 0: vinculain = institu[index1:index2] else: vinculain = vinculain + ", " + institu[index1:index2] #Productos Asociados productos = b_eventos[1].findAll("li") if len(productos) == 0: init.rel_persona_producto_colciencias.append(str(RH) + ";"\ + str(COD_PRODUCTO) + ";"\ + "0" + ";"\ + "" + ";"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "" + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "\n") init.colciencias_apropiacion.append(str(RH) + str(COD_PRODUCTO) + ";"\ + str(RH) + ";"\ + str(COD_PRODUCTO) + ";"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' 
',MesEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "\n") init.inrel_personas_producto_colciencias.append( \ "REPLACE INTO `uapa_db`.`rel_personas_producto_colciencias`(`cod_rel_per_prod_col`,`cod_producto`,`cod_rh`,`cod_tipo_producto`,`nombre_producto`,`evento_asociado`,`datos_complementarios`,`lugar`,`ano`,`ambito`,`palabras_clave`,`areas`,`sectores`,`coautores`,`vincula_coautores`,`editorial`,`volumen`,`paginas`,`doi`,`finalidad`,`instituciones_asociadas`,`tipo_vinculacion_institucion`) VALUES" + "('"+ str(RH) + str(COD_PRODUCTO) + "'," + str(COD_PRODUCTO) + ","\ + "'" + str(RH) + "',"\ + "0" + ","\ + "null" + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "');\n") init.incolciencias_apropiacion.append( \ "REPLACE INTO `uapa_db`.`colciencias_apropiacion`(`cod_colciencias_apropiacion`,`cod_rh`,`cod_rel_per_prod_col`,`fecha_ini`,`fecha_fin`,`cod_tipo_evento`) VALUES" + "('" + str(COD_PRODUCTO) + "',"\ + "'" + str(RH) + "',"\ + "'" + str(RH) + str(COD_PRODUCTO) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + 
re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ");\n") COD_PRODUCTO = COD_PRODUCTO + 1 else: for y in range(0, len(productos)): prod = productos[y].text index1 = prod.find("Nombre del producto:") + 20 index2 = prod.find("Tipo de producto:") NombreProducto = prod[index1:index2] index1 = prod.find("Tipo de producto:") + 17 index2 = prod.find("\r\n", index1, len(prod)) Tipopub = prod[index1:index2] if Tipopub == "Producción bibliográfica - Trabajos en eventos (Capítulos de memoria) - Completo": Tipopub = "2" elif Tipopub == "Producción técnica - Presentación de trabajo - Comunicación": Tipopub = "3" elif Tipopub == "Demás trabajos - Demás trabajos - Póster": Tipopub = "4" elif Tipopub == "Producción técnica - Presentación de trabajo - Conferencia": Tipopub = "5" elif Tipopub == "Producción técnica - Presentación de trabajo - Ponencia": Tipopub = "6" elif Tipopub == "Producción bibliográfica - Trabajos en eventos (Capítulos de memoria) - Resumen": Tipopub = "12" elif Tipopub == "Producción técnica - Presentación de trabajo - Congreso": Tipopub = "13" elif Tipopub == "Producción técnica - Presentación de trabajo - Simposio": Tipopub = "14" elif Tipopub == "Producción técnica - Presentación de trabajo - Seminario": Tipopub = "15" elif Tipopub == "Producción técnica - Presentación de trabajo - Otro": Tipopub = "16" else: logging.critical('Añadir a Tipo_Producto: ' + TipoEvento) print("ALERTA: Revisar el archivo Eventos.log") init.rel_persona_producto_colciencias.append(str(RH) + ";"\ + str(COD_PRODUCTO) + ";"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Tipopub.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreProducto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "" + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + "" + ";" \ + 
re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "\n") init.inrel_personas_producto_colciencias.append( \ "REPLACE INTO `uapa_db`.`rel_personas_producto_colciencias`(`cod_rel_per_prod_col`,`cod_producto`,`cod_rh`,`cod_tipo_producto`,`nombre_producto`,`evento_asociado`,`datos_complementarios`,`lugar`,`ano`,`ambito`,`palabras_clave`,`areas`,`sectores`,`coautores`,`vincula_coautores`,`editorial`,`volumen`,`paginas`,`doi`,`finalidad`,`instituciones_asociadas`,`tipo_vinculacion_institucion`) VALUES" + "('"+ str(RH) + str(COD_PRODUCTO) + "'," + str(COD_PRODUCTO) + ","\ + "'" + str(RH) + "',"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Tipopub.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreProducto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "null" + ","\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "');\n") init.colciencias_apropiacion.append(str(RH) + str(COD_PRODUCTO) + ";"\ + str(RH) + ";"\ + str(COD_PRODUCTO) + ";"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + 
re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \ + "\n") init.incolciencias_apropiacion.append( \ "REPLACE INTO `uapa_db`.`colciencias_apropiacion`(`cod_colciencias_apropiacion`,`cod_rh`,`cod_rel_per_prod_col`,`fecha_ini`,`fecha_fin`,`cod_tipo_evento`) VALUES" + "('" + str(COD_PRODUCTO) + "',"\ + "'" + str(RH) + "',"\ + "'" + str(RH) + str(COD_PRODUCTO) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\ + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ");\n") COD_PRODUCTO = COD_PRODUCTO + 1 auto = "" vincula = "" insti = "" vinculain = "" else: logging.info(' El Docente ' + name + ' ' + last + ' no tiene Eventos Asociados') conteventos = [COD_PRODUCTO]
import csv
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

Url = 'https://karki23.github.io/Weather-Data/Albury.html'
pageHtml = uReq(Url)
# Parse the page (kept in its own name so the imported "soup" class is not shadowed)
page_soup = soup(pageHtml, "html.parser")
table = page_soup.find_all("table", {"class": "tablepress tablepress-id-10 tablepress-responsive-phone"})

with open('Albury.csv', 'w', newline='') as csvfile:
    f = csv.writer(csvfile)
    f.writerow(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am',
                'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'])
    for x in table:
        table_body = x.find('tbody')
        rows = table_body.find_all('tr')
        for tr in rows:
            data = []
            cols = tr.find_all('td')
            for td in cols:
                data.append(td.text.strip())
            f.writerow(data)
            print(data)
price_list = []
address1_list = []
address2_list = []
room_list = []
shower_list = []
car_list = []
size_list = []

for i in tqdm(range(1, 7)):
    # sleep is used to make sure that I don't spam the server too much
    time.sleep(2)
    try:
        my_url = "https://www.domain.com.au/sale/?suburb=caulfield-vic-3162,elsternwick-vic-3185,gardenvale-vic-3185,glen-huntly-vic-3163,mckinnon-vic-3204,murrumbeena-vic-3163,ormond-vic-3204,carnegie-vic-3163,bentleigh-vic-3204,bentleigh-east-vic-3165&ptype=apartment&bedrooms=2-any&price=0-750000&excludeunderoffer=1&carspaces=1-any&ssubs=0&page={}".format(i)
        req = urllib.request.Request(my_url, headers={'User-Agent': "Magic Browser"})
        con = uReq(req)
        page_html = con.read()
        con.close()

        # html parsing
        page_soup = soup(page_html, 'html.parser')
        containers = page_soup.find_all(class_="css-qrqvvg")

        for container in containers:
            # Get price
            try:
                price_container = container.find_all('p', class_="css-mgq8yx")
                price = price_container[0].text.strip().encode('ascii', 'ignore').decode("utf-8")
                price_list.append(price)
                print(price)
            except IndexError:
                print('None')
                price_list.append('NG')
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
from time import ctime as ct

# This file aims at scraping a YouTube channel in order to get data such as
# the number of subscribers, the number of views and the starting date of this
# particular channel.

channelName = "Mister Geopolitix"
channelUrl = "https://www.youtube.com/channel/UCX9lsdsTKfTi1eqoyL-RS-Q/about"

# Opening connection, grabbing the page
uClient = uReq(channelUrl)
pageHtml = uClient.read()
uClient.close()

# Using the BeautifulSoup module we parse the source code of the webpage
pageSoup = soup(pageHtml, "html.parser")

# We are seeking the 'about-stat' span sections (the attribute filter must be a dict,
# {"class": "about-stat"}, not a set)
stats = pageSoup.findAll("span", {"class": "about-stat"})

# Values extraction
nbSubs = stats[0].find("b").text.replace('\xa0', ' ')
nbViews = stats[1].find("b").text.replace('\xa0', ' ')
startDate = stats[2].text.replace('\xa0', ' ')

# Save data in a file with the current date
record = open("log.txt", "a")
date = ct()  # current time
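# The excerpt stops before anything is written to log.txt. A minimal sketch of the
# append-and-close step (the line format is an assumption, not from the original script):
record.write("{} | {} | subs: {} | views: {} | started: {}\n".format(
    date, channelName, nbSubs, nbViews, startDate))
record.close()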
"KKTC-GİRNE": "85", "TÜRKİSTAN-KAZAKİSTAN": "86", "KKTC-GÜZELYURT": "87", "BİŞKEK-KIRGIZİSTAN": "88", "KOMRAT-MOLDOVA": "89", "KKTC-LEFKE": "90", "ÜSKÜP-MAKEDONYA": "91", } return switcher.get(cityName, "0") f = open("university-data.txt", "a") mainUrl = 'https://www.basarisiralamalari.com/universite-taban-puanlari-2020-ve-basari-siralamalari-osym/' # Opens up the connection and gets the html page from it uClient = uReq(mainUrl) pageHtml = uClient.read() # Closes the connection uClient.close() pageSoup = soup(pageHtml.decode('utf-8', 'ignore'), 'html.parser') uniTable = pageSoup.find('table', {'id': 'basaritable'}) wholeTbody = uniTable.tbody allRows = wholeTbody.findAll('tr') allURLs = [] for row in allRows: if row.find('tr', attrs={'style': 'height: 46px;'}):
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
#from textblob import TextBlob

# Input item name from user and remove spaces
print("Search for item..?")
item_name_full = input()
item_name = item_name_full.replace(' ', '')

# Link to scrape data from
my_url = 'https://www.flipkart.com/search?q=' + item_name + '&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'

# Open connection
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# Use BeautifulSoup to parse the html site
page_soup = soup(page_html, "html.parser")

# In the browser, use 'inspect element' to get the class of the part we want to scrape
no_page_soup = page_soup.findAll("div", {"class": "_2zg3yZ"})

# Print the total page count text, like 'Page 1 of 8'
'''print(no_page_soup[0].span.text)'''

# Find only the total no. of pages, like '8', and then convert to int
num_pages_str = no_page_soup[0].span.text  # e.g. 'Page 1 of 8' -> ['Page', '1', 'of', '8']
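# A minimal sketch of the conversion step described in the comment above (assumes the text
# keeps the 'Page 1 of N' shape):
num_pages = int(num_pages_str.split()[-1])  # take the last token, e.g. '8'
print("Total pages:", num_pages)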
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = ["https://www.theravive.com/cities/ma/"]

# Opening up connection, grabbing the page
for i in my_url:
    uClient = uReq(i)
    page1_html = uClient.read()
    uClient.close()

    # html parsing
    page1_soup = soup(page1_html, "html.parser")

    # Grabs each profile
    containers = page1_soup.findAll("div", {"class": "profile-info"})
    container = containers[0]

    filename = "ma.csv"
    f = open(filename, "w")
    headers = "Name, Profession, Contact_Info\n"
    f.write(headers)

    for container in containers:
        address_container = container.findAll("div", {"class": "profile-address"})
        address = address_container[0].text.strip()

        name_container = container.findAll("h4", {"class": "green-text"})
a = 0
while (choice != 4):
    print("1.DO YOU WANT TO SEE PRODUCTS PREVAILING IN FLIPKART?")
    print("2.DO YOU WANT TO SEE PRODUCTS PREVAILING IN SNAPDEAL?")
    print("3.DO YOU WANT TO SEE WHICH E-SITE IS EFFICIENT?")
    print("4.EXIT")
    print("***********************************************************************")
    choice = int(input())
    if (choice == 1):
        i = 0
        print("NOW YOU ARE IN FLIPKART")
        my_url = 'https://www.flipkart.com/search?q=kurti&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'
        uClient = uReq(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "_3liAhj _1R0K0g"})
        print("total number of dresses in this page:", len(containers))
        container = containers[0]
        for container in containers:
            name1 = container.findAll("a", {"class": "_2cLu-l"})
            name = name1[0].text.strip()
            print(name)
            price1 = container.findAll("div", {"class": "_1vC4OE"})
            price = price1[0].text.strip()
            print(price)
            rat1 = container.findAll("span", {"class": "_2_KrJI"})
            try:
def index(): if request.method == 'POST': try: searchString = request.form['content'].replace(" ", "") flipkart_url = "https://www.flipkart.com/search?q=" + searchString uClient = uReq(flipkart_url) flipkartPage = uClient.read() uClient.close() flipkart_html = bs(flipkartPage, "html.parser") bigboxes = flipkart_html.findAll("div", {"class": "bhgxx2 col-12-12"}) del bigboxes[0:3] box = bigboxes[0] productLink = "https://www.flipkart.com" + box.div.div.div.a['href'] print(productLink) prodRes = requests.get(productLink) prod_html = bs(prodRes.content, "html.parser") prod = prod_html.find('div', {'class': '_29OxBi'}).h1.span.get_text() data = prod_html.find('div', {'class': 'swINJg _3nrCtb'}) parent = data.find_parent() url = parent.get('href') url = 'https://www.flipkart.com' + url req_data = requests.get(url) all_reviews = bs(req_data.content, 'html.parser') pages = all_reviews.find_all('div', {'class': '_2zg3yZ _3KSYCY'}) # extracts all the pages url info page = int(pages[0].span.get_text().split()[-1]) if page > 3: page = 3 reviews = [] for i in range(0, page): # we iterate through all the pages commentboxes = all_reviews.find_all('div', {'class': "_1PBCrt"}) for commentbox in commentboxes: try: name = commentbox.div.div.find_all('p', {'class': '_3LYOAd _3sxSiS'})[0].text except: name = 'No Name' try: rating = commentbox.div.div.div.div.text except: rating = 'No Rating' try: commentHead = commentbox.div.div.div.p.text except: commentHead = 'No Comment Heading' try: comtag = commentbox.div.div.find_all('div', {'class': ''}) custComment = comtag[0].div.text except Exception as e: print("Exception while creating dictionary: ", e) mydict = {"Product": prod, "Name": name, "Rating": rating, "CommentHead": commentHead, "Comment": custComment} reviews.append(mydict) return render_template('results.html', reviews=reviews[0:(len(reviews) - 1)]) except Exception as e: print('The Exception message is: ', e) return 'something is wrong' else: return render_template('index.html')
def add_box_office(csv_in, csv_out): ratings = pd.read_csv(csv_in, encoding='ISO-8859-1') ratings['Box Office Gross USA'] = 0 ratings['Box Office Gross USA'].fillna(value=0, inplace=True) infl_string = 'Inflation Adjusted Box Office Gross USA' ratings[infl_string] = 0 ratings[infl_string].fillna(value=0, inplace=True) context = ssl._create_unverified_context() for i in range(len(urls)): url = urls[i] # Open connection, read html, close connection uClient = uReq(url, context=context) page_html = uClient.read() uClient.close() # html parser page_soup = soup(page_html, 'html.parser') # print(page_soup.body.span) title = page_soup.title.get_text() print(title) movie_year = '2020' if (not 'TV Series' in title and not 'TV Mini-Series' in title): open_paren = title.find('(') close_paren = title.find(')') movie_year = title[open_paren + 1:close_paren] print('Year: ', movie_year) box_office_gross = page_soup.find('div', {'id': 'main_bottom'}) box_office_gross = box_office_gross.find('div', {'id': 'titleDetails'}) box_office_gross = box_office_gross.findAll('div', {'class': 'txt-block'}) print('num divs: ', len(box_office_gross)) j = 0 has_gross_usa = False for j in range(4, len(box_office_gross)): if ('Gross USA' in box_office_gross[j].get_text()): has_gross_usa = True break if (has_gross_usa): box_office_gross = box_office_gross[j].get_text() dollar_index = box_office_gross.find('$') box_office_gross = box_office_gross[dollar_index + 1:] box_office_gross = box_office_gross.replace(',', '') else: box_office_gross = 0 t = datetime.now().year - int(movie_year) # Inflation adjusted box office gross infl_adj_gross = float(box_office_gross) * (pow( (1 + 1.84545 / 100), t)) box_off_loc = ratings.columns.get_loc('Box Office Gross USA') ratings.iat[i, box_off_loc] = float(box_office_gross) ratings.iat[i, box_off_loc + 1] = float(infl_adj_gross) # Format numbers to $xxx,xxx,xxx.xx infl_adj_gross = '${:,.2f}'.format(infl_adj_gross) box_office_gross = '${:,.2f}'.format(float(box_office_gross)) print("Box office gross: ", box_office_gross) print("Box office gross (inflation adjusted): ", infl_adj_gross) print() print('----------------------------------') print() ratings.to_csv(csv_out)
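# The inflation adjustment in add_box_office() compounds an assumed 1.84545% annual rate over
# the years since release. A small standalone sketch of the same formula (the helper name and
# the example figures are illustrative only):
def inflation_adjusted(gross_usd, movie_year, annual_rate_pct=1.84545):
    # Same formula as above: gross * (1 + r/100) ** (current_year - movie_year)
    from datetime import datetime
    t = datetime.now().year - int(movie_year)
    return float(gross_usd) * (1 + annual_rate_pct / 100) ** t

# e.g. a $100,000,000 gross from 2000 compounds by (1.0184545 ** years_elapsed)
print('${:,.2f}'.format(inflation_adjusted(100000000, 2000)))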
set_url = root_url + set_name_url
print("url: " + set_url)

# spoof header
req = Request(set_url)
req.add_header("User-Agent", "Mozilla/5.0")

###################################
# download and parse page into soup
###################################
scrape_date = str(datetime.now())
web_client = uReq(req)
print("web_client: " + str(web_client.getcode()))
# Note: this rebinds the name "soup" from the BeautifulSoup alias to the parsed page,
# so the parser alias is no longer available afterwards.
soup = soup(web_client.read(), "html.parser")
web_client.close()

#######################
# isolate pricing table
#######################
html_tables = soup.find_all("table")
# lots of tables. We only need the pricing table
html_table = html_tables[8]
c = 0
while True:
    if c == 0:
        cf_id = input("Enter your codeforces id : ")
        c += 1
    else:
        cf_id = input("Enter your codeforces id again\n")
    link = 'https://www.codeforces.com/profile/' + cf_id
    check = requests.get(link)
    display = "INVALID Codeforces ID"
    # An unknown handle redirects back to the Codeforces home page
    if check.url == "https://codeforces.com/":
        print(display.center(40, '*'))
    else:
        break

uClient = uReq(link)
link = "https://codeforces.com/api/user.info?handles=" + cf_id
ml = requests.get(link)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")
c = page_soup.find_all('li')
p = json.loads(ml.text)
dic = p['result']
ls = dic[0]

# User id
print("User-ID:", cf_id)
# User name
print("Name:", ls['firstName'], ls['lastName'])
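# A minimal sketch of reading a couple more fields from the same user.info response; rating
# and rank are only present for rated handles, so .get() is used defensively (an assumption
# about this particular profile, not part of the original script):
print("Rating:", ls.get('rating', 'Unrated'))
print("Rank:", ls.get('rank', 'Unrated'))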
    review_date = item.select("small")[0].text
    review_title = item.select(
        "strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])

    print("Name:", name)
    print("Location:", location)
    print("Review_date:", review_date)
    print("Review_Title:", review_title)
    print("Review_Data:", review_data)

    row = [name, location, review_date, review_title, review_data]
    csv_writer.writerow(row)

# --- get next url ---

uclient = uReq(url)
page_html = uclient.read()
uclient.close()

soup = BeautifulSoup(page_html, "html.parser")

container = soup.find("ul", {"class": "pages table"})
all_li = container.findAll("li")
if all_li:
    last_div = all_li[-1]
    content = last_div.getText()
    content = int(content)

container = soup.findAll("li", {"class": "next"})
li = container[0].find("a", {"class": "btn btn-link"}).attrs['href']

# ---- get data ---
"div", { "id": "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl04_nameRow" }) transName = transDiv.find("div", {"class": "value"}) csvDict['Trans_Card'] = transName.text.strip() with open('cards.csv', 'a', encoding='utf8', newline='') as csvfile: scrapeTo = csv.DictWriter(csvfile, fieldnames=card_values) scrapeTo.writerow(csvDict) return my_url = 'http://gatherer.wizards.com/Pages/Search/Default.aspx?text=+%5Btransform%5D' # opening connection, grabbing page uClient = uReq(my_url) page_html = uClient.read() # html parsing page_soup = soup(page_html, "lxml") page_nums = page_soup.find("div", {"class": "paging"}) # grabs cardInfo div siteURL = "http://gatherer.wizards.com" # scrapeTo = csv.writer(open('card.csv', 'wb')) cardCount = 0 current_page = 0
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

# Url of page to scrape
my_url = "http://magic.wizards.com/en/events/coverage/gpman16/mono-blue-prison-with-martin-muller-2016-05-28"

# Grab page html from URL
web_client = uReq(my_url)

# convert raw html to a soup object
page_soup = BeautifulSoup(web_client.read(), "html.parser")

# Extract deck
deck_soup = page_soup.find_all("div", {"class": "deck-list-text"})

# Extract card count quantities from deck
card_counts = page_soup.find_all("a", {"class": "card-name"})

# Extract card information from deck

input("Press any key to end.")
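# Nothing in the excerpt uses the extracted tags yet. A minimal sketch of inspecting what was
# found (it would naturally slot in before the final input() pause; the class names come from
# the find_all calls above, the printed format is an assumption):
for card in card_counts:
    print(card.text.strip())
print("Found", len(card_counts), "card entries in", len(deck_soup), "deck list block(s)")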
def index(): searchString = request.form['content'].replace( " ", "") # obtaining the search string entered in the form try: # dbConn = pymongo.MongoClient("mongodb://localhost:27017/") # opening a connection to Mongo #db = dbConn['crawlerDB'] # connecting to the database called crawlerDB # reviews = db[searchString].find({}) # searching the collection with the name same as the keyword # if reviews.count() > 0: # if there is a collection with searched keyword and it has records in it # return render_template('results.html',reviews=reviews) # show the results to user #else: flipkart_url = "https://www.flipkart.com/search?q=" + searchString # preparing the URL to search the product on flipkart uClient = uReq( flipkart_url) # requesting the webpage from the internet flipkartPage = uClient.read() # reading the webpage uClient.close() # closing the connection to the web server flipkart_html = bs(flipkartPage, "html.parser") # parsing the webpage as HTML bigboxes = flipkart_html.findAll("div", { "class": "bhgxx2 col-12-12" }) # seacrhing for appropriate tag to redirect to the product link del bigboxes[ 0: 3] # the first 3 members of the list do not contain relevant information, hence deleting them. box = bigboxes[0] # taking the first iteration (for demo) productLink = "https://www.flipkart.com" + box.div.div.div.a[ 'href'] # extracting the actual product link prodRes = requests.get( productLink) # getting the product page from server prod_html = bs(prodRes.text, "html.parser") # parsing the product page as HTML commentboxes = prod_html.find_all( 'div', {'class': "_3nrCtb" }) # finding the HTML section containing the customer comments #table = db[searchString] # creating a collection with the same name as search string. Tables and Collections are analogous. #filename = searchString+".csv" # filename to save the details #fw = open(filename, "w") # creating a local file to save the details #headers = "Product, Customer Name, Rating, Heading, Comment \n" # providing the heading of the columns #fw.write(headers) # writing first the headers to file reviews = [] # initializing an empty list for reviews # iterating over the comment section to get the details of customer and their comments for commentbox in commentboxes: try: name = commentbox.div.div.find_all( 'p', {'class': '_3LYOAd _3sxSiS'})[0].text except: name = 'No Name' try: rating = commentbox.div.div.div.div.text except: rating = 'No Rating' try: commentHead = commentbox.div.div.div.p.text except: commentHead = 'No Comment Heading' try: comtag = commentbox.div.div.find_all('div', {'class': ''}) custComment = comtag[0].div.text except: custComment = 'No Customer Comment' #fw.write(searchString+","+name.replace(",", ":")+","+rating + "," + commentHead.replace(",", ":") + "," + custComment.replace(",", ":") + "\n") mydict = { "Product": searchString, "Name": name, "Rating": rating, "CommentHead": commentHead, "Comment": custComment } # saving that detail to a dictionary # x = table.insert_one(mydict) #insertig the dictionary containing the rview comments to the collection reviews.append( mydict) # appending the comments to the review list return render_template( 'results.html', reviews=reviews) # showing the review to the user except: return 'something is wrong'
def parse_deck(mtggoldfish_url): # grab and soup page page_client = uReq(mtggoldfish_url) print("Grabbing the html through the blind eternities...") page_soup = BeautifulSoup( page_client.read(), "html.parser" ) page_client.close() # Grabs the paper decklist with the prices print("Extracting table...") deck_table = page_soup.find_all( "div", {"id": "tab-paper"} ) # discards things around the table deck_table = deck_table[0].div.div.table # gets all the rows from the html table table_rows = deck_table.find_all("tr") decklist = {} # Loops through the rows print("Parsing rows...") for row in table_rows: columns = row.find_all("td") if(len(columns) == 4): # extracts features from each column quantity = columns[0].text.strip() card_name = columns[1].text.strip() color = "" try: color = columns[2].span.img['alt'].strip() except AttributeError: pass price = round( float( columns[3].text.strip() ), 2 # two decimal places ) card_key = card_name.lower() decklist[card_key] = { "card_name": card_name, "color": color, "price_amt": price, "price_date_utc": str(datetime.datetime.utcnow()), "deck_qty": quantity, } print("Parsed..." + card_name + ", " + color + ", " + quantity + ", " + str(price) ) print("Printing card names.") card_names = list(decklist.keys()) card_names.sort() print(card_names) print("Printing dictionary.") print(decklist)
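# A small usage sketch for the decklist structure built in parse_deck() above, assuming the
# function is extended to end with `return decklist` (it currently only prints). The helper
# name is hypothetical:
def deck_total_price(decklist):
    # Sum price * quantity over every parsed card, rounded to cents
    return round(sum(float(card["price_amt"]) * int(card["deck_qty"]) for card in decklist.values()), 2)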
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from email.mime.text import MIMEText
import smtplib
import sys

MY_URL = 'http://www.secretflying.com/posts/category/error-fare/'

USER_CLIENT = uReq(MY_URL)
PAGE_HTML = USER_CLIENT.read()
USER_CLIENT.close()

PAGE_SOUP = soup(PAGE_HTML, "html.parser")
ERROR_FARES_CONTENT = PAGE_SOUP.findAll(
    "div", {"class": "article-content-wrapper entry-main-content"})

ORIGINAL_TEXT_FILE = "errorfarelist.txt"
DIFFERENCES_TEXT_FILE = "errorfaresdiffs.txt"


# This writes the previous data from a text file into a list prior to the next function
# potentially finding updates.
def store_existing_deals_in_mem():
    with open(ORIGINAL_TEXT_FILE, "r") as F:
        current_deals_file = []
        for word in F:
            current_deals_file.append(word.strip('\n'))
    return current_deals_file


# The website page 'titles' are parsed, then the content is written to a text file
# (overwrites all data in the file).
def parse_and_write_to_file():
    with open(ORIGINAL_TEXT_FILE, "w+") as F:
        for deals in ERROR_FARES_CONTENT:
            try:
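# parse_and_write_to_file() is cut off above. Separately, a minimal sketch of the comparison
# step implied by DIFFERENCES_TEXT_FILE (the function name and diff format are assumptions):
def write_new_deals(old_deals, new_deals):
    # Append any deal title that was not in the previous snapshot
    with open(DIFFERENCES_TEXT_FILE, "a") as diff_file:
        for deal in new_deals:
            if deal not in old_deals:
                diff_file.write(deal + '\n')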
def main(): # Uses tweepy to access twitter to allow the ability to tweet consumer_key = #consumer key consumer_secret = #consumer secret access_token = #enter access token access_token_secret = #enter access password auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = tweepy.API(auth) # Stores URL my_url = 'https://www.snopes.com/category/facts/' # Opening connection, grab the page uClient = uReq(my_url) page_html = uClient.read() uClient.close() # parse page_soup = soup(page_html, "html.parser") # get specific part of html containers = page_soup.findAll("a", {"class": "article-link"}) title = [15] url = [15] validity = [15] # Grabs title from Snopes web page for container in containers[:10]: try: # Grabs the title of article x = 0 title.insert(x, str(container.h2.text)) except(NameError, IndexError, AttributeError): print("Error") # print(title[x]) x += 1 # Grabs validity from Snopes web page for container in containers[:10]: try: validity_container = container.findAll("span", {"itemprop": "reviewRating"}) x = 0 for valid in validity_container: validity.insert(x, valid.span.text) except(NameError, IndexError, AttributeError): print("Error") x += 1 # Creates link list that stores all url on web page links = [] # URL's we are actually interested in start here ind = 94 # # Grabs url from Snopes web page for container in containers[:10]: try: # Grabs all url tags on web page y = 0 for link in page_soup.findAll('a', attrs={'href': re.compile("^https://www.snopes.com")}): links.append(link.get('href')) # Iterates to corresponding url url.insert(y, links[ind]) ind += 1 except(NameError, IndexError, AttributeError): print("Error") # Test print to make sure three # for x in range(10): # print(title[x] + " " + validity[x] + " " + url[x]) # Creates tweet with articles found false by Snopes for x in range(10): if validity[x] in ('FALSE', 'MOSTLY FALSE'): api.update_status("FAKE NEWS ALERT!! \n" + title[x] + " " + url[x] + " #FAKENEWS #RESISTTRUMP #RESIST" " #STOPHATE #REALNEWS #TRUMP " "#POLITICS #RESIST #NEWS" " #RUSSIA" "ANTIRUSSIA" " #INVESTIGATE" ) else: continue print("--- %s seconds ---" % (time.time() - start_time))
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

current_date = date.today()
filename = "results {}_{}_{}.csv".format(current_date.day, current_date.month, current_date.year)
f = open(filename, "w")
headers = "suburb,address,bed,result,price,link\n"
f.write(headers)

all_letters = [
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
]

for letter in all_letters:
    RE_url = 'https://www.realestateview.com.au/sales-and-auction-results/victoria/' + letter + '/'

    # opens a connection to URL
    uClient = uReq(RE_url)
    # puts all the HTML into a container
    page_html = uClient.read()
    # closes connection
    uClient.close()

    # stores html as soup
    page_soup = soup(page_html, "html.parser")

    # finds relevant info on page
    containers = page_soup.findAll("tr", {"class": "auction-result-item"})

    # things you want: Suburb, address, bedrooms, result, price
    for container in containers:
        # suburb
        suburb_container = container.findAll("meta", {"itemprop": "addressLocality"})
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

# URL to be scraped. Has to be an edhrec page.
target_url = "https://edhrec.com/sets/akh/"

# Open connection, download page
web_client = uReq(target_url)

# Parse page into a soup data structure
print("Grabbing website: " + target_url)
page_soup = BeautifulSoup(web_client.read(), "html.parser")

# Close the web client
web_client.close()

card_frames = page_soup.find_all("div", {"class": "nw"})

out_filename = "scrape_edhrec_output.tsv"
f = open(out_filename, "w")
headers = "card_name\tin_decks\tprice\n"
f.write(headers)

for card_frame in card_frames:
    name_frame = card_frame.find_all("div", {"class": "nwname"})
    card_name = name_frame[0].text

    quantity_frame = card_frame.find_all("div", {"class": "nwdesc ellipsis"})
    quantity = quantity_frame[0].text
    quantity = quantity.replace(" decks", "")
"""Script created by following tutorial here: https://www.youtube.com/watch?v=XQgXKtPSzUI""" from urllib.request import urlopen as uReq from bs4 import BeautifulSoup as soup myurl = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards' filename = "products.csv" f = open(filename, "w") headers = "brand, product_name, shipping\n" f.write(headers) # Open connection and download page. uClient = uReq(myurl) # Download URL. page_html = uClient.read() # Downloaded HTML. uClient.close() # Close the client after downloading. # Parse HTML pagesoup = soup(page_html, "html.parser") # Parse the file as HTML. # Grab each product containers = pagesoup.findAll("div", {"class": "item-container"}) container = containers[0] #print(container.div.div.a.img["title"]) # Item title example. for container in containers: brand = container.div.div.a.img["title"] title_container = container.findAll("a", {"class": "item-title"}) product_name = title_container[0].text