Example #1
def serializar_ocorrencias(lista_urls, palavra):

	#Holds the list of occurrence records, one per link
	ocorrencias = []

	#The block below runs once for each link in the list
	for link in lista_urls:
		
		#Fetch the page and parse it as HTML
		pag_url = str(uReq(link).read())
		pag_soup = soup(pag_url, "html.parser")

		#Number of occurrences of the word in the page's <p> paragraphs
		ct = paragrafos(pag_soup,palavra)
		
		#Build a dictionary with the link and its occurrence count
		ocorrencia = {"url":link, "qtd_de_ocorrencias": str(ct)}
		#Append the dictionary to the list of occurrences
		ocorrencias.append(ocorrencia)

		#Print a separator to the console when moving on to the next URL
		print("*************************************************************************")


	#Return the list of dictionaries (JSON serialization left commented out below)
	#return json.dumps(ocorrencias, indent=4)
	return ocorrencias
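
The function above relies on uReq, soup and a paragrafos(pag_soup, palavra) helper defined elsewhere in that project. A minimal sketch of what such a helper might look like, assuming it simply counts case-insensitive occurrences of the word inside the page's <p> tags (the counting rule is an assumption, not stated in the original):

def paragrafos(pag_soup, palavra):
	# Hypothetical helper: count case-insensitive occurrences of `palavra`
	# in the text of every <p> tag of the parsed page.
	total = 0
	for p in pag_soup.find_all("p"):
		total += p.get_text().lower().count(palavra.lower())
	return total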
Example #2
def crawl(href,count):
	print(get_time() + ", Parsing Link: " + href)
	

	req = Request(href, headers={'User-Agent': 'Mozilla/5.0'})

	uClient = uReq(req)
	page_html = uClient.read()
	uClient.close()
	
	page_soup = soup(page_html, "html.parser")
	heading = page_soup.find('center')
	content_container = page_soup.find('table', attrs={'style' : "background:transparent; text-align:justify;"}).prettify()
	
	table = soup(content_container,"html.parser")	
	
	para = table.find_all('p')
	
	#name = str(count)+".html"
	with io.open("para_hn.html", "a", encoding="utf-8") as fout:
		#fout.write("\n\n" + heading.text + "\n\n")
		#for i in para:
		#	print(para[i])
		fout.write(str(para))
		

	link = page_soup.find('img', attrs={'alt' : 'Next.png'})
	next_link = link.findPrevious('a')['href']
	complete_link = "http://hi.krishnakosh.org" + quote(next_link, safe='%,/')

	return complete_link
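
crawl() returns the URL of the next page, so a caller is expected to feed that result back in. A minimal driver sketch, assuming a placeholder starting URL and an arbitrary page count (neither appears in the original):

# Hypothetical usage of crawl(): follow the "Next.png" links page by page.
href = "http://hi.krishnakosh.org/some-start-page"  # placeholder start URL
for count in range(1, 11):                          # arbitrary page count
	href = crawl(href, count)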
Example #3
# -*- coding: utf-8 -*-
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48'

#grabs my URL in a variable. Opens a client then closes it to not keep using resources
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

#Holds my HTML parsing
page_soup = soup(page_html, "html.parser")

#Grabs each product. Used for Graphics Card atm.
containers = page_soup.findAll("div",{"class":"item-container"})

#Export into a CSV file now.
filename = "GraphicsCards.csv"
f = open(filename, "w")

headers = "brand, product_name, shipping \n"
f.write(headers)

# Goes through the whole page, grabbing certain things I'm looking for.
# TO DO:
# ADD IMAGE RECOGNITION FOR TITLE OF PRODUCT
# ADD IN REVIEWED ONLY DEVICES FOR THIS ONE
for container in containers:
	brand = container.div.div.a.img["title"]
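	# --- Hedged sketch: the example is cut off here; one possible continuation. ---
	# The "item-title" and "price-ship" class names are assumptions about
	# Newegg's markup, not confirmed by the original snippet.
	title_link = container.find("a", {"class": "item-title"})
	product_name = title_link.text if title_link else ""
	ship_li = container.find("li", {"class": "price-ship"})
	shipping = ship_li.text.strip() if ship_li else ""
	f.write(brand + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()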
Example #4
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

#Steam URL... self-explanatory
STEAM_URL = 'https://store.steampowered.com/search/?os=win&specials=1&page='
CONTAINERS = []

#Grab, and hold all the selected pages for steam special sales
for i in range(10):
	print ("grabbing page #" + str(i+1))
	uClient = uReq(STEAM_URL + str(i+1))
	page_html = uClient.read()
	uClient.close()
	parsed_html = soup(page_html, "html.parser")
	CONTAINERS += parsed_html.findAll("div", {"class":"responsive_search_name_combined"})


filename = "SteamSpecialSale.csv"
f = open(filename, "w")

headers = "Game_Title, Original_Price, Sale_Price \n"
f.write(headers)

#Go through a whole page, looping through all entries needed.
#then write them into CSV.
for container in CONTAINERS:
	game_title = container.findAll("span", {"class":"title"})[0].text.strip()

	game_price = container.findAll("div", {"class":"search_price"})[0].text.strip()
	original_price = game_price[:game_price.rfind("$")].strip()
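	# --- Hedged sketch: the example is cut off here; one possible continuation. ---
	# Mirror the slice above: take the text after the last "$" as the sale price
	# and write one CSV row per game.
	sale_price = game_price[game_price.rfind("$"):].strip()
	f.write(game_title.replace(",", "|") + "," + original_price + "," + sale_price + "\n")

f.close()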
Example #5
# downloads the MTG json database to a sister directory.
print("Calling function download_json_db()...\n")
# this function requires....
# from io import BytesIO
# import zipfile
# import json
# from urllib.request import urlopen as uReq
# import os

# fields
host_url = 'https://mtgjson.com/json/AllCards.json.zip'
internal_filename = 'AllCards.json'

# downloads the file
print("Downloading JSON library from mtgjson.com...")

client = uReq(host_url)
zip_byte_file = client.read()
client.close()
print("Download... Done!\n")

# Extract the file
print("Extracting zip folder....")
zip_folder = zipfile.ZipFile(BytesIO(zip_byte_file))
json_file = zip_folder.read(internal_filename)

# Converts to JSON
json_file = json_file.decode("utf-8")
json_file = json.loads(json_file)
json_file = json.dumps(json_file)

print("Extraction... Complete!\n")
Example #6
#                      )
# ################# insert to DB code ############################

print("Scrapping data from: Zappa.co.il")

myurl = "https://www.zappa-club.co.il/content/the-show-must-go-on/"

#Pick the output path depending on whether we are on the Linux Azure VM or the Windows laptop
if path.exists("C:/Users/omerm/Desktop/Hackorona/Data-Scrapping"):
    the_path = "C:/Users/omerm/Desktop/Hackorona/Data-Scrapping"
else:
    the_path = "/root/bin/datascrape"

#Grabbing the page
req = Request(myurl, headers={'User-Agent': 'Mozilla/5.0'})
uClient = uReq(req)
page_html = uClient.read()
uClient.close()

#parses the info
page_soup = soup(page_html, "html.parser")

#create csv file
filename = the_path + "/data/Zappa.csv"
with open(filename, "w", encoding="utf-16") as f:

    #csv headers
    headers = "Date., Time., Title., Caterogies., Url\n"
    f.write(headers)

    ### URL ###
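    # --- Hedged sketch: the example is cut off here; one possible continuation. ---
    # Walk over event containers and write a CSV row per event. The
    # "product-item" class name and the field lookups are assumptions about the
    # Zappa site's markup, not confirmed by the original snippet.
    for event in page_soup.findAll("div", {"class": "product-item"}):
        link = event.find("a")
        if link is None or not link.has_attr("href"):
            continue
        title = link.text.strip().replace(",", "|")
        f.write(", , " + title + ", , " + link["href"] + "\n")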
Example #7
from urllib.request import Request, urlopen as uReq
from bs4 import BeautifulSoup as soup

req = Request(
    'https://freelancehunt.com/projects/skill/parsing-dannyih/169.html?page=1',
    headers={'User-Agent': 'Mozilla/5.0'})
response = uReq(req)
page_html = response.read()
response.close()

page_soup = soup(page_html, 'html.parser')

tr_list = page_soup.findAll('tr', {'style': 'vertical-align: top'})
td_list = page_soup.findAll('td', {'class': 'text-center'})

titles = []

for tr in tr_list:
    title = tr.td.a.text
    titles.append(title)

price_list = []
for position, td in enumerate(td_list):
    # every fourth cell holds the price for one project row
    if position % 4 == 0:
        price_list.append(td.span.text.strip())


def final(titles, prices):
    x = 0
    lines = []
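    # --- Hedged sketch: the example is cut off here; a likely completion. ---
    # Pair each title with the price at the same position, when one exists.
    for title in titles:
        if x < len(prices):
            lines.append(title + " - " + prices[x])
        else:
            lines.append(title)
        x += 1
    return lines


# Hypothetical usage of the sketch above:
print("\n".join(final(titles, price_list)))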
Example #8
def redesextract():
    from settings import my_url, name, doc, last, RH, COD_PRODUCTO
    import init, bs4, logging, sys, re
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup
    global contredes
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    all = 0
    a = 0
    x = 0
    y = 0
    auto = ""
    vincula = ""
    insti = ""
    vinculain = ""
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table")
    for a in range(0, len(containers)):
        buscaReds = containers[a].h3
        #print(buscaReds)
        try:
            if buscaReds.text == "Redes de conocimiento especializado":
                all = a
                #print(all)
                break
        except AttributeError:
            pass
    if all != 0:
        containerb = containers[all]
        container = containerb.findAll("blockquote")
        for x in range(0, len(container)):
            cont = container[x]
            info_red = cont.text
            #Network name
            index1 = info_red.find("Nombre de la red ") + 17
            index2 = info_red.find(
                "\xa0\r\n                                Tipo de red")
            Nombrered = info_red[index1:index2]
            # Network type
            index1 = info_red.find("Tipo de red") + 11
            index2 = info_red.find(
                ",\xa0\r\n                                Creada el:")
            Tipored = info_red[index1:index2]
            # Network location
            index1 = info_red.find(
                "\xa0\r\n                                    en ") + 42
            index2 = info_red.find(" \xa0 \r\n")
            LugarRed = info_red[index1:index2]
            #Start and end dates
            index1 = info_red.find("Creada el:") + 10
            index2 = index1 + 4
            AnoRedini = info_red[index1:index2]
            if AnoRedini == "," or AnoRedini == ",\xa0\r\n":
                MesRedini = ""
                AnoRedini = ""
                FechaRedini = ""
                MesRedfin = ""
                AnoRedfin = ""
                FechaRedfin = ""
            else:
                index1 = index1 + 5
                index2 = index1 + 2
                MesRedini = info_red[index1:index2]
                index1 = info_red.find("Creada el:") + 10
                index2 = index1 + 10
                FechaRedini = info_red[index1:index2]
                index1 = info_red.find(",", index1, index1 + 58) + 40
                index2 = index1 + 4
                AnoRedfin = info_red[index1:index2]
                if AnoRedfin == "    " or AnoRedfin == ",":
                    MesRedfin = ""
                    AnoRedfin = ""
                    FechaRedfin = ""
                else:
                    index1 = index1 + 5
                    index2 = index1 + 2
                    MesRedfin = info_red[index1:index2]
                    index1 = info_red.find("Creada el:") + 10
                    index1 = info_red.find(",", index1, index1 + 58) + 40
                    index2 = index1 + 10
                    FechaRedfin = info_red[index1:index2]
            init.rel_persona_producto_colciencias.append(str(RH) + ";"\
            + str(COD_PRODUCTO) + ";"\
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',"1".replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Nombrered.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + "0" + ","\
            + "" + ";"\
            + "" + ";"\
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarRed.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "" + ";"\
            + "\n")
            init.inrel_personas_producto_colciencias.append( \
            "REPLACE INTO `uapa_db`.`rel_personas_producto_colciencias`(`cod_rel_per_prod_col`,`cod_producto`,`cod_rh`,`cod_tipo_producto`,`nombre_producto`,`evento_asociado`,`datos_complementarios`,`lugar`,`ano`,`ambito`,`palabras_clave`,`areas`,`sectores`,`coautores`,`vincula_coautores`,`editorial`,`volumen`,`paginas`,`doi`,`finalidad`,`instituciones_asociadas`,`tipo_vinculacion_institucion`) VALUES"
            + "('"+ str(RH) + str(COD_PRODUCTO) + "',"
            + str(COD_PRODUCTO) + ","\
            + "'" + str(RH) + "',"\
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',"7".replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\
            + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Nombrered.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
            + "null" + ","\
            + "null" + ","\
            + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarRed.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ","\
            + "null" + ");\n")
            init.colciencias_apropiacion.append(str(RH) + str(COD_PRODUCTO) + ";"\
            + str(RH) + ";"\
            + str(COD_PRODUCTO) + ";"\
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
            + "\n")
            init.incolciencias_apropiacion.append( \
            "REPLACE INTO `uapa_db`.`colciencias_apropiacion`(`cod_colciencias_apropiacion`,`cod_rh`,`cod_rel_per_prod_col`,`fecha_ini`,`fecha_fin`,`cod_tipo_evento`) VALUES"
            + "('" + str(COD_PRODUCTO) + "',"\
            + "'" + str(RH) + "',"\
            + "'" + str(RH) + str(COD_PRODUCTO) + "',"\
            + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
            + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaRedfin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
            + "0" + ");\n")
            COD_PRODUCTO = COD_PRODUCTO + 1
    else:
        logging.info(' El Docente ' + name + ' ' + last +
                     ' no tiene Redes Asociadas')
    contredes = [COD_PRODUCTO]
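
The repeated re.sub(...)/replace(...) chains in redesextract (and in the evenextract example further down) all apply the same sanitization to a value before it is concatenated into a row or SQL statement. A hypothetical helper of the following shape, offered as a refactoring sketch rather than anything present in the original, would express that chain once:

import re

def clean(value):
    # Same steps as the inline chains: strip quotes, map ";" to "|", drop line
    # breaks, collapse runs of spaces, then keep only whitelisted characters.
    value = value.replace('"', "").replace("'", "").strip()
    value = value.replace(";", "|").replace("\r\n", "").replace("\n", "").replace("\r", "")
    value = re.sub(' +', ' ', value)
    return re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]', r'', value)

Each inline occurrence such as re.sub(r'[^...]', r'', re.sub(' +', ' ', Nombrered.replace(...))) would then shrink to clean(Nombrered).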
Example #9
def main():
    database = r"C:\Users\Kenny\Documents\webscrape\Phish.db"

    #create a database connection
    conn = create_connection(database)
    with conn:
        cur = conn.cursor()

        #These are the variables responsible for tracking ID in each respective table.
        songCount = 1
        venueCount = 1
        showCount = 1
        setCount = 0
        setlistEntryCount = 1

        #Set-up with the first URL
        url = "https://phish.net/setlists/phish-december-02-1983-harris-millis-cafeteria-university-of-vermont-burlington-vt-usa.html"
        while (url !=
               "https://phish.net/setlist/jump/next?showdate=2020-02-23"):
            uClient = uReq(url)
            page_html = uClient.read()
            uClient.close()

            #Hold the parsed-HTML in a BeautifulSoup data structure
            page_soup = soup(page_html, "html.parser")

            #gets the URL for the next show
            urlDivContainer = page_soup.findAll("div",
                                                {"class": "well clearfix"})
            urlDiv = urlDivContainer[1]
            aTag = urlDiv.find_all('a')[1]
            url = "https://phish.net" + aTag["href"]

            #Get the date from the site-header
            dateDiv = page_soup.find("div", {"class": "setlist-date-long"})
            aTag = dateDiv.find_all('a')[1]
            dateString = str(aTag)
            date = dateString[len(dateString) - 14:len(dateString) -
                              4]  #Formatted in "MM/DD/YYYY"
            #Reorganizes date-format to "YYYY-MM-DD"
            year = date[len(date) - 4:len(date)]
            month = date[0:2]
            day = date[3:5]
            date = year + "-" + month + "-" + day

            #Get the Venue
            venueDiv = page_soup.find("div", {"class": "setlist-venue"})
            aTag = venueDiv.find('a').contents[0]
            venue = str(aTag.contents[0]).title()

            #Get the Location
            locDiv = page_soup.find("div", {"class": "setlist-location"})
            aTag = locDiv.find_all("a")
            city = aTag[0].contents[0]
            state = aTag[1].contents[0]

            #Iterate through sets
            setlistBody = page_soup.find("div", {"class": "setlist-body"})

            p = setlistBody.find("p")

            #Works through and gets the set or song information
            setlistSongCount = 1
            setInfo = ""
            for tag in p.find_all(["a", "span"]):
                #If next tag is span tag, it will hold the setInfo instead of a song
                if (tag.name == "span"):
                    setInfo = tag.contents[0]
                    setlistSongCount = 1
                    setCount += 1

                else:  #Otherwise, the next tag will be an <a>, which holds a song.

                    song = tag.contents[0]

                    songID = 1
                    #Check whether or not to add song to Song table
                    val = valueCheck.checkSong(conn, song)
                    if (val == False):
                        songToInsert = (songCount, song)
                        SQLInsert.insert_song(conn, songToInsert)
                        songID = songCount
                        songCount += 1
                    else:
                        cur.execute(
                            "SELECT songID FROM Songs WHERE songName LIKE (?)",
                            (song, ))
                        songID = cur.fetchall()[0][0]

                    #Handles whether songs are separated by ",", ">", or "->"
                    sib = tag.next_sibling
                    sibString = str(sib)
                    while (sibString[0] == "<"):
                        sib = sib.next_sibling
                        sibString = str(sib)
                    segue = False
                    transition = False
                    if ("->" in sibString):
                        transition = True
                    elif (">" in sibString):
                        segue = True

                    setlistEntryToInsert = (setlistEntryCount, showCount,
                                            setCount, setInfo,
                                            setlistSongCount, songID, segue,
                                            transition)
                    SQLInsert.insert_setlist(conn, setlistEntryToInsert)
                    setlistSongCount += 1
                    setlistEntryCount += 1

            #Check the Venues table if this Venue already exists
            val = valueCheck.checkVenue(conn, venue)
            if (val == False):
                venueToInsert = (venueCount, venue, city, state)
                SQLInsert.insert_venue(conn, venueToInsert)
                venueID = venueCount
                venueCount += 1
            #Otherwise, get the venueID from the Venues table
            else:
                cur.execute(
                    "SELECT venueID FROM Venues WHERE venueName LIKE (?)",
                    (venue, ))
                venueID = cur.fetchall()[0][0]

            #Add the Show to the Shows table
            showToInsert = (showCount, date, venueID)
            SQLInsert.insert_show(conn, showToInsert)

            showCount += 1
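
main() depends on a create_connection helper (plus the valueCheck and SQLInsert modules) defined elsewhere in that project. A minimal sketch of the connection helper, assuming SQLite since the database path ends in .db:

import sqlite3

def create_connection(db_file):
    # Hypothetical helper: open (or create) the SQLite database at db_file.
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        print("Could not connect to", db_file, ":", e)
        return None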
Example #10
def index(request):
    this_user = User.objects.get(id=request.session['id'])
    site_links = []
    site_headlines = []
    user_list = this_user.news.all()
    first_item = this_user.news.first()
    if first_item != None:
        categories = [first_item.list_name]
    else:
        categories = []
    links_dict = {}

    #Generate a list of all of the user's news categories (no duplicates)
    for list_name in user_list:
        if list_name.list_name not in categories:
            categories.append(list_name.list_name)
    # print('////////////////////////////////////', categories)


#Generate an object where there is a key for each category and its value is a list of links

    for category in categories:
        content_list = []
        for entry in user_list:
            site_list = []
            site_links = []
            site_headlines = []
            if entry.list_name == category:
                #open connection to page, copy html as local variable, close connection
                uClient = uReq(f'{ entry.site }')
                page_html = uClient.read()
                uClient.close()
                site_list.append(entry.site)
                #BeautifulSoup Magic
                soup = bsoup(page_html, 'html.parser')
                for story in soup.find_all('a'):
                    # print('//////////// entry.site = ', entry.site)
                    # print(story.get('href'))
                    if story.get('href') == None:
                        continue
                    if entry.site in story.get('href') and re.search(
                            r'\b[0-9]{4}\b', story.get('href')) != None:
                        site_links.append(story.get('href'))
                        # print('{{{{{{{{{{{{', story.text)
                        values = ''  # make sure values is defined even when no heading is found below
                        for h1 in story.find_all('h1'):
                            values = h1.text
                        for h2 in story.find_all('h2'):
                            values = h2.text
                        for h3 in story.find_all('h3'):
                            values = h3.text
                        for h4 in story.find_all('h4'):
                            values = h4.text
                        if values != '':
                            site_headlines.append(values)

            #Write the links and headlines to the context.
            print('================', len(site_headlines), len(site_links))

            for content in range(len(site_headlines)):
                content_list.append(
                    [site_headlines[content], site_links[content]])
            links_dict.update({f'{category}': content_list})

            # context.update({ f'{i}_links': site_links })
            # context.update({ f'{i}_headlines': site_headlines})

    context = {'links_dict': links_dict, 'categories': categories}
    print('/////////////context = ', context)

    return render(request, 'tracker/dashboard.html', context)
Example #11
def evenextract():
    from settings import my_url, name, doc, last, RH, COD_PRODUCTO
    import init, bs4, logging, sys, re
    global conteventos
    LOG_FILENAME = './Logs/Registros.log'
    logging.basicConfig(filename=LOG_FILENAME,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")
    LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }
    if len(sys.argv) > 1:
        level_name = sys.argv[1]
        level = LEVELS.get(level_name, logging.NOTSET)
        logging.basicConfig(level=level)
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    all = 0
    a = 0
    x = 0
    y = 0
    conteventos = 0
    auto = ""
    vincula = ""
    insti = ""
    vinculain = ""
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table")
    for a in range(0, len(containers)):
        buscaeventos = containers[a].h3
        #print(buscaeventos)
        try:
            if buscaeventos.text == "Eventos científicos":
                all = a
                #print(all)
                break
        except AttributeError:
            pass
    if all != 0:
        containerb = containers[all]
        container = containerb.findAll("table")
        for x in range(0, len(container)):
            cont = container[x]
            info_evento = cont.td.text
            #Event name
            index1 = info_evento.find("Nombre del evento:") + 18
            index2 = info_evento.find("Tipo de evento:")
            NombreEvento = info_evento[index1:index2]
            # Event type
            index1 = info_evento.find("Tipo de evento:") + 15
            index2 = info_evento.find(" Ámbito:")
            TipoEvento = info_evento[index1:index2]
            if TipoEvento.strip() == "Otro":
                TipoEvento = "1"
            elif TipoEvento.strip() == "Taller":
                TipoEvento = "2"
            elif TipoEvento.strip() == "Congreso":
                TipoEvento = "3"
            elif TipoEvento.strip() == "Encuentro":
                TipoEvento = "4"
            elif TipoEvento.strip() == "Seminario":
                TipoEvento = "5"
            elif TipoEvento.strip() == "Simposio":
                TipoEvento = "6"
            else:
                logging.critical('Añadir a Tipo_Evento: ' + TipoEvento)
                print("ALERTA: Revisar el archivo Registros.log")
            #Scope (Ámbito)
            index1 = info_evento.find(
                "\xa0\r\n                                        Ámbito: "
            ) + 51
            index2 = info_evento.find(
                "\xa0                \r\n                                        Realizado el:"
            )
            Ambito = info_evento[index1:index2]
            #Start and end dates
            index1 = info_evento.find("Realizado el:") + 13
            index2 = index1 + 4
            AnoEventoini = info_evento[index1:index2]
            if AnoEventoini == "," or AnoEventoini == ",\xa0\r\n":
                MesEventoini = ""
                AnoEventoini = ""
                FechaEventoini = ""
                MesEventofin = ""
                AnoEventofin = ""
                FechaEventofin = ""
            else:
                index1 = index1 + 5
                index2 = index1 + 2
                MesEventoini = info_evento[index1:index2]
                index1 = info_evento.find("Realizado el:") + 13
                index2 = index1 + 10
                FechaEventoini = info_evento[index1:index2]
                index1 = info_evento.find(",", index1, len(info_evento)) + 48
                index2 = index1 + 4
                AnoEventofin = info_evento[index1:index2]
                if AnoEventofin == " \xa0\r\n" or AnoEventofin == ",":
                    MesEventofin = ""
                    AnoEventofin = ""
                    FechaEventofin = ""
                else:
                    index1 = index1 + 5
                    index2 = index1 + 2
                    MesEventofin = info_evento[index1:index2]
                    index1 = info_evento.find("Realizado el:") + 13
                    index1 = info_evento.find(",", index1,
                                              len(info_evento)) + 48
                    index2 = index1 + 10
                    FechaEventofin = info_evento[index1:index2]
            #Event location
            index1 = info_evento.find(
                " \xa0\r\n                                            en "
            ) + 51
            index2 = info_evento.find(" \xa0 -  \xa0\r\n")
            LugarEvento = info_evento[index1:index2]
            b_eventos = cont.findAll("td")
            #Authors
            autores = b_eventos[3].findAll("li")
            if len(autores) == 0:
                auto = ""
                vincula = ""
            else:
                for z in range(0, len(autores)):
                    autor = autores[z].text
                    index1 = autor.find("Nombre:") + 8
                    index2 = autor.find(
                        "\r\n                                                Rol en el evento: "
                    )
                    if len(auto) == 0:
                        auto = autor[index1:index2]
                    else:
                        auto = auto + ", " + autor[index1:index2]
                    index1 = autor.find("Rol en el evento: ") + 18
                    index2 = autor.find("\r\n ", index1, len(autor))
                    if len(vincula) == 0:
                        vincula = autor[index1:index2]
                    else:
                        vincula = vincula + ", " + autor[index1:index2]
            #Institutions
            Instituciones = b_eventos[2].findAll("li")
            if len(Instituciones) == 0:
                insti = ""
                vinculain = ""
            else:
                for z in range(0, len(Instituciones)):
                    institu = Instituciones[z].text
                    index1 = institu.find("Nombre de la institución:") + 25
                    index2 = institu.find(
                        "\r\n                                                Tipo de vinculación"
                    )
                    if len(insti) == 0:
                        insti = institu[index1:index2]
                    else:
                        insti = insti + ", " + institu[index1:index2]
                    index1 = institu.find("Tipo de vinculación") + 19
                    index2 = institu.find("'", index1, len(institu))
                    if len(vinculain) == 0:
                        vinculain = institu[index1:index2]
                    else:
                        vinculain = vinculain + ", " + institu[index1:index2]
            #Associated products
            productos = b_eventos[1].findAll("li")
            if len(productos) == 0:
                init.rel_persona_producto_colciencias.append(str(RH) + ";"\
                + str(COD_PRODUCTO) + ";"\
                + "0" + ";"\
                + "" + ";"\
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + "" + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + "" + ";" \
                + "" + ";" \
                + "" + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + "" + ";" \
                + "" + ";" \
                + "" + ";" \
                + "" + ";" \
                + "" + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + "\n")
                init.colciencias_apropiacion.append(str(RH) + str(COD_PRODUCTO) + ";"\
                + str(RH) + ";"\
                + str(COD_PRODUCTO) + ";"\
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                + "\n")
                init.inrel_personas_producto_colciencias.append( \
                "REPLACE INTO `uapa_db`.`rel_personas_producto_colciencias`(`cod_rel_per_prod_col`,`cod_producto`,`cod_rh`,`cod_tipo_producto`,`nombre_producto`,`evento_asociado`,`datos_complementarios`,`lugar`,`ano`,`ambito`,`palabras_clave`,`areas`,`sectores`,`coautores`,`vincula_coautores`,`editorial`,`volumen`,`paginas`,`doi`,`finalidad`,`instituciones_asociadas`,`tipo_vinculacion_institucion`) VALUES"
                + "('"+ str(RH) + str(COD_PRODUCTO) + "',"
                + str(COD_PRODUCTO) + ","\
                + "'" + str(RH) + "',"\
                + "0" + ","\
                + "null" + ","\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + "null" + ","\
                + "null" + ","\
                + "null" + ","\
                + "null" + ","\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + "null" + ","\
                + "null" + ","\
                + "null" + ","\
                + "null" + ","\
                + "null" + ","\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "');\n")
                init.incolciencias_apropiacion.append( \
                "REPLACE INTO `uapa_db`.`colciencias_apropiacion`(`cod_colciencias_apropiacion`,`cod_rh`,`cod_rel_per_prod_col`,`fecha_ini`,`fecha_fin`,`cod_tipo_evento`) VALUES"
                + "('" + str(COD_PRODUCTO) + "',"\
                + "'" + str(RH) + "',"\
                + "'" + str(RH) + str(COD_PRODUCTO) + "',"\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                + "null" + ");\n")
                COD_PRODUCTO = COD_PRODUCTO + 1
            else:
                for y in range(0, len(productos)):
                    prod = productos[y].text
                    index1 = prod.find("Nombre del producto:") + 20
                    index2 = prod.find("Tipo de producto:")
                    NombreProducto = prod[index1:index2]
                    index1 = prod.find("Tipo de producto:") + 17
                    index2 = prod.find("\r\n", index1, len(prod))
                    Tipopub = prod[index1:index2]
                    if Tipopub == "Producción bibliográfica - Trabajos en eventos (Capítulos de memoria) - Completo":
                        Tipopub = "2"
                    elif Tipopub == "Producción técnica - Presentación de trabajo - Comunicación":
                        Tipopub = "3"
                    elif Tipopub == "Demás trabajos - Demás trabajos - Póster":
                        Tipopub = "4"
                    elif Tipopub == "Producción técnica - Presentación de trabajo - Conferencia":
                        Tipopub = "5"
                    elif Tipopub == "Producción técnica - Presentación de trabajo - Ponencia":
                        Tipopub = "6"
                    elif Tipopub == "Producción bibliográfica - Trabajos en eventos (Capítulos de memoria) - Resumen":
                        Tipopub = "12"
                    elif Tipopub == "Producción técnica - Presentación de trabajo - Congreso":
                        Tipopub = "13"
                    elif Tipopub == "Producción técnica - Presentación de trabajo - Simposio":
                        Tipopub = "14"
                    elif Tipopub == "Producción técnica - Presentación de trabajo - Seminario":
                        Tipopub = "15"
                    elif Tipopub == "Producción técnica - Presentación de trabajo - Otro":
                        Tipopub = "16"
                    else:
                        logging.critical('Añadir a Tipo_Producto: ' +
                                         TipoEvento)
                        print("ALERTA: Revisar el archivo Eventos.log")
                    init.rel_persona_producto_colciencias.append(str(RH) + ";"\
                    + str(COD_PRODUCTO) + ";"\
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Tipopub.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreProducto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + "" + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + "" + ";" \
                    + "" + ";" \
                    + "" + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + "" + ";" \
                    + "" + ";" \
                    + "" + ";" \
                    + "" + ";" \
                    + "" + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + "\n")
                    init.inrel_personas_producto_colciencias.append( \
                    "REPLACE INTO `uapa_db`.`rel_personas_producto_colciencias`(`cod_rel_per_prod_col`,`cod_producto`,`cod_rh`,`cod_tipo_producto`,`nombre_producto`,`evento_asociado`,`datos_complementarios`,`lugar`,`ano`,`ambito`,`palabras_clave`,`areas`,`sectores`,`coautores`,`vincula_coautores`,`editorial`,`volumen`,`paginas`,`doi`,`finalidad`,`instituciones_asociadas`,`tipo_vinculacion_institucion`) VALUES"
                    + "('"+ str(RH) + str(COD_PRODUCTO) + "',"
                    + str(COD_PRODUCTO) + ","\
                    + "'" + str(RH) + "',"\
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Tipopub.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreProducto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',NombreEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + "null" + ","\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',LugarEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',Ambito.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + "null" + ","\
                    + "null" + ","\
                    + "null" + ","\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',auto.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vincula.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + "null" + ","\
                    + "null" + ","\
                    + "null" + ","\
                    + "null" + ","\
                    + "null" + ","\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',insti.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',vinculain.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "');\n")
                    init.colciencias_apropiacion.append(str(RH) + str(COD_PRODUCTO) + ";"\
                    + str(RH) + ";"\
                    + str(COD_PRODUCTO) + ";"\
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ","\
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',AnoEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',MesEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + ";" \
                    + "\n")
                    init.incolciencias_apropiacion.append( \
                    "REPLACE INTO `uapa_db`.`colciencias_apropiacion`(`cod_colciencias_apropiacion`,`cod_rh`,`cod_rel_per_prod_col`,`fecha_ini`,`fecha_fin`,`cod_tipo_evento`) VALUES"
                    + "('" + str(COD_PRODUCTO) + "',"\
                    + "'" + str(RH) + "',"\
                    + "'" + str(RH) + str(COD_PRODUCTO) + "',"\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventoini.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + "'" + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',FechaEventofin.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r",""))) + "',"\
                    + re.sub(r'[^A-Za-z0-9éèáàéñèíìúùó ò]',r'',re.sub(' +',' ',TipoEvento.replace('"',"").replace("'","").strip().replace(";" , "|").replace("\r\n","").replace("\n","").replace("\r","")))  + ");\n")
                    COD_PRODUCTO = COD_PRODUCTO + 1
            auto = ""
            vincula = ""
            insti = ""
            vinculain = ""
    else:
        logging.info(' El Docente ' + name + ' ' + last +
                     ' no tiene Eventos Asociados')
    conteventos = [COD_PRODUCTO]
Example #12
import csv  
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
Url = 'https://karki23.github.io/Weather-Data/Albury.html'
pageHtml = uReq(Url)
pageSoup = soup(pageHtml,"html.parser") 
table = pageSoup.find_all("table", { "class" : "tablepress tablepress-id-10 tablepress-responsive-phone" })
with open('Albury.csv', 'w',newline='') as csvfile:
    f = csv.writer(csvfile)
    f.writerow(['Date', 'Location', 'MinTemp','MaxTemp','Rainfall','Evaporation','Sunshine','WindGustDir','WindGustSpeed','WindDir9am','WindDir3pm','WindSpeed9am','WindSpeed3pm','Humidity9am','Humidity3pm','Pressure9am','Pressure3pm','Cloud9am','Cloud3pm','Temp9am','Temp3pm','RainToday','RISK_MM','RainTomorrow'])
    for x in table:
        table_body = x.find('tbody') 
        rows = table_body.find_all('tr') 
        for tr in rows:
            data=[]
            cols = tr.find_all('td') 
            for td in cols:
                data.append(td.text.strip()) 
            f.writerow(data)
            print(data)
Example #13
price_list = []
address1_list = []
address2_list = []
room_list = []
shower_list = []
car_list = []
size_list = []


for i in tqdm(range(1,7)):
    # sleep is used to make sure that I don't spam the server too much
    time.sleep(2)
    try:
        my_url = "https://www.domain.com.au/sale/?suburb=caulfield-vic-3162,elsternwick-vic-3185,gardenvale-vic-3185,glen-huntly-vic-3163,mckinnon-vic-3204,murrumbeena-vic-3163,ormond-vic-3204,carnegie-vic-3163,bentleigh-vic-3204,bentleigh-east-vic-3165&ptype=apartment&bedrooms=2-any&price=0-750000&excludeunderoffer=1&carspaces=1-any&ssubs=0&page={}".format(i)
        req = urllib.request.Request(my_url,headers={'User-Agent': "Magic Browser"})
        con = uReq(req)
        page_html = con.read()
        con.close()
        # html parsing
        page_soup = soup(page_html, 'html.parser')
        containers = page_soup.find_all(class_="css-qrqvvg")
        for container in containers:
            # Get price
            try:
                price_container = container.find_all('p', class_="css-mgq8yx")
                price = price_container[0].text.strip().encode('ascii', 'ignore').decode("utf-8")
                price_list.append(price)
                print(price)
            except IndexError:
                print('None')
                price_list.append('NG')
Example #14
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
from time import ctime as ct

# This file aims at scraping a YouTube channel in order to get data such as:
# the number of subscribers, the number of views and the starting date of this
# particular channel.


channelName="Mister Geopolitix"
channelUrl="https://www.youtube.com/channel/UCX9lsdsTKfTi1eqoyL-RS-Q/about"

#Opening connection, grabbing the page
uClient = uReq(channelUrl)
pageHtml = uClient.read()
uClient.close()

# Using beautifulsoup module we parse the source code of the webpage
pageSoup = soup(pageHtml, "html.parser")

# We are seeking the 'about-stat' span section:
stats = pageSoup.findAll("span", {"class": "about-stat"})

# Values Extraction
nbSubs = stats[0].find("b").text.replace('\xa0', ' ')
nbViews = stats[1].find("b").text.replace('\xa0', ' ')
startDate = stats[2].text.replace('\xa0', ' ')

# Save data in a file with the current date
record = open("log.txt", "a")
date = ct() #current time
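# --- Hedged sketch: the example is cut off here; a likely completion. ---
# Append one line per run with the scraped figures, then close the log file.
record.write(date + " | " + channelName + " | subscribers: " + nbSubs +
             " | views: " + nbViews + " | " + startDate + "\n")
record.close()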
        "KKTC-GİRNE": "85",
        "TÜRKİSTAN-KAZAKİSTAN": "86",
        "KKTC-GÜZELYURT": "87",
        "BİŞKEK-KIRGIZİSTAN": "88",
        "KOMRAT-MOLDOVA": "89",
        "KKTC-LEFKE": "90",
        "ÜSKÜP-MAKEDONYA": "91",
    }
    return switcher.get(cityName, "0")


f = open("university-data.txt", "a")

mainUrl = 'https://www.basarisiralamalari.com/universite-taban-puanlari-2020-ve-basari-siralamalari-osym/'
# Opens up the connection and gets the html page from it
uClient = uReq(mainUrl)
pageHtml = uClient.read()

# Closes the connection
uClient.close()

pageSoup = soup(pageHtml.decode('utf-8', 'ignore'), 'html.parser')

uniTable = pageSoup.find('table', {'id': 'basaritable'})
wholeTbody = uniTable.tbody
allRows = wholeTbody.findAll('tr')

allURLs = []

for row in allRows:
    if row.find('tr', attrs={'style': 'height: 46px;'}):
Example #16
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
#from textblob import TextBlob

#input item name from user and remove spaces 
print("Search for item..?")
item_name_full = input()
item_name = item_name_full.replace(' ', '')

#link to scrape data from
my_url = 'https://www.flipkart.com/search?q=' + item_name + '&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'

#open connection
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

#Use BeautifulSoup to parse the HTML page
page_soup = soup(page_html, "html.parser")

#use the browser's 'inspect element' to get the class of the part we want to scrape
no_page_soup = page_soup.findAll("div", {"class": "_2zg3yZ"})

#print the total page count, e.g. 'Page 1 of 8'
'''print(no_page_soup[0].span.text)'''

#Find only the total number of pages, e.g. '8', and then convert to int
num_pages_str = no_page_soup[0].span.text

#['Page',  '1', 'of', '8']
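# --- Hedged sketch: the example is cut off here; a likely completion. ---
# Split "Page 1 of 8" into ['Page', '1', 'of', '8'] and keep the last token.
num_pages = int(num_pages_str.split()[-1])
print("Total pages:", num_pages)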
Example #17
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = ["https://www.theravive.com/cities/ma/"]

#opening up connection, grabbing the page
for i in my_url:

    uClient = uReq(i)
    page1_html = uClient.read()
    uClient.close()

    #html parsing
    page1_soup = soup(page1_html, "html.parser")

    #grabs each therapist profile
    containers = page1_soup.findAll("div", {"class": "profile-info"})
    container = containers[0]

    filename = "ma.csv"
    f = open(filename, "w")

    headers = "Name, Profession, Contact_Info\n"
    f.write(headers)

    for container in containers:
        address_container = container.findAll("div",
                                              {"class": "profile-address"})
        address = address_container[0].text.strip()

        name_container = container.findAll("h4", {"class": "green-text"})
Example #18
a = 0
choice = 0  # initialize so the loop condition is defined on the first pass
while (choice != 4):
    print("1.DO YOU WANT TO SEE PRODUCTS PREVAILING IN FLIPKART?")
    print("2.DO YOU WANT TO SEE PRODUCTS PREVAILING IN SNAPDEAL?")
    print("3.DO YOU WANT TO SEE WHICH E-SITE IS EFFICIENT?")
    print("4.EXIT")
    print(
        "***********************************************************************"
    )
    choice = int(input())

    if (choice == 1):
        i = 0
        print("NOW YOU ARE IN FLIPKART")
        my_url = 'https://www.flipkart.com/search?q=kurti&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'
        uClient = uReq(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "_3liAhj _1R0K0g"})
        print("total number of dresses in this page:", len(containers))
        container = containers[0]
        for container in containers:
            name1 = container.findAll("a", {"class": "_2cLu-l"})
            name = name1[0].text.strip()
            print(name)
            price1 = container.findAll("div", {"class": "_1vC4OE"})
            price = price1[0].text.strip()
            print(price)
            rat1 = container.findAll("span", {"class": "_2_KrJI"})
            try:
Example #19
def index():
    if request.method == 'POST':
        try:
            searchString = request.form['content'].replace(" ", "")
            flipkart_url = "https://www.flipkart.com/search?q=" + searchString
            uClient = uReq(flipkart_url)
            flipkartPage = uClient.read()
            uClient.close()
            flipkart_html = bs(flipkartPage, "html.parser")
            bigboxes = flipkart_html.findAll("div", {"class": "bhgxx2 col-12-12"})
            del bigboxes[0:3]
            box = bigboxes[0]
            productLink = "https://www.flipkart.com" + box.div.div.div.a['href']
            print(productLink)
            prodRes = requests.get(productLink)
            prod_html = bs(prodRes.content, "html.parser")
            prod = prod_html.find('div', {'class': '_29OxBi'}).h1.span.get_text()
            data = prod_html.find('div', {'class': 'swINJg _3nrCtb'})
            parent = data.find_parent()
            url = parent.get('href')
            url = 'https://www.flipkart.com' + url
            req_data = requests.get(url)
            all_reviews = bs(req_data.content, 'html.parser')
            pages = all_reviews.find_all('div', {'class': '_2zg3yZ _3KSYCY'})  # extracts all the pages url info
            page = int(pages[0].span.get_text().split()[-1])
            if page > 3:
                page = 3
            reviews = []
            for i in range(0, page):  # loop once per page (note: all_reviews still holds only the first page's soup, so the same comment boxes are re-read each pass)
                commentboxes = all_reviews.find_all('div', {'class': "_1PBCrt"})
                for commentbox in commentboxes:
                    try:
                        name = commentbox.div.div.find_all('p', {'class': '_3LYOAd _3sxSiS'})[0].text
                    except:
                        name = 'No Name'

                    try:
                        rating = commentbox.div.div.div.div.text
                    except:
                        rating = 'No Rating'

                    try:
                        commentHead = commentbox.div.div.div.p.text
                    except:
                        commentHead = 'No Comment Heading'

                    try:
                        comtag = commentbox.div.div.find_all('div', {'class': ''})
                        custComment = comtag[0].div.text
                    except Exception as e:
                        custComment = 'No Customer Comment'
                        print("Exception while creating dictionary: ", e)

                    mydict = {"Product": prod, "Name": name, "Rating": rating, "CommentHead": commentHead,
                              "Comment": custComment}
                    reviews.append(mydict)
            return render_template('results.html', reviews=reviews[0:(len(reviews) - 1)])
        except Exception as e:
            print('The Exception message is: ', e)
            return 'something is wrong'
    else:
        return render_template('index.html')
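
# Hedged sketch of the imports the route above appears to rely on (assumed; the
# original excerpt does not show them):
from flask import request, render_template
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as bs
import requests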
Ejemplo n.º 20
0
def add_box_office(csv_in, csv_out):
    ratings = pd.read_csv(csv_in, encoding='ISO-8859-1')
    ratings['Box Office Gross USA'] = 0
    ratings['Box Office Gross USA'].fillna(value=0, inplace=True)
    infl_string = 'Inflation Adjusted Box Office Gross USA'
    ratings[infl_string] = 0
    ratings[infl_string].fillna(value=0, inplace=True)

    context = ssl._create_unverified_context()

    for i in range(len(urls)):
        url = urls[i]

        # Open connection, read html, close connection
        uClient = uReq(url, context=context)
        page_html = uClient.read()
        uClient.close()

        # html parser
        page_soup = soup(page_html, 'html.parser')

        # print(page_soup.body.span)

        title = page_soup.title.get_text()
        print(title)
        movie_year = '2020'
        if (not 'TV Series' in title and not 'TV Mini-Series' in title):
            open_paren = title.find('(')
            close_paren = title.find(')')
            movie_year = title[open_paren + 1:close_paren]
            print('Year: ', movie_year)

        box_office_gross = page_soup.find('div', {'id': 'main_bottom'})
        box_office_gross = box_office_gross.find('div', {'id': 'titleDetails'})
        box_office_gross = box_office_gross.findAll('div',
                                                    {'class': 'txt-block'})
        print('num divs: ', len(box_office_gross))
        j = 0
        has_gross_usa = False
        for j in range(4, len(box_office_gross)):
            if ('Gross USA' in box_office_gross[j].get_text()):
                has_gross_usa = True
                break
        if (has_gross_usa):
            box_office_gross = box_office_gross[j].get_text()
            dollar_index = box_office_gross.find('$')
            box_office_gross = box_office_gross[dollar_index + 1:]
            box_office_gross = box_office_gross.replace(',', '')
        else:
            box_office_gross = 0

        t = datetime.now().year - int(movie_year)
        # Inflation adjusted box office gross
        infl_adj_gross = float(box_office_gross) * (pow(
            (1 + 1.84545 / 100), t))

        box_off_loc = ratings.columns.get_loc('Box Office Gross USA')
        ratings.iat[i, box_off_loc] = float(box_office_gross)
        ratings.iat[i, box_off_loc + 1] = float(infl_adj_gross)

        # Format numbers to $xxx,xxx,xxx.xx
        infl_adj_gross = '${:,.2f}'.format(infl_adj_gross)
        box_office_gross = '${:,.2f}'.format(float(box_office_gross))

        print("Box office gross: ", box_office_gross)
        print("Box office gross (inflation adjusted): ", infl_adj_gross)
        print()
        print('----------------------------------')
        print()

    ratings.to_csv(csv_out)
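
# Hedged usage sketch (filenames are illustrative; the function also assumes a
# 'urls' list of IMDb title-page URLs has been built beforehand):
# add_box_office("imdb_ratings.csv", "imdb_ratings_with_gross.csv")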
Ejemplo n.º 21
0
set_url = root_url + set_name_url

print("url: " + set_url)

# spoof header
req = Request(set_url)
req.add_header("User-Agent", "Mozilla/5.0")


###################################
# download and parse page into soup
###################################

scrape_date = str(datetime.now())
web_client = uReq(req)
print("web_client: " + str(web_client.getcode()))

soup = soup(web_client.read(), "html.parser")
web_client.close()


#######################
# isolate pricing table
#######################

html_tables = soup.find_all("table")

# lots of tables. We only need the pricing table
html_table = html_tables[8]
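
# Hedged sketch (assumed continuation): read the cell text out of each pricing row.
for row in html_table.find_all("tr"):
    cells = [cell.text.strip() for cell in row.find_all("td")]
    if cells:
        print(cells)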
Ejemplo n.º 22
0
c = 0
while True:
    if c == 0:
        cf_id = input("Enter your codeforces id : ")
        c += 1
    else:
        cf_id = input("Enter your codeforces id again\n")
    link = 'https://www.codeforces.com/profile/' + cf_id
    check = requests.get(link)
    display = "INVALID Codeforces ID"
    if check.url == "https://codeforces.com/":
        print(display.center(40, '*'))
    else:
        break

uClient = uReq(link)
link = "https://codeforces.com/api/user.info?handles=" + cf_id
ml = requests.get(link)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
c = page_soup.find_all('li')
p = json.loads(ml.text)
dic = p['result']
ls = dic[0]

#user-id
print("User-ID:", cf_id)

#User Name
print("Name:", ls['firstName'], ls['lastName'])
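
# Hedged sketch (assumed; not in the original excerpt): the user.info result
# also exposes rating fields for rated handles.
print("Rank:", ls.get('rank', 'unrated'))
print("Max Rating:", ls.get('maxRating', 'n/a'))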
Ejemplo n.º 23
0
    review_date = item.select("small")[0].text
    review_title = item.select("strong  a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
    
    print("Name:", name)
    print("Location:", location)
    print("Review_date:", review_date)
    print("Review_Title:", review_title)
    print("Review_Data:", review_data)
    
    row = [name, location, review_date, review_title, review_data]
    csv_writer.writerow(row)

# --- get next url ---

uclient = uReq(url)
page_html = uclient.read()
uclient.close()

soup = BeautifulSoup(page_html, "html.parser")
container = soup.find("ul", {"class": "pages table"})

all_li = container.findAll("li")
if all_li:
    last_div = all_li[-1]
    content = last_div.getText()
    content = int(content)
    container = soup.findAll("li", {"class": "next"})
    li = container[0].find("a", {"class": "btn btn-link"}).attrs['href']
   
# ---- get data ---
Ejemplo n.º 24
0
            "div", {
                "id":
                "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl04_nameRow"
            })
        transName = transDiv.find("div", {"class": "value"})
    csvDict['Trans_Card'] = transName.text.strip()
    with open('cards.csv', 'a', encoding='utf8', newline='') as csvfile:
        scrapeTo = csv.DictWriter(csvfile, fieldnames=card_values)
        scrapeTo.writerow(csvDict)
    return


my_url = 'http://gatherer.wizards.com/Pages/Search/Default.aspx?text=+%5Btransform%5D'

# opening connection, grabbing page
uClient = uReq(my_url)
page_html = uClient.read()

# html parsing
page_soup = soup(page_html, "lxml")

page_nums = page_soup.find("div", {"class": "paging"})

# grabs cardInfo div

siteURL = "http://gatherer.wizards.com"

# scrapeTo = csv.writer(open('card.csv', 'wb'))
cardCount = 0
current_page = 0
Ejemplo n.º 25
0
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

# Url of page to scrape
my_url = "http://magic.wizards.com/en/events/coverage/gpman16/mono-blue-prison-with-martin-muller-2016-05-28"

# Grab page html form URL
web_client = uReq(my_url)

# convert raw html to a soup object
page_soup = BeautifulSoup(web_client.read(), "html.parser")

# Extract deck
deck_soup = page_soup.find_all("div", {"class": "deck-list-text"})

# Extract card count quantities from deck
card_counts = page_soup.find_all("a", {"class": "card-name"})
# Extract card information from deck
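# Hedged sketch (assumed; the original stops here): list the card names found.
for card in card_counts:
    print(card.text.strip())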

input("Press any key to end.")
Ejemplo n.º 26
0
def index():
    searchString = request.form['content'].replace(
        " ", "")  # obtaining the search string entered in the form
    try:
        # dbConn = pymongo.MongoClient("mongodb://localhost:27017/")  # opening a connection to Mongo
        #db = dbConn['crawlerDB'] # connecting to the database called crawlerDB
        # reviews = db[searchString].find({}) # searching the collection with the name same as the keyword
        # if reviews.count() > 0: # if there is a collection with searched keyword and it has records in it
        #    return render_template('results.html',reviews=reviews) # show the results to user
        #else:
        flipkart_url = "https://www.flipkart.com/search?q=" + searchString  # preparing the URL to search the product on flipkart
        uClient = uReq(
            flipkart_url)  # requesting the webpage from the internet
        flipkartPage = uClient.read()  # reading the webpage
        uClient.close()  # closing the connection to the web server
        flipkart_html = bs(flipkartPage,
                           "html.parser")  # parsing the webpage as HTML
        bigboxes = flipkart_html.findAll("div", {
            "class": "bhgxx2 col-12-12"
        })  # searching for the appropriate tag to redirect to the product link
        del bigboxes[
            0:
            3]  # the first 3 members of the list do not contain relevant information, hence deleting them.
        box = bigboxes[0]  #  taking the first iteration (for demo)
        productLink = "https://www.flipkart.com" + box.div.div.div.a[
            'href']  # extracting the actual product link
        prodRes = requests.get(
            productLink)  # getting the product page from server
        prod_html = bs(prodRes.text,
                       "html.parser")  # parsing the product page as HTML
        commentboxes = prod_html.find_all(
            'div',
            {'class': "_3nrCtb"
             })  # finding the HTML section containing the customer comments

        #table = db[searchString] # creating a collection with the same name as search string. Tables and Collections are analogous.
        #filename = searchString+".csv" #  filename to save the details
        #fw = open(filename, "w") # creating a local file to save the details
        #headers = "Product, Customer Name, Rating, Heading, Comment \n" # providing the heading of the columns
        #fw.write(headers) # writing first the headers to file
        reviews = []  # initializing an empty list for reviews
        #  iterating over the comment section to get the details of customer and their comments
        for commentbox in commentboxes:
            try:
                name = commentbox.div.div.find_all(
                    'p', {'class': '_3LYOAd _3sxSiS'})[0].text

            except:
                name = 'No Name'

            try:
                rating = commentbox.div.div.div.div.text

            except:
                rating = 'No Rating'

            try:
                commentHead = commentbox.div.div.div.p.text
            except:
                commentHead = 'No Comment Heading'
            try:
                comtag = commentbox.div.div.find_all('div', {'class': ''})
                custComment = comtag[0].div.text
            except:
                custComment = 'No Customer Comment'
            #fw.write(searchString+","+name.replace(",", ":")+","+rating + "," + commentHead.replace(",", ":") + "," + custComment.replace(",", ":") + "\n")
            mydict = {
                "Product": searchString,
                "Name": name,
                "Rating": rating,
                "CommentHead": commentHead,
                "Comment": custComment
            }  # saving that detail to a dictionary
            # x = table.insert_one(mydict) #insertig the dictionary containing the rview comments to the collection
            reviews.append(
                mydict)  #  appending the comments to the review list
        return render_template(
            'results.html', reviews=reviews)  # showing the review to the user
    except:
        return 'something is wrong'
Ejemplo n.º 27
0
def parse_deck(mtggoldfish_url):
    # grab and soup page
    page_client = uReq(mtggoldfish_url)
    print("Grabbing the html through the blind eternities...")
    page_soup = BeautifulSoup(
        page_client.read(), "html.parser"
    )
    page_client.close()

    # Grabs the paper decklist with the prices
    print("Extracting table...")
    deck_table = page_soup.find_all(
        "div", {"id": "tab-paper"}
    )

    # discards things around the table
    deck_table = deck_table[0].div.div.table

    # gets all the rows from the html table
    table_rows = deck_table.find_all("tr")

    decklist = {}

    # Loops through the rows
    print("Parsing rows...")
    for row in table_rows:
        columns = row.find_all("td")
        if(len(columns) == 4):
            # extracts features from each column
            quantity = columns[0].text.strip()
            card_name = columns[1].text.strip()
            color = ""
            try:
                color = columns[2].span.img['alt'].strip()
            except AttributeError:
                pass
            price = round(
                float(
                    columns[3].text.strip()
                ),
                2  # two decimal places
            )
            card_key = card_name.lower()

            decklist[card_key] = {
                "card_name": card_name,
                "color": color,
                "price_amt": price,
                "price_date_utc": str(datetime.datetime.utcnow()),
                "deck_qty": quantity,
            }
            print("Parsed..." +
                  card_name + ", " +
                  color + ", " +
                  quantity + ", " +
                  str(price)
                  )
    print("Printing card names.")
    card_names = list(decklist.keys())
    card_names.sort()
    print(card_names)
    print("Printing dictionary.")
    print(decklist)
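
# Hedged usage sketch (the URL below is illustrative, not from the original):
# parse_deck("https://www.mtggoldfish.com/deck/1234567")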
Ejemplo n.º 28
0
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from email.mime.text import MIMEText
import smtplib
import sys

MY_URL = 'http://www.secretflying.com/posts/category/error-fare/'
USER_CLIENT = uReq(MY_URL)
PAGE_HTML = USER_CLIENT.read()
USER_CLIENT.close()
PAGE_SOUP = soup(PAGE_HTML, "html.parser")
ERROR_FARES_CONTENT = PAGE_SOUP.findAll(
    "div", {"class": "article-content-wrapper entry-main-content"})
ORIGINAL_TEXT_FILE = "errorfarelist.txt"
DIFFERENCES_TEXT_FILE = "errorfaresdiffs.txt"


# This reads the previously saved deals from a text file into a list before the next function potentially finds updates.
def store_existing_deals_in_mem():
    with open(ORIGINAL_TEXT_FILE, "r") as F:
        current_deals_file = []
        for word in F:
            current_deals_file.append(word.strip('\n'))
    return current_deals_file


# The website page 'titles' are parsed, then the content is written to a text file (overwriting all existing data in the file).
def parse_and_write_to_file():
    with open(ORIGINAL_TEXT_FILE, "w+") as F:
        for deals in ERROR_FARES_CONTENT:
            try:
Ejemplo n.º 29
0
def main():
    # Uses tweepy to access twitter to allow the ability to tweet
    consumer_key = ""  # consumer key (left blank here)
    consumer_secret = ""  # consumer secret (left blank here)

    access_token = ""  # enter access token
    access_token_secret = ""  # enter access password

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    api = tweepy.API(auth)

    # Stores URL
    my_url = 'https://www.snopes.com/category/facts/'

    # Opening connection, grab the page
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    # parse
    page_soup = soup(page_html, "html.parser")

    # get specific part of html
    containers = page_soup.findAll("a", {"class": "article-link"})

    title = [15]
    url = [15]
    validity = [15]

    # Grabs title from Snopes web page
    for container in containers[:10]:
        try:
            # Grabs the title of article
            x = 0
            title.insert(x, str(container.h2.text))
        except(NameError, IndexError, AttributeError):
            print("Error")
        # print(title[x])
        x += 1

    # Grabs validity from Snopes web page
    for container in containers[:10]:
        try:
            validity_container = container.findAll("span", {"itemprop": "reviewRating"})
            x = 0
            for valid in validity_container:
                validity.insert(x, valid.span.text)
        except(NameError, IndexError, AttributeError):
            print("Error")
        x += 1

    # Creates link list that stores all url on web page
    links = []
    # URL's we are actually interested in start here
    ind = 94

    # # Grabs url from Snopes web page
    for container in containers[:10]:
        try:
            # Grabs all url tags on web page
            y = 0
            for link in page_soup.findAll('a', attrs={'href': re.compile("^https://www.snopes.com")}):
                links.append(link.get('href'))
            # Iterates to corresponding url
            url.insert(y, links[ind])
            ind += 1
        except(NameError, IndexError, AttributeError):
            print("Error")
    # Test print to make sure three
    # for x in range(10):
    #    print(title[x] + " " + validity[x] + " " + url[x])
    # Creates tweet with articles found false by Snopes
    for x in range(10):
        if validity[x] in ('FALSE', 'MOSTLY FALSE'):
            api.update_status("FAKE NEWS ALERT!! \n" +
                              title[x] + " " + url[x]
                              + " #FAKENEWS #RESISTTRUMP #RESIST"
                                " #STOPHATE #REALNEWS #TRUMP "
                                "#POLITICS #RESIST #NEWS"
                                " #RUSSIA #ANTIRUSSIA"
                                " #INVESTIGATE"
                              )
        else:
            continue
    print("--- %s seconds ---" % (time.time() - start_time))
Ejemplo n.º 30
0
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from datetime import date  # needed for date.today() below
current_date = date.today()
filename = "results {}_{}_{}.csv".format(current_date.day, current_date.month,
                                         current_date.year)
f = open(filename, "w")
headers = "suburb,address,bed,result,price,link\n"
f.write(headers)
all_letters = [
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
    "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
]
for letter in all_letters:
    RE_url = 'https://www.realestateview.com.au/sales-and-auction-results/victoria/' + letter + '/'
    # opens a connection to URL
    uClient = uReq(RE_url)
    # puts all the HTML into a container
    page_html = uClient.read()
    # closes connection
    uClient.close()
    # stores html as soup
    page_soup = soup(page_html, "html.parser")

    #finds relevant info on page
    containers = page_soup.findAll("tr", {"class": "auction-result-item"})

    #things you want: Suburb, address, bedrooms, result, price
    for container in containers:
        #suburb
        suburb_container = container.findAll("meta",
                                             {"itemprop": "addressLocality"})
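        # Hedged completion (assumed; the excerpt stops here). The locality is
        # usually carried in the meta tag's 'content' attribute:
        suburb = suburb_container[0].get("content", "") if suburb_container else ""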
Ejemplo n.º 31
0
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

# URL to be scraped. Has to be an edhrec page.
target_url = "https://edhrec.com/sets/akh/"

# Open connection, download page
web_client = uReq(target_url)

# Parse page into a soup data structure
print("Grabbing website: " + target_url)
page_soup = BeautifulSoup(web_client.read(), "html.parser")

# Close the web client
web_client.close()

card_frames = page_soup.find_all("div", {"class": "nw"})

out_filename = "scrape_edhrec_output.tsv"
f = open(out_filename, "w")
headers = "card_name\tin_decks\tprice\n"
f.write(headers)

for card_frame in card_frames:
    name_frame = card_frame.find_all("div", {"class": "nwname"})
    card_name = name_frame[0].text

    quantity_frame = card_frame.find_all("div", {"class": "nwdesc ellipsis"})
    quantity = quantity_frame[0].text
    quantity = quantity.replace(" decks", "")
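    # Hedged completion (assumed; the class name and row write below are not
    # from the original excerpt):
    price_frame = card_frame.find_all("div", {"class": "nwprice"})
    price = price_frame[0].text.strip() if price_frame else ""
    f.write(card_name + "\t" + quantity + "\t" + price + "\n")

f.close()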
Ejemplo n.º 32
0
"""Script created by following tutorial here: https://www.youtube.com/watch?v=XQgXKtPSzUI"""

from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup as soup

myurl = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards'

filename = "products.csv"
f = open(filename, "w")
headers = "brand, product_name, shipping\n"
f.write(headers)

# Open connection and download page.
uClient = uReq(myurl)  # Download URL.
page_html = uClient.read()  # Downloaded HTML.
uClient.close()  # Close the client after downloading.

# Parse HTML
pagesoup = soup(page_html, "html.parser")  # Parse the file as HTML.

# Grab each product
containers = pagesoup.findAll("div", {"class": "item-container"})
container = containers[0]
#print(container.div.div.a.img["title"])  # Item title example.

for container in containers:
    brand = container.div.div.a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
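    # Hedged completion following the tutorial linked at the top of this script
    # (the "price-ship" class and the write format are assumptions here):
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()

    f.write(brand + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()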