Example #1
def actualizar_diputados(contador = 0):
        global url
        global fichero
        url2 = url
        if contador:
            url2 += '&paginaActual=' + str(contador)
            tipo = 'a'
        else:
            tipo = 'w'
        
        web = pattern.web.URL(url2)
        pagina = web.download(user_agent='Mozilla/5.0')    
        soup = BeautifulSoup(pagina, 'html.parser')
        diputados = soup.findAll('span', class_ = 'dorado')
        
        with open('files/' + fichero, tipo) as archivo:
            for diputado in diputados:
                nombre = diputado.previous_element.previous_element.encode('utf-8')
                enlace = 'http://www.congreso.es' + \
                    diputado.previous_element.previous_element.parent['href'].encode('utf-8')
                archivo.write(nombre + ': ' + enlace + '\n')
        
        cuenta_total = soup.find('span', class_ = 'sinEnlace')
        if cuenta_total:
            if cuenta_total.string != u'Página Siguiente >>':
                actualizar_diputados(contador + 1)
            else:
                tkMessageBox.showinfo('Información', 'Todos los diputados han sido añadidos al fichero')
        else:
            actualizar_diputados(contador + 1)
Example #2
def get_image():
    img = soup.img.parent.get('href')

    w = screen_width
    # h = screen_height

    img = website+img
    imgdw = download(img, cached=False, unicode=False)
    imgname = timestamp+"_apod.jpg"
    imgfile = open(imgname, "wb")
    imgfile.write(imgdw)
    imgfile.close()

    wall = Image.open(imgname)
    draw = ImageDraw.Draw(wall)
    FOREGROUND = (255, 255, 255)
    # font = ImageFont.truetype("sans-serif.ttf", 16)
    font_path = "/usr/share/fonts/truetype/freefont/FreeSans.ttf"
    font = ImageFont.truetype(font_path, 20, encoding='unic')
    y_text = 0
    for line in text:
        width, height = font.getsize(line)
        y_text += height

    y_text = screen_height - y_text
    for line in text:
        width, height = font.getsize(line)  # recompute per line so each line is centered
        draw.text(((w - width) / 2, y_text), line, font=font, fill=FOREGROUND)
        y_text += height

        # draw.text((0, 0),text,(255,255,255))
    wall.save(imgname)

    imgpath = directory + "/" + imgname
    uri = "'file://%s'" % imgpath
    return uri
Example #3
    def get_all_pages(self, url):
        while True:
            try:
                html = download(url)
            except:
                time.sleep(.5)
                print "Error getting all the pages"
                continue
            break

        # get the page numbers element
        pageNumbers = html.split("class=\"pageNumbers\">")[1]
        pageNumbers = pageNumbers.split("</div>")[0]

        # get a list of all the pages
        pages = pageNumbers.split("<")[2:]
        
        min_page = self.get_num(self.clean_href(pages[1]))
        max_page = self.get_num(self.clean_href(pages[len(pages)-2]))
        url_parts = self.get_base_url(self.clean_href(pages[1]))

        urls = []

        for number in range(min_page, max_page + 30, 30):
            urls.append(url_parts[0] + str(number) + url_parts[1])

        return urls
Example #4
 def visit(self, link, source=None):
     print 'visited:', repr(link.url), 'from:', link.referrer
     html = download(link.url)
     blob = TextBlob(plaintext(html))
     for sentence in blob.sentences:
         print(sentence.sentiment)  # or: sentence.sentiment.polarity
Example #5
def get_html(datafile, folder):
    "stores the html for all the urls in datafile in folder."
    from pattern.web import download
    urls = get_urls_from_datafile(datafile)
    for url in urls:
        filename = url[1] + ":" + url[2] + ".html"
        with open(folder+filename, 'w') as outfile:
            outfile.write(download(url[0], cached=False))
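
The helper get_urls_from_datafile is not shown in this snippet; from the indexing above it apparently yields records whose first field is the URL and whose next two fields become the output filename. A minimal hypothetical sketch, assuming a tab-separated datafile:

def get_urls_from_datafile(datafile):
    "Hypothetical helper: reads 'url<TAB>label<TAB>id' records, one per line."
    urls = []
    with open(datafile) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                urls.append(fields)
    return urls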
Example #6
def scrap_preceeding(base_url):
    homepage_html_content = web.download(base_url)
    homepage_soup = bsoup(homepage_html_content)
    ul_content = homepage_soup.find_all('ul')
    a_content = bsoup(str(ul_content)).find_all('a')
    volume_page_links = []
    for raw_link in a_content:
        volume_page_links.append(join(base_url, raw_link.get('href'))+'/')


    os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')

    for base_link in volume_page_links[32:]:
        folder_name = base_link.split('/')[-2]
        address = os.path.join(os.getcwd(), folder_name)
        if not os.path.exists(address):
            os.mkdir(folder_name)
        else:
            index = 1
            while os.path.exists(address):
                folder_name = base_link.split('/')[-2] + '-' + str(index)
                print folder_name
                address = os.path.join(os.getcwd(), folder_name)
                index += 1
            os.mkdir(folder_name)

        os.chdir(address)


        print '--------------'
        print 'downloading from ' + base_link
        volume_content_soup = bsoup(web.download(base_link)).find_all('div', {'id': 'content'})
        a_content = bsoup(str(volume_content_soup)).find_all('a')
        # print a_content
        pdf_links = [join(base_link, link.get('href')) for link in a_content if str(link.get('href')).endswith('pdf')]
        for download_link in pdf_links:
            if not download_link.endswith('supp.pdf'):
                try:
                    content = web.download(download_link)
                except:
                    print 'link : %s is obsolete' % download_link
                    continue
                f = open(download_link.split('/')[-1], 'wb')
                f.write(content)
                f.close()
        os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')
Example #7
def run(o):

	#http://www.clips.ua.ac.be/pages/pattern-web#mail
	# should be able to do some cool stuff with the pattern libs	

	print "PATTERM TEST ---------------- STARTED"

	from pattern.web import download
	html = download('http://www.clips.ua.ac.be/', cached=False)

	print html
	print "PATTERM TEST ---------------- COMPLETE"
Example #8
def extract_keywords(url, model, all_keywords):
	minimum_word_index_for_unigrams = 20000 # minimum rarity of word to be considered a keyword
	try:
		text = plaintext(download(url))
		words = [word for word in tokenize.word_tokenize(text) if word.isalnum() and word.lower() not in corpus.stopwords.words('english') and word.lower() not in url]
		for collocation, frequency in get_collocations(words):
			word_index = get_index(collocation, model)
			if word_index:
				if collocation.count('_') == 0 and word_index < minimum_word_index_for_unigrams:
					pass
				else:
					all_keywords[collocation] = all_keywords.get(collocation, 0) + evaluate_keyword(frequency, word_index)
	except (URLError, URLTimeout, HTTPError, HTTP403Forbidden, SSLError, UnicodeEncodeError, ValueError) as e:
		pass
Example #9
def extract_keywords(url, model, all_keywords):
	minimum_word_index_for_unigrams = 20000 # minimum rarity of word to be considered a keyword
	try:
		text = plaintext(download(url))
		words = [word for word in tokenize.word_tokenize(text) if word.isalnum() and word.lower() not in corpus.stopwords.words('english') and word.lower() not in url]
		for collocation, frequency in get_collocations(words):
			word_index = get_index(collocation, model)
			if word_index:
				if collocation.count('_') == 0 and word_index < minimum_word_index_for_unigrams:
					pass
				else:
					all_keywords[collocation] = all_keywords.get(collocation, 0) + evaluate_keyword(frequency, word_index)
	except (URLError, URLTimeout, HTTPError, HTTP403Forbidden, SSLError, UnicodeEncodeError, ValueError) as e:
		pass
Example #10
def get_url(soup):
    urls = []
    contador = 0
    for a in soup.findAll('a'):
                html = download(a['href'], unicode=True)
                soupEspaceNet = BeautifulSoup(html)
                urlDescription = soupEspaceNet.findAll('a', {"class": "publicationLinkClass"})
                for url in urlDescription:
                    urlEspace = "http://worldwide.espacenet.com/" + str(url['href'])
                    urlEspace = urlEspace.replace("biblio","description")
                    urls.append(urlEspace)
                    contador = contador + 1
                    if (contador > 9 ) :
                        return urls
    return urls
Example #11
def search_regex(title, url, regex):

    print title
    
    try:
        html = download(url, unicode=True)
    except URLError as e:
        print "Something happened... Error: " + str(e) + "\n"
        exit(0)

    results = re.findall(regex, html)

    if len(results) == 0:
        print "There are no news...\n"
    else:
        print "Go to " + url + " to see some news.\n"
Example #12
def get_url(soup):
    urls = []
    contador = 0
    for a in soup.findAll('a'):
        html = download(a['href'], unicode=True)
        soupEspaceNet = BeautifulSoup(html)
        urlDescription = soupEspaceNet.findAll(
            'a', {"class": "publicationLinkClass"})
        for url in urlDescription:
            urlEspace = "http://worldwide.espacenet.com/" + str(url['href'])
            urlEspace = urlEspace.replace("biblio", "description")
            urls.append(urlEspace)
            contador = contador + 1
            if (contador > 9):
                return urls
    return urls
Example #13
    def get_attractions_from_page(self, url, city, client):

        while True:
            try:
                html = download(url)
            except:
                print "Error Getting Attractions from page"
                time.sleep(.5)
                continue
            break

        properties = html.split("property_title\">")

        for index in range(1, len(properties)):
            attraction = self.get_attraction(properties[index])
            attraction['categories'] = self.get_attraction_details(attraction["url"], attraction["name"])

            if(len(attraction['categories']) > 0):
                client.insert({"city":city, "name":attraction["name"], "_id":attraction["url"], "review_count":attraction["review_count"], "categories":attraction["categories"]})
            else:
                print "Error on: " + attraction["name"]
Example #14
    def get_attraction_details(self, attraction_url, attraction_name):
        while True:
            try:
                html = download(attraction_url)
            except:
                print "Error getting attraction details"
                time.sleep(.5)
                continue
            break

        # get the descriptions of the location in the ugliest line of code I've ever written
        try:
            detailsplit = html.split("div class=\"detail\">")
            split_loc = 1 if "Neighborhood:" not in detailsplit[1] or len(detailsplit)==2 else 2
            details = detailsplit[split_loc].split("</div>")[0]
        except IndexError:
            return []

        # split up the details into categories
        list_details = []

        jank_details = details.split(", ")

        # I'm not really sure what I'm doing here but it works
        for jank in jank_details:
            split = jank.split("<")

            if(len(split) > 1):
                title = split[1]

                title_split = title.split(">")

                if(len(title_split) > 1):
                    clean_title = title_split[1]

                    list_details.append(clean_title)

        return list_details
Example #15
    def run(self):
        while True:
            try:
                logging.info("%s : Waiting for surname"  % self.getName())
                surname = self.queue.get()

                row_data = []

                for j, town in enumerate(towns):
                    logging.info("%s : %s \t #%s \t %s"  % (self.getName(), surname, j, town))
                    
                    url = "http://yellowbook.intelius.com/results.php?ReportType=34&refer=2464&adword=RP&qar=off&qc=%s&qdma=off&qi=0&qk=6&qn=%s&qs=NJ" % (town, surname)

                    try:
                        html = unicode(download(url), "latin-1").encode('ascii', 'replace')
                    except KeyboardInterrupt:
                        sys.exit()
                    except:
                        logging.info("Failed.")
                        continue

                    dom = DOM(html)

                    for result in dom('div.cobrand_wp_multiresult_record'):
                        name = plaintext(result('div.cobrand_wp_multiresult_record_name:first-child')[0].content)
                        addr = plaintext(result('div.cobrand_wp_multiresult_record_address')[0].content).replace("\n"," ")
                        phone = plaintext(result('div.cobrand_wp_multiresult_record_phone')[0].content)

                        row_data.append({"name" : name, "addr" : addr, "phone" : phone})

                df = pd.DataFrame(row_data)
                df.to_csv("%s%s.csv" % (OUTPUT_FOLDER,surname), index = False)
                
                logging.info("%s : Processing surname %s complete"  % (self.getName(), surname))
            except:
                pass
Example #16
from pattern.en import quantify

print(quantify({'strawberry': 200, 'peach': 15}))
print(quantify('orange', amount=1200))

# ## Pattern Library Functions for Data Mining

# For macOS SSL issue when downloading file(s) from external sources
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# ### Accessing Web Pages

from pattern.web import download

page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence',
                     unicode=True)
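
# A hedged follow-up, not part of the original example: pattern.web also
# provides plaintext() to strip markup from the HTML downloaded above.
from pattern.web import plaintext
print(plaintext(page_html)[:300])  # first 300 characters of readable text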

from pattern.web import URL, extension

page_url = URL(
    'https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
file = open('football' + extension(page_url.page), 'wb')
file.write(page_url.download())
file.close()

# ### Finding URLs within Text

from pattern.web import find_urls

print(find_urls('To search anything, go to www.google.com', unique=True))
Example #17
 def visit(self, link, source=None):
     print 'visited:', repr(link.url), 'from:', link.referrer
     html = download(link.url)
     blob = TextBlob(plaintext(html))
     print blob.noun_phrases
Example #18
# Extracting noun phrases from a local file
path = '/Users/sharattadimalla/github/vcu/assignment/'
file_name = 'President_Obama_Speech_2004.txt'

with open(path+file_name, 'rU') as file:
    content = file.readlines()

blob = TextBlob(''.join(content))  # join the lines back into one string

print "Static Text Noun Phrases"
print blob.noun_phrases

# Extracting noun phrases from the web
weblink = 'http://www.americanrhetoric.com/speeches/convention2004/barackobama2004dnc.htm'
html = download(weblink)

blob1 = TextBlob(plaintext(html))

print "Web Page Content Noun Phrases"
print blob1.noun_phrases

# Extracting noun phrases from pdf document
url = URL('http://www.bellevue.edu/student-support/career-services/pdfs/resume-samples.pdf')
pdfcontent = PDF(url.download())

blob2 = TextBlob(pdfcontent)

print blob2.noun_phrases

Example #19
 def visit(self, link, source=None):
     print 'visited:', repr(link.url), 'from:', link.referrer
     print plaintext(download(link.url))
Example #20
 def get(self, route):
     parser = reqparse.RequestParser()
     parser.add_argument('direction',
                         type=str,
                         required=False,
                         help='Direction of the route')
     parser.add_argument('stop',
                         type=int,
                         required=False,
                         help='Stop number')
     parser.add_argument('latitude',
                         type=float,
                         required=False,
                         help='Latitude to sort results by')
     parser.add_argument('longitude',
                         type=float,
                         required=False,
                         help='Longitude to sort results by')
     args = parser.parse_args()
     url = "http://www.ltconline.ca/WebWatch/UpdateWebMap.aspx?u={}"
     try:
         html = download(url.format(route), timeout=60, cached=False)
     except (HTTP404NotFound, URLTimeout) as ex:
         msg = "LTC WebWatch service looks down ({})"
         return {
             "message": msg.format(repr(ex)),
             "status": 408,
         }, 408
     timestamp, main_stops, info_text, minor_stops = html.split("*")
     stops_lines = (main_stops + minor_stops).split(";")
     stops = []
     for stop_line in stops_lines:
         stop_line_split = stop_line.split("|")
         if len(stop_line_split) == 7:
             (latitude, longitude, name, direction, stop_number,
              times_text) = stop_line_split[:6]
             stop_number = int(stop_number.replace("Stop Number ", ""))
             times = []
             for time_text in times_text.split("<br>"):
                 time_text_splits = time_text.split(" TO ")
                 if len(time_text_splits) == 2:
                     time, destination = time_text_splits
                     destination = destination.strip()
                     route_time = unicode(route)
                     if destination.startswith(route_time):
                         destination_split = destination.split(" ", 1)
                         route_time, destination = destination_split
                     times.append({
                         "time": time,
                         "destination": destination.title(),
                         "route": route_time,
                     })
             direction = direction.lower()
             if ((not args["stop"] or args["stop"] == stop_number)
                     and (not args["direction"] or
                          (direction.startswith(args["direction"].lower())
                           or args["direction"].lower() == direction))):
                 stop = {
                     "latitude": float(latitude),
                     "longitude": float(longitude),
                     "name": name.title(),
                     "direction": direction.title(),
                     "number": stop_number,
                     "times": times,
                     "route": route,
                 }
                 if args["latitude"] and args["longitude"]:
                     stop_location = stop["latitude"], stop["longitude"]
                     request_location = args["latitude"], args["longitude"]
                     distance_obj = distance(stop_location,
                                             request_location)
                     stop.update({
                         "distance": {
                             "meters": distance_obj.m,
                             "miles": distance_obj.miles,
                         }
                     })
                 stops.append(stop)
     if stops and args["latitude"] and args["longitude"]:
         stops.sort(key=lambda x: x["distance"]["meters"])
     return stops
Example #21
 def visit(self, link, source=None):
     print 'visited:', repr(link.url), 'from:', link.referrer
     html = download(link.url)
     blob = TextBlob(plaintext(html))
     print blob.noun_phrases
Example #22
 def visit(self, link, source=None):
     print 'visited:', repr(link.url), 'from:', link.referrer
     print plaintext(download(link.url))
Example #23
File: api.py Project: linjoey/ltc
 def get(self, route):
     parser = reqparse.RequestParser()
     parser.add_argument('direction', type=str, required=False,
                         help='Direction of the route')
     parser.add_argument('stop', type=int, required=False,
                         help='Stop number')
     parser.add_argument('latitude', type=float, required=False,
                         help='Latitude to sort results by')
     parser.add_argument('longitude', type=float, required=False,
                         help='Longitude to sort results by')
     args = parser.parse_args()
     url = "http://www.ltconline.ca/WebWatch/UpdateWebMap.aspx?u={}"
     try:
         html = download(url.format(route), timeout=60, cached=False)
     except (HTTP404NotFound, URLTimeout) as ex:
         msg = "LTC WebWatch service looks down ({})"
         return {
             "message": msg.format(repr(ex)),
             "status": 408,
         }, 408
     timestamp, main_stops, info_text, minor_stops = html.split("*")
     stops_lines = (main_stops + minor_stops).split(";")
     stops = []
     for stop_line in stops_lines:
         stop_line_split = stop_line.split("|")
         if len(stop_line_split) == 7:
             (latitude, longitude, name, direction,
              stop_number, times_text) = stop_line_split[:6]
             stop_number = int(stop_number.replace("Stop Number ", ""))
             times = []
             for time_text in times_text.split("<br>"):
                 time_text_splits = time_text.split(" TO ")
                 if len(time_text_splits) == 2:
                     time, destination = time_text_splits
                     destination = destination.strip()
                     route_time = unicode(route)
                     if destination.startswith(route_time):
                         destination_split = destination.split(" ", 1)
                         route_time, destination = destination_split
                     times.append({
                         "time": time,
                         "destination": destination.title(),
                         "route": route_time,
                     })
             direction = direction.lower()
             if ((not args["stop"] or args["stop"] == stop_number) and
                 (not args["direction"]
                  or (direction.startswith(args["direction"].lower())
                      or args["direction"].lower() == direction))):
                 stop = {
                     "latitude": float(latitude),
                     "longitude": float(longitude),
                     "name": name.title(),
                     "direction": direction.title(),
                     "number": stop_number,
                     "times": times,
                     "route": route,
                 }
                 if args["latitude"] and args["longitude"]:
                     stop_location = stop["latitude"], stop["longitude"]
                     request_location = args["latitude"], args["longitude"]
                     distance_obj = distance(stop_location,
                                             request_location)
                     stop.update({
                         "distance": {
                             "meters": distance_obj.m,
                             "miles": distance_obj.miles,
                         }
                     })
                 stops.append(stop)
     if stops and args["latitude"] and args["longitude"]:
         stops.sort(key=lambda x: x["distance"]["meters"])
     return stops
Example #24
 def visit(self, link, source=None):
     print 'visited:', repr(link.url), 'from:', link.referrer
     html = download(link.url)
     blob = TextBlob(plaintext(html))
     for sentence in blob.sentences:
         print(sentence.sentiment.polarity)
Example #25
            screenh = screenh.split("+")
            screenh = int(screenh[0])

            i += 1
        else:
            break
    return (screenw, screenh)

screen_width, screen_height = get_screensize()

# Set timestamp format
timestamp = time.strftime('%Y%m%d')

# set url for download and download current website, no cache
website = "http://apod.nasa.gov/apod/"
html = download(website+"astropix.html", cached=False, unicode=True)

soup = BeautifulSoup(html, "html.parser")


def get_explanation(soup):
    explanation = soup.find("b", string=" Explanation: ")
    explanation = str(explanation.parent)

    explanation2 = BeautifulSoup(explanation, "html.parser")
    explanation2 = explanation2.p.center
    explanation2 = str(explanation2)

    text = explanation.replace(explanation2, "")
    text = plaintext(text, linebreaks=2, indentation=True)
    text = str(text).replace("\n", "")
Example #26
 def get_dom(self):
     return web.DOM(web.download(self.url, cached=True))
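
As Example #15 above shows, the DOM object that pattern.web returns can be queried with CSS-style selectors. A minimal hypothetical usage sketch (the enclosing class and its url are not shown, so a URL from Example #7 stands in here purely for illustration):

# Hypothetical usage sketch, not part of the original example.
dom = web.DOM(web.download('http://www.clips.ua.ac.be/', cached=True))
for link in dom('a')[:10]:  # first ten anchor elements
    print(link.attributes.get('href'))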