Example 1
def extract_tvseries(dom):

    url = URL(TARGET_URL)
    dom = DOM(url.download(cached=True))
    #print dom.body.content
    x = 0
    csv_row = []
    for series in dom.by_tag('td.title'):    
        title = series.by_tag('a')[0].content.encode('ascii', 'ignore')
        ranking = series.by_tag('span.value')[0].content.encode('ascii', 'ignore')
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content.encode('ascii', 'ignore') for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content.encode('ascii', 'ignore') for a in actors]
        x = x + 1
        try:
            runtime = series.by_tag('span.runtime')[0].content.encode('ascii', 'ignore')
        except:
            runtime = "Unknown"
        #print x, title, ranking, genres, actors, runtime

        csv_titles = title
        csv_ranking = ranking
        csv_genres = genres
        csv_actors = actors
        csv_runtime = runtime
        row = [csv_titles, csv_ranking, csv_genres, csv_actors, csv_runtime]
        csv_row.append(row)

    return csv_row
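A minimal usage sketch for extract_tvseries, assuming the function above lives in a module that already imports URL and DOM from pattern.web; the TARGET_URL value and the output filename are illustrative assumptions, not part of the original example.

import csv

TARGET_URL = "http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series"  # assumed value

rows = extract_tvseries(None)  # the dom argument is ignored by this implementation
output_file = open('tvseries.csv', 'wb')
writer = csv.writer(output_file)
writer.writerow(['title', 'ranking', 'genres', 'actors', 'runtime'])
writer.writerows(rows)
output_file.close()
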
def scrape_education(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1'
	url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + '01,ALAMEDA&cType=T&cGender=&Submit=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	

	other = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[12].content.replace(',','')
	associates = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[11].content.replace(',','')
	bachelors = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[9].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[10].content.replace(',','')))

	masters = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[4].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[5].content.replace(',','')))
	jurisdoctor = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[3].content.replace(',','')
	doctorate = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[2].content.replace(',','')
	
	bachelors_and_less = str(int(bachelors) + int(associates) + int(other))
	
	post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate))
	
	county = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("a")[0].content

	# write all the collected data to a new row of the output file
	writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors, masters, jurisdoctor, doctorate])
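A possible driver for scrape_education, shown only as a sketch: it supplies the module-level csv `writer` the function writes to and loops over California's 58 county codes; the output filename is made up.

import csv

output_file = open('education_by_county.csv', 'wb')  # hypothetical output path
writer = csv.writer(output_file)
writer.writerow(['county', 'bachelors_and_less', 'post_grad', 'associates', 'bachelors', 'masters', 'jurisdoctor', 'doctorate'])
for county_num in range(1, 59):
    scrape_education(county_num)
output_file.close()
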
Example 3
def summarize(query=None, k=4,url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    summary =[(sentences[i], norm(dot(diag(lsa1.S),lsa1.Vt[:,b]),2)) for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    sorted(summary, key=itemgetter(1))
    summary = dict((v[0],v) for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary)-(k):])
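An illustrative call, heavily hedged: it assumes the LSA class, stopwords and ignore_characters used above are defined in scope, and the article URL is just an example.

text_summary = summarize(url='http://en.wikipedia.org/wiki/Latent_semantic_analysis', k=3)
print text_summary
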
Example 4
def download_pdfs():
    """download pdfs from fda"""

    # where to save pdfs
    path = 'classifier_docs/pdfs/'

    # create directory if it doesn't exist
    if not os.path.exists(path):
        os.makedirs(path)

    # load in non-standard pdf urls from 2012 to serve as control text
    # note: had to lookup urls manually
    # drugs are erivedge (203388) and sirturo (204384)
    # also, menhibrix (125363) has no medical review available 
    urls = ['http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/203388Orig1s000MedRpdf.pdf',
            'http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/204384Orig1s000MedR_.pdf']
    for url in urls:
        m = re.search('20..\/(\d{6})', url)
        app_num = m.group(1)
        url = URL(url)
        # make sure that url points to PDF, print error otherwise
        if url.mimetype in MIMETYPE_PDF:
            # write pdf for medical review if it doesn't exist
            fn = path + app_num + '.pdf'
            if not os.path.exists(fn):
                print "writing {} from {}".format(fn, url)
                f = open(fn, 'w')
                f.write(url.download(cached = False))
                f.close()
            else:
                print "{} already exists".format(fn)
        else:
            print "warning: {} did not resolve to pdf".format(url)

    return
Example 5
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base+link_ext
            movie_urls.append(link)
             

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 6
def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) + "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get("href") for a in dom("a.link")]
    return links
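A short usage sketch; it assumes the snippet above sits in a module that imports urllib and pattern.web's URL and DOM, and the search keyword is only an example.

for link in get_patent_urls("solar cell", limit=5):
    print link
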
    def downloading_csv(self, download_type = 'hist'):
        """ Download the csv information for particular stock.
            download_type can be hist or div. If hist, will download the hist price.
            If div, will download dividend history.
            Kwargs:
                download_type (str): hist or div (default hist).
        """
        self.download_fault = 0

        if download_type == 'hist':
            target_url = self.hist_quotes_full_url
            sav_filename = os.path.join(self.hist_quotes_csvfile_path,'hist_stock_price_'+ self.individual_stock_sym+ '.csv')
        elif download_type == 'div':
            target_url = self.div_history_full_url
            sav_filename = os.path.join(self.hist_quotes_csvfile_path,'div_hist_'+ self.individual_stock_sym+ '.csv')
        else:
            print 'wrong download type'
            raise ValueError('wrong download type')

        url = URL(target_url)
        f = open(self.tempfile_sav_location, 'wb') # temp file for the downloaded csv
        try:
            f.write(url.download())#if have problem skip
        except:
            if self.__print_download_fault: print 'Problem with processing this data: ', target_url
            self.download_fault =1
        f.close()

        if not self.download_fault:
            if self.enable_save_raw_file:
                shutil.copyfile(self.tempfile_sav_location,sav_filename )
Example 8
def extract_data(stock_ticker):
    url_base = 'http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t='
    url_end = '&region=usa&culture=en-US&cur=&order=asc'
    # May add more exchanges later on, but these cover the main US stock exchanges: Nasdaq, New York SE, and Pink Sheets (OTC stocks), respectively
    # Loops through main stock exchanges to get proper URL for data extraction
    stock_exchange_list = ['XNAS:','XNYS:','PINX:'] 
    for exchange in stock_exchange_list:
        test = URL(url_base+exchange+stock_ticker+url_end)
        if sys.getsizeof(test.download()) > 35: #A broken URL produces an empty string, which has memory size 33; size 35 allows for minor variation in the size
            break
    temp_data = 'C:/Users/Owner/Documents/temp.csv'
    f = open(temp_data, mode='w')
    try:
        f.write(test.download())
    except:
        raise IOError('There was an error processing this data')
    f.close()
    try:
        stock_data_df = pd.read_csv(temp_data, header=2, thousands=',', index_col=0, skiprows=[19,20,31,41,42,43,48,58,53,64,65,72,73,95,101,102])
    except:
        os.remove(temp_data)
        raise IOError('There was an error parsing the downloaded data')
    os.remove(temp_data)
    stock_data_df = stock_data_df.transpose()
    return(stock_data_df)
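A hedged example call for the Morningstar extractor above; the ticker is illustrative, and the hard-coded temp path inside the function means this only runs as-is on the original author's machine.

stock_df = extract_data('AAPL')
print stock_df.head()
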
Example 9
def summarize_evaluation(query=None, url=None, summary=None):
    j=[]
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    vectors =[(dot(lsa.S,lsa.U[0,:]),dot(lsa.S,lsa.U[i,:])) for i in range(len(lsa.U))]
    vectors2 =[(dot(lsa2.S,lsa2.U[0,:]),dot(lsa2.S,lsa2.U[i,:])) for i in range(len(lsa2.U))]
    angles = [arccos(dot(a,b)/(norm(a,2)*norm(b,2))) for a in vectors for b in vectors2]
    return str(abs(1 - float(angles[1])/float(pi/2)))
Example 10
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Download the HTML file
    url = URL(url)
    html = url.download()

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Iterate through all 250 table rows on the index page
    for movies in dom('.lister-list > tr'):
        # take the movie's href attribute and put it in href
        href = movies('td.titleColumn a')[0].attrs["href"]
        # append the href attribute to the string, but also add http://www.imdb.com/ in front of it
        movie_urls.append("http://www.imdb.com/" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
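A usage sketch for the scraper above, assuming pattern.web is installed; the chart address is IMDB's standard top 250 URL.

TOP_250_URL = 'http://www.imdb.com/chart/top'
movie_urls = scrape_top_250(TOP_250_URL)
print len(movie_urls)   # 250 if the page layout has not changed
for movie_url in movie_urls[:5]:
    print movie_url
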
Example 11
    def cats(self, namespace=0, start=None, acmin=1, count=100, cached=True, **kwargs):
        """ Returns an iterator over all article titles (for a given namespace id).
        """
        kwargs.setdefault("unicode", True)
        kwargs.setdefault("throttle", self.throttle)
        # Fetch article titles (default) or a custom id.
        id = kwargs.pop("_id", "title")
        id = "*"
        # Loop endlessly (= until the last request no longer yields an "apcontinue").
        # See: http://www.mediawiki.org/wiki/API:Allpages
        while start != -1:
            url = URL(self._url, method=GET, query={
                     "action": "query",
                       "list": "allcategories",
                     "acfrom": start or "",
                    "aclimit": min(count, 500),
                    "acprop": "size",
                    "acmin": max(1, acmin),
                     "format": "json"
            })
            data = url.download(cached=cached, **kwargs)
            data = json.loads(data)
            for x in data.get("query", {}).get("allcategories", {}):
                # print(x)
                if x.get(id):
                    # yield x[id]
                    x['name'] = x.pop('*')
                    yield x

            start = data.get("query-continue", {}).get("allcategories", {})
            start = start.get("accontinue", start.get("acfrom", -1))
        raise StopIteration
Example 12
def getRandomHistoryDOM(language):
    url = URL("http://"+language+".wikipedia.org/wiki/Special:Random")
    #Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        #Grab the name of the wikipedia article from the url
        urlComponents = string.split(redirectUrl, '/')
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)

    #Get the history section of the article
    redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history"
    print "Current article is: " +str(urlComponents[4])
    #print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
Example 13
def process_page():

    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0

    for title in dom.by_class("title"):

        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
    
        try:

            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            #print match.group(1)
            # titleCatalog[domIndex].addRunTime( str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))

        except Exception, e:
            pass

        try:
            titleCatalog[domIndex].addRank( str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass
Example 14
def main():
	table = Datasheet()
	tel = ''
	street = ''
	locality = ''
	title = ''
	for i in range(3):
		page = i+1
		url = 	URL("http://torino.paginegialle.it/pgol/4-veterinari/3-torino/p-%s?mr=50" % page)
		print "collecting from %s" % url
		connection = url.open()
		doc = Document( connection.read() )
		items = doc.by_class('item_sx')
		row = []
		for j, item in enumerate(items):
			divs = item.by_class('address')
			try:	
				title = item.by_class('item_head')[0].by_tag('a')[0].content
			except IndexError, e:
				print >> sys.stderr, "%s" % j, e
				pass
			for z, div in enumerate(divs):
				if div != None:
					try:
						street = div.by_class('street-address')[0].content
						locality = div.by_class('locality')[0].content
						tel = div.by_class('tel')[0].by_class('value')[0].content
					except IndexError, e:
						print >> sys.stderr, "%s" % z, e
						pass
					save = "%s, %s %s, %s \n" % ( plaintext(title), plaintext(street).replace(",", ""), plaintext(locality).replace('(TO)', ''), plaintext(tel).replace(",", "") )
					print >> sys.stderr, save
					row.append(save)
    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Args:
                url_link (str): url str.
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[1] #use for checking valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename )
 
        valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive
 
        url = URL(url_link)
        if url.redirect:
            return # if there is re-direct, return
 
        if file_ext not in valid_image_ext_list:
            return #return if not valid image extension
 
        f = open(temp_filename_full_path, 'wb') # save as test.gif
        print url_link
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link )
        try:
            f.write(url.download())#if have problem skip
        except:
            #if self.__print_download_fault:
            print 'Problem with processing this data: ', url_link
            self.download_fault =1
        f.close()
Example 16
def getQuotes(sym):
	frontUrl = "http://real-chart.finance.yahoo.com/table.csv?s="
	endUrl = "&amp;a=10&amp;b=8&amp;c=1997&amp;d=10&amp;e=8&amp;f=2015&amp;g=d&amp;ignore=.csv"
	
	failed = []
	count = 1

	for ticker in sym:
		fname = "quotes/" + ticker + ".csv"
		df = object()
		tickerUrl = frontUrl + ticker + endUrl
		url = URL(tickerUrl)
		f = open(fname, 'wb')
		try:
			f.write(url.download())
		except:
			print "quotes csv download failed: " + ticker
			failed.append(ticker)
			count += 1
			continue
		f.close()
		count+=1
		print "progress: " + str(count) + "/" + str(len(sym))

	return failed
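A minimal driver for the quote downloader above; the ticker list is illustrative, and the quotes/ directory must exist beforehand because the function opens files inside it.

import os

if not os.path.exists('quotes'):
    os.makedirs('quotes')
failed_tickers = getQuotes(['AAPL', 'MSFT', 'GOOG'])
print "failed downloads:", failed_tickers
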
Example 17
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href","")
            absoluteUrl = homeUrl + link
            movie_urls.append(absoluteUrl)
        
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_starrtest(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	
	#sciend_num = dom.by_class("rm")[4].content
	scicst_num = dom.by_class("rm")[3].content
	math_num = dom.by_class("rm")[2].content
	hist_num = dom.by_class("rm")[1].content
	ela_num = dom.by_class("rm")[0].content
	
	#sciend_percent = dom.by_class("rs")[4].content[:5]
	scicst_percent = dom.by_class("rs")[3].content[:5]
	math_percent = dom.by_class("rs")[2].content[:5]
	hist_percent = dom.by_class("rs")[1].content[:5]
	ela_percent = dom.by_class("rs")[0].content[:5]
	
	county = dom.by_tag("h2")[0].content
	
	
	# write all the collected data to a new row of the output file
	writer.writerow([county, ela_num,ela_percent, hist_num, hist_percent, math_num, math_percent,scicst_num, scicst_percent])
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """

    # The url argument is not used here; the DOM is built by downloading TOP_250_URL directly.
    TOP_250_URL = "http://www.imdb.com/chart/top"
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []

    """
    Searches in the HTML of the top 250 page of IMDB for the urls of the individual pages per film.
    Uses CSS selectors to find the right urls and subsequently places them in a list
    """

    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def download_single_image(url_link, pic_prefix_str, target_folder, image_size):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    file_ext = os.path.splitext(url_link)[1] #use for checking valid pic ext
    temp_filename = pic_prefix_str + ".jpg"
    temp_filename_full_path = os.path.join(target_folder, temp_filename)

    valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive

    url = URL(url_link)
    if url.redirect:
        return # if there is re-direct, return

    if file_ext not in valid_image_ext_list:
        return #return if not valid image extension

    print url_link
    try:
        response = url.download()
        img = resize_image(response, image_size)
        img.save(temp_filename_full_path, "JPEG")
    except Exception as e:
        #if self.__print_download_fault:
        print 'Problem with processing this data: ', str(e), url_link
Example 21
def convertMapData():
    print '[2/2] Convert map data'

    # output dictionary
    d3mapData = {}

    # download the file
    url = URL(DATASET3)
    data = url.download()

    # create array
    data = list(json.loads(data))

    # fill output dictionary
    for dataRow in data:
        if dataRow['Year'] == '2014':
            population = dataRow['Value']
            fillColor = defineColor(dataRow['Value'])
            d3mapData[dataRow['Country Code']] = {'population': population, 'fillKey': fillColor}

    print '[2/2] Write to json'

    # write output dictionary to json file
    with open('D3LinkedViews/data_map.json', 'wb') as output_file:
        json.dump(d3mapData, output_file)

    print '[2/2] Finish'
Example 22
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))
    
    # collect the absolute URL to each movie's page (abs() here is pattern.web's helper)
    for e in dom('.titleColumn'):
        for link in e('a'):
            movie_urls.append(abs(link.attributes.get('href'), base=url.redirect or url.string))
            
    # return url list
    return movie_urls
Example 23
def get_by_year(year):

    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    
    dictAll = {}
    
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)

    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])

        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name),id))
            
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner),winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] =  winnersAndNominees
    return dictAll
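An illustrative call, assuming clean_unicode and the pattern.web imports used above are available; the year is arbitrary.

awards = get_by_year(2014)
for award_name, entry in awards.items():
    print award_name, '->', entry['winner']
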
Example 24
def dl_byUrllib2(url, filename):
    myurl = URL(url)
    if os.path.exists(filename):
        return
    with open(filename,'wb') as fp:
        fp.write(myurl.download(cached=False))
        fp.close()
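A one-line usage sketch; both the remote URL and the local filename are placeholders.

dl_byUrllib2('http://www.example.com/files/report.pdf', 'report.pdf')
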
Example 25
def getContributorInfo(devUrl):

    url          = URL(devUrl)
    contribInfo  = json.loads(url.download())


    """
Example 26
def downloadPDFs(dictListJSON, state, jsonExists = False):
    #state = dictListJSON[0, 2]
    dlJSONFile = open(dictListJSON, "r")
    dictJSON = json.load(dlJSONFile)
    dlJSONFile.close()
    #some condition to check if the JSON already exists
    if jsonExists:
        pdfDictList = dictJSON
    else:
        pdfDictList = findPDFLinks(dictJSON, state)


    count = 0
    for dict in pdfDictList:
        #test if date > 01/01/13
        fileName = "".join(str(dict["AdvertiserInfo"]).split())
        print "Writing to " + fileName
        url = dict["PDFLink"]
        url = re.sub(' ', '%20', url)
        print url
        if url != "NO URL":
            urlOpened = URL(url)
            f = open(fileName, 'wb')
            #download to state pdfs directory
            f.write(urlOpened.download(cached=False))
            f.close()
        count += 1
        if count > 4:
            break
Example 27
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached = True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href","")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 28
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom("h3 a")[0].content)
    body = plaintext(dom("#contents")[0].content)
    return [title, body]
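A hedged combination with get_patent_urls from the earlier snippet (an assumption that both functions live in the same module); the query is only an example.

for patent_url in get_patent_urls("wind turbine", limit=3):
    title, body = get_patent(patent_url)
    print title
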
Example 29
def loadPage(numPage):
    #Load the content from the given page
    url = URL(url_estruc_1 + str(numPage) + url_estruc_2)
    dom = DOM(url.download(cached=True))
    for row in dom(ROWS_PATH)[1:]:
        #pprint.pprint(plaintext(row(CELLS_PATH)[0].content))
        RESULTS.append({"place": plaintext(row(CELLS_PATH)[0].content), "place_gender": plaintext(row(CELLS_PATH)[1].content) })
    pprint.pprint(str(numPage + 1) + "/" + str(last_page))
Example 30
def read_web(url):
    html = ''
    start = etime()
    try:
        uri = URL(url)
        html = uri.download(cached=True)
    except Exception, e:
        print 'HTTP Error:' + str(e.message)
Example 31
def get_dom(url):
    
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return None

    #for AJE compatibility
    try:
        s_content = s_content.decode('unicode_escape')
    except (UnicodeEncodeError):
        pass
    
    return Document(s_content)
    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Args:
                url_link (str): url str.
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[
            1]  # use for checking valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath,
                                               temp_filename)

        valid_image_ext_list = [
            '.png', '.PNG', '.jpg', '.jpeg', '.JPG', '.JPEG', '.gif', '.GIF',
            '.bmp', '.BMP', '.tiff', '.TIFF'
        ]  # not comprehensive

        if type(url_link) is int:
            return

        url_link = urllib.unquote(url_link).decode('utf8')
        print(url_link)

        if self.is_image_watermarked(url_link):
            return

        url = URL(url_link)

        try:

            if url.redirect:
                return  # if there is re-direct, return

            if file_ext not in valid_image_ext_list:
                return  # return if not valid image extension

            self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
            downloaded_img = url.download()

            if len(downloaded_img) > 0:  # sometimes download is empty
                f = open(temp_filename_full_path, 'wb')  # save as test.gif
                f.write(downloaded_img)  # if have problem skip
                f.close()
        except:
            # if self.__print_download_fault:
            print 'Problem with processing this data: ', url_link
            self.download_fault = 1
Example 33
def scrape(url, f):

    week = url.split("/")
    week = week[-1]
    url = URL(url)
    dom = DOM(url.download(cached=True))
    # gives the week
    i = 1
    # select the top 40 list

    for l in dom.by_tag("ol.top40"):
        # select each track
        print "lijst top 40"
        for e in l.by_tag("div.clearfix")[0:40]:
            muziekGegevens = ""
            # position in the top 40
            muziekGegevens += str(i) + ","
            print i, 'positie'
            i += 1  # careful with resetting
            # select the artist
            for artiest in e.by_class(
                    "credit"):  # error: not too many elements!
                muziekGegevens += artiest.content + ","
            # position
            for inner in e.by_tag("strong")[1:2]:
                print inner.content, "1:2"
                muziekGegevens += inner.content + ","
            # highest chart position
            for inner in e.by_tag("strong")[2:3]:
                print inner.content, "2:3"
                muziekGegevens += inner.content + ","
            # number of points
            for inner in e.by_tag("strong")[3:4]:
                print inner.content, "3:4"
                muziekGegevens += inner.content + ","
            # year of the track
            for inner in e.by_tag("strong")[4:5]:
                print inner.content.strip(), "4:5"
                muziekGegevens += inner.content.strip()
            h = HTMLParser.HTMLParser()
            muziekGegevens = h.unescape(muziekGegevens)

            if not whatisthis(muziekGegevens):
                muziekGegevens = unicode(muziekGegevens, "utf-8")
                print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                f.write(muziekGegevens + "\n")
            else:
                f.write(muziekGegevens + "\n")
    def agregarInformacionDocumento(self, url, contenido):
        """Metodo para obtener diferentes partes del documento"""
        try:
            unaUrl = URL(url)
            if not 'pdf' in extension(unaUrl.page):
                html = contenido
                unElemento = Element(self.descargarContenidoHtml(url))
                body = self.getBody(unElemento)
                urlValues = self.getUrlValues(unElemento)
                titulo = self.getTitulo(unElemento)

                html = self.verificarContenidoVacio(html)
                body = self.verificarContenidoVacio(body)
                urlValues = self.verificarContenidoVacio(urlValues)
                titulo = self.verificarContenidoVacio(titulo)

                self.mongodb.setInformacionDocumento(html, url, titulo,
                                                     urlValues, body)
            else:
                html = self.verificarContenidoVacio(contenido)
                body = ""
                urlValues = ""
                titulo = ""
                self.mongodb.setInformacionDocumento(html, url, titulo,
                                                     urlValues, body)
        except Exception as e:
            print str(e)
Example 35
    def download_single_image(self, url_link, pic_prefix_str):

        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[1] 
        #print(pic_prefix_str, file_ext)
        temp_filename = pic_prefix_str + str(file_ext)
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename )
 
        valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive
 
        url = URL(url_link)
        if url.redirect:
            print("RD")
            return 
 
        if file_ext not in valid_image_ext_list:
            print("Invalid file type")
            return 
 
        f = open(temp_filename_full_path, 'wb')
        print(url_link)
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link )
        try:
            urllib.request.URLopener.version = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"
            #f.write(url.download(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"))
            f.write(urllib.request.urlopen(url_link).read())
            #urllib.request.urlretrieve(url_link, temp_filename_full_path)
        except:
            print('Problem with processing this data: ', url_link)
            self.download_fault =1
        f.close()
Example 36
 def busca_google(self,
                  service,
                  versao='1.0',
                  inicio=0,
                  quant=8,
                  lingua='pt_br',
                  ip=None):
     if not ip:
         ip = socket.gethostbyname(socket.gethostname())
     query = urllib.urlencode({
         'v': versao,
         'start': str(inicio),
         'rsz': str(quant),
         'hl': lingua,
         'q': self.termo,
         'userip': ip
     })
     apiurl = "http://ajax.googleapis.com/ajax/services/search/"
     queryurl = apiurl + service + '?' + query
     try:
         search_results = URL(queryurl).download()
         json = simplejson.loads(search_results)
         results = json['responseData']['results']
         return results
     except (TypeError, URLError):
         print 'erro para a busca Google', service, self.termo
         pass
Example 37
 def busca(self):
     apiurl = 'http://data.alexa.com/data?cli=10&dat=snbamz&url='
     queryurl = apiurl + self.url
     try:
         return URL(queryurl).download()
     except:
         pass
Example 38
    def startClouds(self, urls):
        """
        statClouds: Recibe un conjunto de url's en la lista urls.

        Por cada URL Esta creando un grafo dirigido, ubicando como unico nodo, al que esta en n[0], para ello crea un objeto
        Structure, al cual se le añade el grafo dirigido de un elemento y el dominio de la url, obtenido por url.domain
        """
        clouds = list()
        for n in urls:  # iterate over each url in urls (urls is a list in which each url is itself a list)
            url = URL(n[0])  # create a pattern.web URL object from the url stored at position 0 of n
            graph = nx.DiGraph()  # initialize an empty directed graph (self-loops allowed)
            graph.add_node(
                n[0],
                select=True,
                ID=0,
                weight_VSM=0.0,
                weight_WA=0.0,
                weight_OKAPI=0.0,
                weight_SVM=0.0,
                weight_CRANK=0.0,
                totalScore=0.0,
                link=n[0],
                methodData=None,
            )
            clouds.append(Structure(graph, url.domain))  # create a Structure object
        return clouds
Example 39
def crawl(topic, N=100, Nbatch=25):
    t = Twitter()  # language='en','id'
    M = N // Nbatch  #integer
    i, Tweets, keepCrawling = None, [], True
    for j in tqdm(range(M)):
        if keepCrawling:
            for tweet in t.search(topic, start=i, count=Nbatch):
                try:
                    Tweets.append(tweet)
                    i = tweet.id
                except:
                    print("Twitter Limit reached")
                    keepCrawling = False  # Second Break (outer loop)
                    break
        else:
            break
    print('Making sure we get the full tweets, please wait ...')
    for i, tweet in enumerate(tqdm(Tweets)):
        try:
            webPage = URL(tweet.url).download()
            soup = bs(webPage, 'html.parser')
            full_tweet = soup.find_all(
                'p',
                class_='TweetTextSize')[0]  #modify this to get all replies
            full_tweet = bs(str(full_tweet), 'html.parser').text
            Tweets[i]['fullTxt'] = full_tweet
        except:
            Tweets[i]['fullTxt'] = tweet.txt
    print('Done!... Total terdapat {0} tweet'.format(len(Tweets)))
    return Tweets
Example 40
def main():
    '''
    Crawl the IMDB top 250 movies, save CSV with their information.

    Note:
        This function also makes backups of the HTML files in a sub-directory
        called HTML_BACKUPS (those will be used in grading).
    '''

    # Create a directory to store copies of all the relevant HTML files (those
    # will be used in testing).
    print 'Setting up backup dir if needed ...'
    create_dir(BACKUP_DIR)

    # Make backup of the IMDB top 250 movies page
    print 'Access top 250 page, making backup ...'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)

    # extract the top 250 movies
    print 'Scraping top 250 page ...'
    url_strings = scrape_top_250(top_250_url)

    # grab all relevant information from the 250 movie web pages
    rows = []
    for i, url in enumerate(url_strings):  # Enumerate, a great Python trick!
        print 'Scraping movie %d ...' % i
        # Grab web page
        try:
            movie_html = URL(url).download(cached=True)
        except urllib2.URLError:
            print "Url timeout"
            time.sleep(10)

        # Extract relevant information for each movie
        movie_dom = DOM(movie_html)
        rows.append(scrape_movie_page(movie_dom))

        # Save one of the IMDB's movie pages (for testing)
        if i == 83:
            html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
            make_backup(html_file, movie_html)

    # Save a CSV file with the relevant information for the top 250 movies.
    print 'Saving CSV ...'
    save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
    def retrieve_shortsell_info(self):
        """ Retrieve the shortsell information.
            will form the url and retrieved the information using pandas to make into table.
            The function will set to self_shortsell_info_df.
            make it iterat over the days to get the latest data
        """
        for last_effective_date in range(7):
            self.form_shortsell_url(last_effective_date)
            url = URL(self.shortsell_full_url)
            try:
                #see data is available for that current date
                url_data = url.download(timeout=50)
                shortsell_list = pandas.io.html.read_html(url_data)
                self.shortsell_info_df = shortsell_list[1]
            except:
                continue

            #continue if there is no data
            if len(self.shortsell_info_df) == 0: continue

            self.shortsell_info_df.rename(columns={
                0: 'Security',
                1: 'Short Sale Volume',
                2: 'Currency',
                3: 'Short Sale Value',
            },
                                          inplace=True)
            self.shortsell_info_df = self.shortsell_info_df[1:-3]
            #change type of the columns
            self.shortsell_info_df[['Short Sale Volume', 'Short Sale Value'
                                    ]] = self.shortsell_info_df[[
                                        'Short Sale Volume', 'Short Sale Value'
                                    ]].astype(float)
            #need a rank on the short sell
            self.shortsell_info_df[
                'ranked_shortsell'] = self.shortsell_info_df[
                    'Short Sale Volume'].rank(method='min', ascending=False)
            self.shortsell_info_df[
                'shortsell_lastdate'] = self.set_last_desired_date(
                    last_effective_date)
            #need percentage as well

            # have a sorting of data?
            return

        print 'No suitable data found within time frame.'
        return
Example 42
 def detect(self,link):
     url=URL(link)
     #print url.domain
     if re.search(self.badLinks,url.domain)!=None:
         bad=True
     else:
         bad=False
     return bad
Example 43
    def get_dom_object(self, url_target):
        """ Get dom object based on element for scraping
            Take into consideration that there might be query problem.
            Args:
                url_target (str): url link to be searched.
            Returns:
                (DOM): dom object correspond to the url.

        """
        try:
            url = URL(url_target)
            dom_object = DOM(url.download(cached=True))
        except:
            print 'Problem retrieving data for this url: ', url_target
            self.url_query_timeout = 1
            dom_object = None

        return dom_object
Example 44
    def url_site_download(self):
        """ Download the csv information for particular stock depending on the retrieval type.
            Retrieval type determine by self.retrieval_type
            Return:
                (str): output html from url.
        """
        self.download_fault = 0
        self.form_url_str()

        url = URL(self.target_full_url)
        try:
            return url.download()
        except:
            if self.__print_download_fault:
                print('Problem with processing this data: ',
                      self.target_full_url)
            self.download_fault = 1
            return None
Example 45
 def busca(self, service, query):
     apiurl = 'https://socialgraph.googleapis.com/'
     queryurl = apiurl + service + '?' + query
     try:
         search_results = URL(queryurl).download()
         results = simplejson.loads(search_results)
         return results
     except:
         print 'erro socialgraph'
Example 46
def detect(link):
    badLinks = 'youtube|linkedin|amazon|books.google|facebook|twitter|instagram|plus.google|yahoo|ebay|ebayinc|flickr|t.co|.google.|youtu.be|microsoft|microsoftstore'
    url = URL(link)
    #print url.domain
    if re.search(badLinks, url.domain) != None:
        bad = True
    else:
        bad = False
    return bad
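Two example calls for the link filter above; the URLs are made up and pattern.web's URL must be importable.

print detect('https://twitter.com/some_user')      # True: 'twitter' is in the block list
print detect('http://example.com/article.html')    # False
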
    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Args:
                url_link (str): url str.
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[
            1]  # use for checking valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath,
                                               temp_filename)
        temp_filename_full_path = temp_filename_full_path.replace("+", " ")
        folder_name = temp_filename_full_path.split("/")
        if not os.path.exists(
                temp_filename_full_path.replace(folder_name[-1], "")):
            os.makedirs(temp_filename_full_path.replace(folder_name[-1], ""))
        valid_image_ext_list = [
            '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'
        ]  # not comprehensive

        url = URL(url_link.replace("%2F", "/").replace("%3A", ":"))
        try:
            if url.redirect:
                return  # if there is re-direct, return
            if file_ext not in valid_image_ext_list:
                return  # return if not valid image extension

            f = open(temp_filename_full_path, 'wb')  # save as test.gif
            print(url_link)
            self.pic_info_list.append(pic_prefix_str + ': ' + url_link)

            image = url.download()
            # import matplotlib.pyplot as p
            # p.imshow(image)
            # p.show(image)
            f.write(image)
            f.close()
        except:
            # if self.__print_download_fault:
            print('Problem with processing this data: ', url_link)
            self.download_fault = 1
Example 48
    def parseUrl(urlString):
        match = re.search('//', urlString)
        if not match:
            urlString = '//' + urlString

        url = urlparse.urlsplit(urlString)
        if not url.scheme:
            url = url._replace(scheme='http')

        return URL(url.geturl())
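A brief illustration, treating parseUrl as a plain function and assuming `import re`, `import urlparse` and pattern.web's URL are in scope.

print parseUrl('www.imdb.com/chart/top').string   # http://www.imdb.com/chart/top
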
Example 49
 def busca_topsy(self, service):
     apiurl = 'http://otter.topsy.com/'
     queryurl = apiurl + service + '.json?' + self.query
     try:
         search_results = URL(queryurl).download()
         json = simplejson.loads(search_results)
         resultados = json['response']
         return resultados
     except:
         pass
Example 50
 def __init__(self, main_website):
     self.main_website = main_website
     self.browser_main = webdriver.PhantomJS()
     self.browser_main.set_window_size(1024, 768)
     self.browser_main.get(self.main_website)
     self.website_main = self.browser_main.page_source
     self.browser_main.quit()
     self.dom = web.Element(self.website_main)
     self.links = self.dom.by_class('expanded')
     self.main_url = URL(self.main_website)
Example 51
def procesarSumario(url_sumario, allDocs):

    print url_sumario
    content = URL(url_sumario).download()
    xml = etree.XML(content)
    ids = etree.XPath("//item/@id")
    for id in ids(xml):
        url_doc = url_boe.format(id)
        allDocs.append(url_doc)
Example 52
def box_office_titles():
    # download the webpage
    html = URL(BOX_OFFICE_URL).download()
    dom = DOM(html)

    # find the movie titles
    title_elements = dom(MOVIE_TITLE_TAG)
    titles = map(lambda x: x.content, title_elements)

    return titles
def inflect(word, language="italian"):

    inflections = {}
    url = "http://en.wiktionary.org/wiki/" + word.replace(" ", "_") 
    dom = DOM(URL(url).download(throttle=10, cached=True))

    pos = ""

    # Search the header that marks the start for the given language:
    # <h2><span class="mw-headline" id="Italian">Italian</span></h2>

    e = dom("#" + language)[0].parent

    while e is not None: # e = e.next_sibling

        if e.type == "element":

            if e.tag == "hr": # Horizontal line = next language.
                break

            if e.tag == "h3": # <h3>Adjective [edit]</h3>
                pos = plaintext(e.content.lower())
                pos = pos.replace("[edit]", "").strip()[:3].rstrip("ouer") + "-"

            # Parse inflections, using regular expressions.

            s = plaintext(e.content)

            # affetto m (f affetta, m plural affetti, f plural affette)

            if s.startswith(word):

                for gender, regexp, i in (
                  ("m" , r"(" + word + r") m", 1),
                  ("f" , r"(" + word + r") f", 1),
                  ("m" , r"(" + word + r") (mf|m and f)", 1),
                  ("f" , r"(" + word + r") (mf|m and f)", 1),
                  ("m" , r"masculine:? (\S*?)(,|\))", 1),
                  ("f" , r"feminine:? (\S*?)(,|\))", 1),
                  ("m" , r"(\(|, )m(asculine)? (\S*?)(,|\))", 3),
                  ("f" , r"(\(|, )f(eminine)? (\S*?)(,|\))", 3),
                  ("mp", r"(\(|, )m(asculine)? plural (\S*?)(,|\))", 3),
                  ("fp", r"(\(|, )f(eminine)? plural (\S*?)(,|\))", 3),
                  ( "p", r"(\(|, )plural (\S*?)(,|\))", 2),
                  ( "p", r"m and f plural (\S*?)(,|\))", 1)):
                    m = re.search(regexp, s, re.I)
                    if m is not None:
                        # {"adj-m": "affetto", "adj-fp": "affette"}
                        inflections[pos + gender] = m.group(i)

            #print s

        e = e.next_sibling

    return inflections
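An example call; the word is arbitrary, the output depends on the live Wiktionary page, and the language argument must match Wiktionary's section id (e.g. "Italian").

print inflect("affetto", language="Italian")   # e.g. {'adj-m': 'affetto', 'adj-fp': 'affette'}
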
Example 54
    def get_raw_wikihow_page(title=None):
        if title is not None and 'how to' in title.lower():
            title = title.lower().replace('how to', '', 1).strip()

        # keep to "human" articles
        #allowed_cats = ['Youth', 'Family Life', 'Relationships', 'Personal Care and Style', 'Work World']
        allowed_cats = ['Youth', 'Family Life', 'Relationships']
        main_cat = ""
        s = ""

        while main_cat not in allowed_cats:
            try:
                s = URL('http://www.wikihow.com/{}'.format(title)).read() if title is not None \
                    else URL('http://www.wikihow.com/Special:Randomizer').read()
                main_cat = Element(s)('ul#breadcrumb li a')[2].string
                print(main_cat)
            except:
                time.sleep(5)

        return s
Example 55
def downloadText(link, dir, filename, sleep):
    """Downloads PDF file at given link and places in dir"""
    cur = os.getcwd()
    if not os.path.exists(dir):
        os.makedirs(dir)
    os.chdir(dir)
    Ddir = os.getcwd()
    files = [
        f for f in os.listdir(Ddir) if os.path.isfile(os.path.join(Ddir, f))
    ]
    if filename + '.pdf' not in files:
        url = URL(link)
        try:
            f = open(filename + '.pdf', 'wb')
            f.write(url.download(cached=False))
            f.close()
            print filename + ' stored'
            time.sleep(sleep)
        except pattern.web.HTTP500InternalServerError, e:
            print '\n ' + filename + ' link broken' + '\n '
Example 56
def extract_data_ML(i):
    url = 'http://macaulaylibrary.org/audio/%s' % i
    page = URL(url).download()
    dom = DOM(page)
    description = dom('meta')[0].attr['content']
    result = [x.content for x in dom('script') if 'jwplayer(' in x.content][0]
    result = [
        x.strip() for x in result.split('\n') if x.strip().startswith('file')
    ][0]
    path_to_mp3 = result.split('"')[1]
    return {'index': i, 'desc': description, 'mp3': path_to_mp3}
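A hedged example call; the catalog number is made up and the Macaulay Library page layout may have changed since this snippet was written.

entry = extract_data_ML(12345)
print entry['desc']
print entry['mp3']
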
Example 57
def summarize_evaluation(query=None, url=None, summary=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :]))
               for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :]))
                for i in range(len(lsa2.U))]
    angles = [
        arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))) for a in vectors
        for b in vectors2
    ]
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
Example 58
def reply_tweet(tweet, reply_id, reply_user="******"):
    from pattern.web import URL, Twitter

    tweet = reply_user + " " + tweet
    url = URL("https://api.twitter.com/1.1/statuses/update.json",
              method="post",
              query={
                  "status": tweet,
                  "in_reply_to_status_id": reply_id
              })

    twitter = Twitter(license=ccpattern)
    url = twitter._authenticate(url)

    try:
        # Send the post request.
        url.open()
    except Exception as e:
        print e
        print e.src
        print e.src.read()
Example 59
    def research_on(self, what, where):

        url = URL(
            "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
            what + "&ou=" + where + "&proximite=0")
        dom = DOM(url.download(cached=True))

        for a in dom.by_tag("div.main-title pj-on-autoload "):
            for e in a.by_tag("span.denombrement"):
                number_of_results = int(
                    self.decode_if_unicode(plaintext(e.content))[:3])

        number_of_page_results = number_of_results / 20
        if (number_of_results % 20 > 0):
            number_of_page_results += 1

        self.exctract_values(dom, self.myInfo)

        for i in range(2, number_of_page_results + 1):
            url = URL(
                "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
                what + "&ou=" + where + "&proximite=0+"
                "&page=" + str(i))
            dom = DOM(url.download(cached=True))
            self.exctract_values(dom, self.myInfo)

        self.myInfo.sort_and_merge()
Example 60
 def get(self):
     url = URL("http://www.ltconline.ca/WebWatch/ada.aspx")
     try:
         dom = DOM(url.download(cached=True))
     except (HTTP404NotFound, URLTimeout):
         return {
             "message": "LTC WebWatch service looks down",
             "status": 408,
         }, 408
     routes = []
     for a in dom("a.ada"):
         a_split = a.content.split(",")
         route = a_split[0].strip()
         try:
             route = int(route)
         except ValueError:
             pass
         routes.append({
             "route": route,
             "name": a.content.split(", ", 1)[1].strip().title(),
         })
     return routes