def _extractTextFromHtml(self, content, webPage):
    from BeautifulSoup import BeautifulSoup
    import re
    import urlparse
    from consider.rules import inputrules
    from consider import diff

    unprocessedSoup = BeautifulSoup(''.join(content))
    soup = BeautifulSoup(unprocessedSoup.prettify())

    # Drop tags whose contents are never visible text.
    tagsToStrip = ['script', 'style', 'menu']
    for currentTag in tagsToStrip:
        for junkSection in soup.body.findAll(currentTag):
            junkSection.extract()

    # Drop elements hidden via inline CSS.
    stylesToStrip = ['display:none', 'display: none']
    for currentStyle in stylesToStrip:
        for junkSection in soup.body.findAll(style=currentStyle):
            junkSection.extract()

    # Apply any site-specific clean-up rules keyed on the hostname.
    hostname = urlparse.urlparse(webPage).hostname
    for rule in inputrules.nameRules:
        if re.search(rule, hostname):
            soup = inputrules.nameRules[rule](soup)

    processedContent = soup.body(text=True)
    processedContent = [diff.unescapeEntities(line) for line in processedContent]
    return processedContent
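# The host-specific hook above assumes inputrules.nameRules is a dict mapping
# a hostname regex to a callable that takes and returns a soup. A minimal,
# hypothetical entry (not from the real consider.rules module) might be:
def _stripSiteChrome(soup):
    # e.g. drop a site's navigation block before text extraction
    for nav in soup.findAll('div', attrs={'id': 'navigation'}):
        nav.extract()
    return soup

nameRules = {
    r'(^|\.)example\.com$': _stripSiteChrome,
}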
from BeautifulSoup import BeautifulSoup, Comment

def parse_text(text):
    soup = BeautifulSoup(text)
    # Strip <script> blocks and HTML comments before pulling text.
    for tag in soup.findAll('script'):
        tag.extract()
    for comment in soup.findAll(text=lambda x: isinstance(x, Comment)):
        comment.extract()
    words = filter(lambda x: len(x) > 0,
                   [wordify(s) for s in soup.body(text=True)])
    words = reduce(reduce_func, words)
    return words
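# parse_text leans on two helpers that aren't shown; plausible stand-ins
# (names from the snippet, bodies assumed) could look like:
import re

def wordify(s):
    # split a text node into lowercase word tokens
    return re.findall(r'\w+', s.lower())

def reduce_func(acc, words):
    # fold the per-node token lists into one flat list
    return acc + words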
def searchUrl(url, searchText, caseSensitive):
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc
    netlocSplit = netloc.split('.')
    if netlocSplit[-2] + netlocSplit[-1] != website:
        return
    global urlList
    if url in urlList:    # prevent visiting the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return
    # if not an HTML file then return
    if urlContent.find('<html') == -1 and urlContent.find('<HTML') == -1:
        return
    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    for scriptTag in soup.findAll('script'):
        scriptTag.extract()
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts)
    # search
    if caseSensitive:
        if text.find(searchText) > -1:
            print url
            print
    else:
        if text.lower().find(searchText.lower()) > -1:
            print url
            print
    # if there are links on the webpage then recursively repeat
    for linkTag in soup.findAll('a'):
        try:
            linkUrl = linkTag['href']
            # if relative URL then convert to absolute
            if urlparse.urlsplit(linkUrl).scheme == '':
                linkUrl = urlparse.urlsplit(url).scheme + '://' + netloc + '/' + linkUrl
            searchUrl(linkUrl, searchText, caseSensitive)
        except:
            pass
def parse_index_html(limit=100):
    # Fetch the ASCL compact listing and yield (ascl_id, relative_url) pairs.
    url = ('http://ascl.net/code/all/page/1/limit/{0}'
           '/order/date/listmode/compact/dir/desc').format(limit)
    parsed_html = BeautifulSoup(urllib2.urlopen(url))
    return ((item.find('span', attrs={'class': 'ascl_id'}).text,
             item.find('span', attrs={'class': 'title'}).find('a')['href'][1:])
            for item in parsed_html.body('div', attrs={'class': 'item'}))
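# Hypothetical usage of parse_index_html, consuming the generator it returns:
for ascl_id, rel_url in parse_index_html(limit=10):
    print ascl_id, 'http://ascl.net/' + rel_url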
def extract_content(url):
    # XXX for now just the body text
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read())
    body = soup.body(text=True)
    # XXX maybe should let Xapian do this.
    body = [line.strip() for line in body if line.strip() not in ('\n', '')]
    body = ' '.join(body)
    body = body.replace('\n', '')
    body = body.encode('utf8')
    return body, soup.title.text
def RunConversion():
    global DBlist, DBdict
    path = "manpages/"
    dirList = os.listdir(path)
    for fname in dirList:
        if fname.endswith(".html"):
            DBdict = dict()
            content = False
            print "\nReading", fname
            newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
            f = open(path + fname, 'r')
            content = f.read()
            f.close()
            if content:
                try:
                    # Drop the "converted by man2html" footer lines.
                    content = re.sub(".*[Mm]an.*converted.*", "", content)
                    content = re.sub(".*man2html.*", "", content)
                    soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                    c = ''.join(soup.body(text=True))
                    f = open(path + newstring, 'w')
                    towrite = c.encode('utf-8')
                    cleandata = re.search("(\w+\(.*)", towrite, re.S).group(1)
                    # Strip the whole name; the original applied .strip() to
                    # the literal ")" alone, which was a no-op.
                    base = fname.split('.')[0]
                    DBdict['name'] = (base[:-1] + "(" + base[-1:] + ")").strip()
                    DBdict['cleandata'] = cleandata.strip()
                    match = re.search("NAME\n(.+?)\n", cleandata, re.S)
                    if match:
                        DBdict['header'] = match.group(1).strip()
                    else:
                        DBdict['header'] = base[:-1]
                    DBlist.append(DBdict)
                    f.write(cleandata)
                    f.close()
                    print newstring, " done !"
                except TypeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write("\tError " + fname + " - " + str(e) + "\n")
                except UnicodeEncodeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write("\t\tError " + fname + " - " + str(e) + "\n")
                except AttributeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write("\t\t\tError " + fname + " - " + str(e) + "\n")
def login(self):
    home_page_url = self.base_url() + '/en/'
    logging.info('fetching home page from %s', home_page_url)
    home_page_content = self.fetcher(home_page_url, deadline=10).content
    home_page = BeautifulSoup(home_page_content)

    login_url = None
    login_anchor = home_page.body.find('a', attrs={'id': 'myAccount'})
    if login_anchor:
        login_url = login_anchor['href'].strip()
    if not login_url:
        self.raise_login_error("can't find login url on home page")

    logging.info('fetching login page from %s', login_url)
    login_page_content = self.fetcher(login_url, deadline=10).content
    login_page = BeautifulSoup(login_page_content)
    login_form = login_page.body('form', attrs={'id': 'loginPageForm'})[0]
    if not login_form:
        self.raise_login_error("can't find login form on login page")

    # Carry over every input field so hidden tokens survive the round trip.
    form_fields = {}
    for input_field in login_page.findAll(name='input'):
        if input_field['type'] == 'submit':
            form_fields['submit'] = input_field['name']
        else:
            form_fields[input_field['name']] = input_field.get('value', '')
    form_fields.update({
        'j_username': self.card.number,
        'j_password': self.card.pin,
    })

    submit_login_url = urlparse.urljoin(login_url, login_form['action'])
    logging.info('submitting login information to %s', submit_login_url)
    login_response = self.fetcher(submit_login_url, form_fields)
    login_response_content = login_response.content

    redirect_to_url = re.search(r"RedirectAfterLogin\('([^']+)'\)",
                                login_response_content)
    if not redirect_to_url:
        self.raise_login_error("Can't find redirect. Login failed.")
    logging.info('redirecting to %s', redirect_to_url.group(1))
    return redirect_to_url.group(1)
def getgenres():
    from BeautifulSoup import BeautifulSoup
    from urllib import urlopen

    url = 'http://www.multimediasoft.com/amp3dj/help/amp3dj_00003e.htm'
    soup = BeautifulSoup(urlopen(url))
    genres = [None for _ in xrange(256)]
    for div in soup.body('div', 's0'):
        # Each div holds "NN - Genre Name"; undo the entity escaping first.
        val = div.renderContents().replace('\xc2\xa0', ' ')
        val = val.replace('&nbsp;', ' ').replace('&amp;', '&')
        try:
            i, genre = val.split('-', 1)
            i = int(i)
        except ValueError:
            continue
        genres[i] = genre.strip()
    return [genre for genre in genres if genre]
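# Hypothetical usage: print the first few genre names the scraper found.
for name in getgenres()[:5]:
    print name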
def searchUrl(url, level, searchText):
    # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return
    global urlList
    if url in urlList:    # prevent visiting the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return
    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    for scriptTag in soup.findAll('script'):
        scriptTag.extract()
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts)
    # search
    if text.find(searchText) > -1:
        print url
        print
    # if there are links on the webpage then recursively repeat
    if level > 0:
        for linkTag in soup.findAll('a'):
            try:
                searchUrl(linkTag['href'], level - 1, searchText)
            except:
                pass
def urlretrieve(url, filename, cache={}, lock=threading.Lock()):
    'Read contents of an open url, use etags and decompress if needed'
    request = urllib2.Request(url)
    #request.add_header('Cache-Control', 'no-cache')
    # Not expecting compressed files
    #request.add_header('Accept-Encoding', 'gzip')
    with lock:
        if ('etag ' + url) in cache:
            request.add_header('If-None-Match', cache['etag ' + url])
        if ('mod ' + url) in cache:
            request.add_header('If-Modified-Since', cache['mod ' + url])
    try:
        u = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        return Response(e.code, e.msg, False, False)
    content = u.read()
    u.close()
    compressed = u.info().getheader('Content-Encoding') == 'gzip'
    #if compressed:
    #    content = gzip.GzipFile(fileobj=cStringIO.StringIO(content), mode='rb').read()
    soup = BeautifulSoup(content)
    # Let's take HTML out! soup.body(text=True) returns a list of unicode.
    content = str(''.join(soup.body(text=True)))
    written = writefile(filename, content)
    with lock:
        etag = u.info().getheader('Etag')
        if etag:
            cache['etag ' + url] = etag
        timestamp = u.info().getheader('Last-Modified')
        if timestamp:
            cache['mod ' + url] = timestamp
    return Response(u.code, u.msg, compressed, written)
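# urlretrieve references a Response tuple and a writefile helper that aren't
# shown; assumed minimal stand-ins (not the originals) could be:
import collections
import os

Response = collections.namedtuple('Response',
                                  ['code', 'msg', 'compressed', 'written'])

def writefile(filename, content):
    # write the file and report whether anything actually changed
    if os.path.exists(filename) and open(filename, 'rb').read() == content:
        return False
    with open(filename, 'wb') as f:
        f.write(content)
    return True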
def handle(self, **options):
    since = get_last_change()
    writer = get_writer()
    try:
        while True:
            changes = settings.db.changes(since=since)
            since = changes["last_seq"]
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    continue
                if "type" in doc and doc["type"] == "page":
                    print "indexing", doc["url"]
                    soup = BeautifulSoup(doc["content"])
                    if soup.body is None:
                        continue
                    desc = soup.findAll('meta', attrs={"name": desc_re})
                    if len(desc) > 0 and desc[0]["content"] is not None:
                        description = unicode(desc[0]["content"])
                    else:
                        description = u""
                    # Fall back to the URL when there is no usable <title>;
                    # reusing the fallback below keeps a missing title from
                    # crashing the content join.
                    if soup.title is not None and len(soup.title(text=True)) > 0:
                        title = unicode(soup.title(text=True)[0])
                    else:
                        title = unicode(doc["url"])
                    writer.update_document(
                        title=title,
                        url=unicode(doc["url"]),
                        desc=description,
                        rank=doc["rank"],
                        content=title + u"\n" + unicode(doc["url"]) + u"\n" +
                                u"".join(soup.body(text=True)))
            writer.commit()
            writer = get_writer()
            set_last_change(since)
    finally:
        set_last_change(since)
def searchUrl(url, level, searchText, stayInOriginalRoot=False):
    # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return
    # if we want to stay within the root URL's path, return whenever we're not
    if stayInOriginalRoot:
        if url.find(rootUrl) == -1:
            return
    global urlList
    if url in urlList:    # prevent visiting the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return
    print "About to search " + url + " at level " + str(3 - level) + "\n\n"
    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    for scriptTag in soup.findAll('script'):
        scriptTag.extract()
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts)
    # search
    if text.find(searchText) > -1:
        print url
        print
        hitList.append(url)
    # if there are links on the webpage then recursively repeat
    if level > 0:
        for linkTag in soup.findAll('a'):
            try:
                linkUrl = linkTag['href']
                # FIXED: error when searching pages that are supposed to
                # concatenate onto the main address
                print "\n\n --------------------------------------"
                print 'Link url before = ' + linkUrl
                # Concatenate onto the address if it is a site-relative link
                if (linkUrl.find('/') == -1 or linkUrl.find('/') == 0) \
                        and linkUrl.find('.html') == -1:
                    # TODO still not perfect: should append the found string
                    # to the raw domain name that indexes the server's root
                    # directory
                    main_url = '.'.join(netloc)
                    if linkUrl.find('/') != 0:
                        main_url += '/'
                    linkUrl = 'http://' + main_url + linkUrl
                print 'Link url after = ' + linkUrl
                print 'Levels left before exit = ' + str(level)
                print "---------------------------------------- \n\n"
                searchUrl(linkUrl, level - 1, searchText, stayInOriginalRoot)
            except:
                pass
def test_email_diff_subtitles(self):
    initial_count = len(mail.outbox)
    # set a user who can receive notifications;
    # make sure we have a different author, else he won't get notified
    author = User(username='******', email='*****@*****.**',
                  notify_by_email=True, valid_email=True)
    author.save(send_email_confirmation=False)
    # bypass logic from hell
    author.valid_email = True
    author.save()
    # this is needed for the non_editor template check
    user2 = User(username='******', email='*****@*****.**',
                 notify_by_email=True, valid_email=True)
    user2.save(send_email_confirmation=False)
    # bypass logic from hell
    user2.valid_email = True
    user2.save()
    # version is identical to the previous one
    video, video_url = Video.add("http://wwww.example.com/video-diff.mp4", None)
    video.followers.add(author)
    video.followers.add(user2)
    language = SubtitleLanguage(video=video, language_code='en')
    language.save()
    subs_data = [
        [0, 1000, '1'],
        [1000, 2000, '2'],
    ]
    subtitles_1 = SubtitleSet.from_list('en', subs_data)
    old_version = language.add_version(subtitles=subtitles_1, author=author)
    # now we change the text on the second sub
    subs_data[1][2] = '2 changed'
    # add a regular sub
    subs_data.append([2000, 3000, 'new sub'])
    # add an unsynced one
    subs_data.append([None, None, 'no sync'])
    subtitles_2 = SubtitleSet.from_list('en', subs_data)
    new_version = language.add_version(subtitles=subtitles_2)
    self.assertTrue(len(video.notification_list()) > 0)
    res = send_new_version_notification(new_version.pk)
    self.assertNotEqual(res, None)
    # we expect two emails: one is the new-edits-non-editor,
    # the other for mail_notification.html
    self.assertEqual(len(mail.outbox), initial_count + 2)
    for email_number, email_msg in enumerate(mail.outbox):
        # make sure this is the right message
        self.assertIn("New edits to ", email_msg.subject)
        self.assertIn("video-diff.mp4", email_msg.subject)
        html = BeautifulSoup(email_msg.body)
        html_text = "".join(html.body(text=True)).replace("\n", "")
        if email_number == 0:
            # assert text and timing changes are correct
            self.assertIn('67% of the text', html_text)
            self.assertIn('33% of the timing was changed.', html_text)
            # find the listed text changes to make sure they match
            diff_table = html.findAll('table', attrs={'class': 'diffs'})[0]
            old_version_changes = []
            new_version_changes = []
            for i, node in enumerate(diff_table.findAll('td')):
                if i % 2 == 0:
                    old_version_changes.append(node.text)
                else:
                    new_version_changes.append(node.text)
            self.assertEqual(old_version_changes, [u'2', u'', u''])
            self.assertEqual(new_version_changes, [
                u'2 changed',
                u'new sub',
                u'no sync',
            ])
import urllib2
import urllib
import re
from BeautifulSoup import BeautifulSoup

starturl = "http://videolectures.net/site/list/events/"
baseurl = "http://videolectures.net/"
page = urllib2.urlopen(starturl)
soup = BeautifulSoup(page)

# get all the event names
# full version:
# soup.body(name='span', attrs={"class": "search_res"})
eventTagList = soup.body('span', 'search_res')
numEvents = len(eventTagList)

# eventInfoList will be a list of dicts;
# each dict has entries for event name, date, abbreviation, and url
eventInfoList = list()
for eventTag in eventTagList:
    relurl = eventTag.parent.attrs[0][1]    # the event abbreviation and url
    absurl = urllib.basejoin(baseurl, relurl)
    name = eventTag.next    # the event name
    name = name.encode('ascii', 'ignore')
    abbrev = relurl.split('/')[1].upper()
    date = eventTag.parent.parent.contents[5].contents[0].contents[0]    # the event date
    eventInfoList.append({"name": name, "abbrev": abbrev,
                          "date": date, "url": absurl})
#!/usr/bin/env python
"""Generate a .signature file."""

__author__ = 'Jed Frechette <*****@*****.**>'
__version__ = '0.1'
__date__ = '1 June 2007'

from BeautifulSoup import BeautifulSoup
from os import path
from urllib import urlopen

if __name__ == '__main__':
    dest = '/home/jdfrechette/briefcase'
    base_sig = ['Jed Frechette\n',
                'http://jdfrechette.alturl.com\n\n']
    soup = BeautifulSoup(urlopen('http://icasualties.org'))
    dead = soup.body('span', id='lblCount')[0].find('font').string
    wounded = soup.body('table', id='dgYear')[0]
    wounded = wounded.findAll('td')[-1].string
    tag = '%s Dead, %s Wounded' % (dead, wounded)
    sig = open(path.join(dest, '.signature'), 'w')
    sig.writelines(base_sig)
    sig.write(tag)
    sig.close()
for i in range(1, 9):
    if i != 6:    # section 6 is skipped
        # insert the path to the directory of interest
        path = "/Users/fyelles/Desktop/man-html-20111120/htmlman%s/" % str(i)
        dirList = os.listdir(path)
        for fname in dirList:
            if fname.endswith(".html"):
                content = False
                print "\nReading", fname
                newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
                f = open(path + fname, 'r')
                content = f.read()
                f.close()
                soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                c = ''.join(soup.body(text=True))
                f = open(path + newstring, 'w')
                # collapse runs of 3+ newlines down to a single blank line
                f.write((re.sub('\n{3,}', '\n\n', c)).encode('utf-8'))
                f.close()
                print newstring, " done !"

def main():
    pass

if __name__ == '__main__':
    main()
# strip scripts before extracting text
for script in soup.findAll('script'):
    script.extract()

# collect absolute links, skipping known binary types
for link in soup.findAll('a', href=True):
    if len(link['href']) > 9:
        if re.compile(r'^http').findall(link['href']):
            href = re.compile(r"/$").sub('', link['href'])
            ext = re.compile(r"\.").split(href.lower())[-1]
            if mime.has_key(ext):
                err = 1
            else:
                urls.append(href)

body = soup.body(text=True)
body = ' '.join(body)
body = convertAccents(body)
body = cleanHTML(body)
title = convertAccents(title)
title = cleanHTML(title)
try:
    body = unicodedata.normalize('NFKD', body).encode('ascii', 'ignore')
except:
    err = 1
try:
    title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
except:
    err = 1
body = re.compile(r'\n').sub(' ', body)
body = re.compile(r'[ ]+').sub(' ', body)
def checkUrl(url, use_proxy=True):
    if url is None or url == "":
        return {"status": 900, "reason": "BAD-URL"}
    o = urlparse(url)
    if not isResolvable(o.hostname):
        return {"status": 903, "reason": "UNRESOLVABLE"}
    try:
        if o.scheme == "https":
            if use_proxy:
                conn = httplib.HTTPSConnection("explorer.bl.uk", 3127, timeout=10)
            else:
                conn = httplib.HTTPSConnection(o.netloc, timeout=10)
        else:
            if use_proxy:
                conn = httplib.HTTPConnection("explorer.bl.uk", 3127, timeout=10)
            else:
                conn = httplib.HTTPConnection(o.netloc, timeout=10)
        # Rebuild the full path inc. params and query:
        fullpath = o.path
        if o.params:
            fullpath += ';' + o.params
        if o.query:
            fullpath += '?' + o.query
        # Now make the request (the proxy gets the absolute URL):
        if use_proxy:
            conn.request("GET", url)
        else:
            conn.request("GET", fullpath)
        res = conn.getresponse()
    except socket.timeout:
        return {"status": 924, "reason": "TIMEOUT"}
    except Exception as e:
        if str(e) == "[Errno 65] No route to host":
            return {"status": 903, "reason": "NOROUTE"}
        elif str(e) == "[Errno 51] Network is unreachable":
            return {"status": 903, "reason": "NETWORK-UNREACHABLE"}
        elif str(e) == "[Errno 61] Connection refused" \
                or str(e) == "[Errno 111] Connection refused":
            return {"status": 903, "reason": "CONNECTION-REFUSED"}
        elif str(e) == "[Errno 54] Connection reset by peer":
            return {"status": 903, "reason": "CONNECTION-RESET"}
        else:
            return {"status": 903, "reason": "CONNECTION-FAILED: " + str(e)}
    if res.status / 100 == 3:
        # Follow the redirect and tag the result so redirect chains are visible.
        state = checkUrl(res.getheader('location'))
        if not state['reason'].endswith("VIA-REDIRECT+"):
            state['reason'] += " VIA-REDIRECT+"
        return state
    elif res.status / 100 == 2:
        # Get a copy, hash it, get the title and ssdeep the text
        try:
            payload = res.read()
        except Exception as e:
            return {"status": 903, "reason": "READ-FAILED: " + str(e)}
        # Clean up and grab the text:
        title = ""
        text = ""
        first_fragment = ""
        fh = None
        try:
            soup = BeautifulSoup(payload,
                                 convertEntities=BeautifulSoup.HTML_ENTITIES)
            if soup.title is not None:
                title = normaliseText(soup.title.string)
            for elem in soup.findAll(['script', 'link', 'style']):
                elem.extract()
            for comment in soup.findAll(text=lambda t: isinstance(t, Comment)):
                comment.extract()
            if soup.body is not None:
                texts = [unicode(x) for x in soup.body(text=True)]
                text = soup.title.string + normaliseText(" ".join(texts))
                # Just pull out the first bit:
                first_fragment = text[:200]
                # Fuzzy hash
                if text != "":
                    fh = fuzzyHash(text)
        except:
            pass
        # And the binary hash:
        md5 = hashlib.md5(payload).hexdigest()
        # And return:
        return {"status": res.status, "reason": res.reason, "title": title,
                "first_fragment": first_fragment, "fh": fh, "md5": md5,
                "text": text}
    else:
        return {"status": res.status, "reason": res.reason}
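# checkUrl calls an isResolvable helper that isn't shown; a plausible minimal
# version (assumed, not the original) just asks the system resolver:
import socket

def isResolvable(hostname):
    if not hostname:
        return False
    try:
        socket.gethostbyname(hostname)
        return True
    except socket.error:
        return False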
def scrape_dk(self):
    ''' Scrape method for Digi-Key. '''
    # Clear previous pricing data (in case price break keys change)
    search_url = 'http://search.digikey.com/us/en/products/' + self.manufacturer_pn
    search_page = urllib2.urlopen(search_url)
    search_soup = BeautifulSoup(search_page)

    # Create a list of product URLs from the search page
    prod_urls = []
    search_table = search_soup.body('table', id="productTable")
    if len(search_table) > 0:
        product_table = search_table[0]
        # Find the tbody tag in the table and walk its rows
        tbody_tag = product_table.find('tbody')
        for row in tbody_tag.findAll('tr'):
            anchor = row.find('a')
            # DK uses a relative path for these links
            prod_urls.append('http://search.digikey.com' + anchor['href'])

    for url in prod_urls:
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)

        # Get prices
        prices = {}
        price_table = soup.body('table', id="pricing")
        if len(price_table) == 0:
            raise ScrapeException(VENDOR_DK, self.manufacturer_pn, 4)
        # price_table.contents[x] should be the tr tags...
        for tag in price_table:
            for row in tag:
                # row.contents should be td tags... except the first!
                if row == '\n':
                    pass
                elif row.contents[0].name == 'th':
                    pass
                else:
                    new_break_str = row.contents[0].string
                    # Remove commas
                    if new_break_str.isdigit() == False:
                        new_break_str = new_break_str.replace(",", "")
                    new_break = int(new_break_str)
                    new_unit_price = float(row.contents[1].string)
                    prices[new_break] = new_unit_price

        # Get inventory. If the item is out of stock, the <td> that normally
        # holds the quantity available will have a text input box that we
        # need to watch out for.
        inv_soup = soup.body('td', id="quantityavailable")
        if len(inv_soup[0].findAll('form')) > 0:
            inventory = 0
        else:
            inv_str = inv_soup[0].contents[0]
            if inv_str.isdigit() == False:
                inv_str = inv_str.replace(",", "")
            inventory = int(inv_str)

        vendor_pn = soup.body("th", text="Digi-Key Part Number")[0] \
            .parent.nextSibling.contents[0].string.__str__()

        # Get manufacturer and PN
        self.manufacturer = soup.body("th", text="Manufacturer")[0] \
            .parent.nextSibling.contents[0].contents[0].string.__str__()
        self.manufacturer_pn = soup.body('th', text="Manufacturer Part Number")[0] \
            .parent.nextSibling.contents[0].string.__str__()

        # Get datasheet filename and download
        datasheet_soup = soup.body('th', text="Datasheets")[0].parent.nextSibling
        datasheet_anchor = datasheet_soup.findAllNext('a')[0]
        self.datasheet_url = datasheet_anchor['href']
        row = urllib2.urlopen(urllib2.Request(self.datasheet_url))
        try:
            file_name = get_filename(url, row)
            self.datasheet = file_name
            # TODO: Do not re-download if already saved
            if DOWNLOAD_DATASHEET:
                with open(file_name, 'wb') as f:
                    shutil.copyfileobj(row, f)
        finally:
            row.close()

        # Get remaining strings (desc, category, family, series, package)
        self.description = soup.body('th', text="Description")[0] \
            .parent.nextSibling.contents[0].string.__str__()
        category = soup.body('th', text="Category")[0] \
            .parent.nextSibling.contents[0].string.__str__()
        family = soup.body('th', text="Family")[0] \
            .parent.nextSibling.contents[0].string.__str__()
        series = soup.body('th', text="Series")[0] \
            .parent.nextSibling.contents[0].string.__str__()
        self.package = soup.body('th', text="Package / Case")[0] \
            .parent.nextSibling.contents[0].string.__str__()

        packaging_soup = soup.body('th', text="Packaging")[0] \
            .parent.parent.nextSibling.contents[0]
        if type(packaging_soup) == NavigableString:
            packaging = packaging_soup.string.__str__()
        elif type(packaging_soup) == Tag:
            packaging = packaging_soup.contents[0].string.__str__()
        else:
            print 'Error: DK Packaging scrape failure!'
        if "Digi-Reel" in packaging:
            packaging = "Digi-Reel"    # Remove Restricted symbol

        key = VENDOR_DK + ': ' + vendor_pn + ' (' + packaging + ')'
        self.listings[key] = Listing(VENDOR_DK, vendor_pn,
                                     self.manufacturer_pn, prices,
                                     inventory, packaging)
        self.listings[key].category = category
        self.listings[key].family = family
        self.listings[key].series = series
        if "Digi-Reel" in packaging:
            self.listings[key].reel_fee = 7
for eventString in eventFile.readlines():
    eventurl = eventString.split('---')[-1].rstrip()
    eventAbbrev = eventString.split('---')[0]
    # e.g. eventurl = "http://videolectures.net/epsrcws08_sheffield/"
    page = urllib2.urlopen(eventurl)
    soup = BeautifulSoup(page)

    # get all the video page thumbnails
    # full version:
    # soup.body(name='div', attrs={"class": "lec_thumb_click"})
    vidPageList = soup.body('div', 'lec_thumb_click')
    numVidPages = len(vidPageList)

    # vidPageInfoList will be a list of dicts;
    # each dict has entries for vidpage title and vidpage url
    vidPageInfoList = list()
    for vidPage in vidPageList:
        relurl = vidPage.next.attrs[0][1]
        absurl = urllib.basejoin(baseurl, relurl)
        page = urllib2.urlopen(absurl)
        soup = BeautifulSoup(page)
        title = soup.head.title.string
        title = title.encode('ascii', 'ignore')
def get_text(self):
    bs = BeautifulSoup(self.data,
                       convertEntities=BeautifulSoup.HTML_ENTITIES)
    return ''.join(bs.body(text=True))
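# The one-liner in get_text is the idiom shared by most snippets here;
# a self-contained version for quick experiments (BeautifulSoup 3):
from BeautifulSoup import BeautifulSoup

def html_to_text(markup):
    soup = BeautifulSoup(markup,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    # soup.body(text=True) yields the body's text nodes as unicode strings
    return ''.join(soup.body(text=True))

print html_to_text('<html><body><p>Hello &amp; goodbye</p></body></html>')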