Example #1
    def _extractTextFromHtml(self, content, webPage):
        from BeautifulSoup import BeautifulSoup
        import re
        import urlparse

        from consider.rules import inputrules
        from consider import diff

        unprocessedSoup = BeautifulSoup(''.join(content))

        soup = BeautifulSoup(unprocessedSoup.prettify())
        
        tagsToStrip = ['script', 'style', 'menu']
        for currentTag in tagsToStrip:
            junkTags = soup.body.findAll(currentTag)
            [junkSection.extract() for junkSection in junkTags]
        
        stylesToStrip = ['display:none', 'display: none']
        for currentStyle in stylesToStrip:
            junk = soup.body.findAll(style=currentStyle)
            [junkSection.extract() for junkSection in junk]

        hostname = urlparse.urlparse(webPage).hostname
        for rule in inputrules.nameRules:
            result = re.search(rule, hostname)
            if result:
                soup = inputrules.nameRules[rule](soup)

        processedContent = soup.body(text = True)

        processedContent = [diff.unescapeEntities(line) for line in processedContent]

        return processedContent
Example #2
def parse_text(text):
	soup = BeautifulSoup(text)
	[tag.extract() for tag in soup.findAll({'script':True})]
	[tag.extract() for tag in soup.findAll(text=lambda x: isinstance(x, Comment))]
	words = filter(lambda x: len(x) > 0,[wordify(s) for s in soup.body(text=True)])
	words = reduce(reduce_func,words)
	return words
Example #3
def parse_text(text):
	soup = BeautifulSoup(text)
	map(lambda tag: tag.extract(),soup.findAll({'script':True}))
	map(lambda tag: tag.extract(),soup.findAll(text=lambda x: isinstance(x, Comment)))
	words = filter(lambda x: len(x) > 0,map(wordify,soup.body(text=True)))
	words = reduce(reduce_func,words)
	return words
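Both parse_text variants assume a couple of helpers that are not shown here. A minimal sketch of what they might look like, where only the names wordify and reduce_func come from the snippets above and the bodies are assumptions:

import re
from BeautifulSoup import BeautifulSoup, Comment

def wordify(s):
    # split a text node into lowercase words, dropping punctuation
    return re.findall(r'[a-z0-9]+', s.lower())

def reduce_func(acc, words):
    # fold the per-node word lists into one flat list
    return acc + words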
Example #4
def searchUrl(url, searchText, caseSensitive):

    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc
    netlocSplit = netloc.split('.')
    if netlocSplit[-2] + netlocSplit[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return

    # if not an HTML file then return
    if urlContent.find('<html') == -1 and urlContent.find('<HTML') == -1:
        return

    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    c=soup.findAll('script')
    for i in c:
        i.extract() 
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts) 

    # search
    if caseSensitive:
        if text.find(searchText) > -1:
            print url
            print
    else:
        if text.lower().find(searchText.lower()) > -1:
            print url
            print

    # if there are links on the webpage then recursively repeat
    linkTags = soup.findAll('a')

    for linkTag in linkTags:
        try:
            linkUrl = linkTag['href']
            # if relative URL then convert to absolute
            if urlparse.urlsplit(linkUrl).scheme == '':
                linkUrl = urlparse.urlsplit(url).scheme + '://' + netloc + '/' + linkUrl

            searchUrl(linkUrl, searchText, caseSensitive)
        except:
            pass
Example #5
def searchUrl(url, searchText, caseSensitive):

    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc
    netlocSplit = netloc.split('.')
    if netlocSplit[-2] + netlocSplit[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return

    # if not an HTML file then return
    if urlContent.find('<html') == -1 and urlContent.find('<HTML') == -1:
        return

    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    c=soup.findAll('script')
    for i in c:
        i.extract()
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts)

    # search
    if caseSensitive:
        if text.find(searchText) > -1:
            print url
            print
    else:
        if text.lower().find(searchText.lower()) > -1:
            print url
            print

    # if there are links on the webpage then recursively repeat
    linkTags = soup.findAll('a')

    for linkTag in linkTags:
        try:
            linkUrl = linkTag['href']
            # if relative URL then convert to absolute
            if urlparse.urlsplit(linkUrl).scheme == '':
                linkUrl = urlparse.urlsplit(url).scheme + '://' + netloc + '/' + linkUrl

            searchUrl(linkUrl, searchText, caseSensitive)
        except:
            pass
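Both copies of searchUrl above rely on two module-level globals and the usual imports, none of which are shown. A minimal driver, with the seed URL and the domain value chosen here purely for illustration, might look like this:

import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

# State the crawler expects: the target domain with its last two labels
# concatenated (matching the netlocSplit[-2] + netlocSplit[-1] check),
# and the list of URLs already visited.
website = 'examplecom'
urlList = []

if __name__ == '__main__':
    searchUrl('http://www.example.com/', 'search term', caseSensitive=False)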
Example #6
def parse_index_html(limit=100):
    url = 'http://ascl.net/code/all/page/1/limit/{0}/order/date/listmode/compact/dir/desc'.format(
        limit)
    parsed_html = BeautifulSoup(urllib2.urlopen(url))
    return ((i.find('span', attrs={
        'class': 'ascl_id'
    }).text, i.find('span', attrs={
        'class': 'title'
    }).find('a')['href'][1:])
            for i in parsed_html.body('div', attrs={'class': 'item'}))
Example #7
def extract_content(url):
    # XXX for now just the body text
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read())
    body = soup.body(text=True)

    # XXX maybe should let Xapian do this.
    body = [line.strip() for line in body if line.strip() not in ('\n', '')]
    body = ' '.join(body)
    body = body.replace('\n', '')
    body = body.encode('utf8')
    return body, soup.title.text
Example #8
def RunConversion():
    global DBlist, DBdict
    path = "manpages/"
    dirList = os.listdir(path)

    for fname in dirList:
        if fname.endswith(".html"):
            DBdict = dict()
            content = False
            print "\nReading", fname
            newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
            f = open(path + fname, 'r')
            content = f.read()  #NAME
            f.close()
            if content:
                #            if content :
                try:
                    content = (re.sub(".*[M|n]an.*converted.*", "", content))
                    content = (re.sub(".*man2html.*", "", content))
                    soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                    c = ''.join(soup.body(text=True))
                    f = open(path + newstring, 'w')
                    towrite = c.encode('utf-8')
                    cleandata = re.search("(\w+\(.*)", towrite, re.S).group(1)

                    DBdict['name'] = fname.split('.')[
                        0][:-1] + "(" + fname.split('.')[0][-1:] + ")".strip()
                    DBdict['cleandata'] = cleandata.strip()
                    if re.search("NAME\n(.*)\n", cleandata, re.S):
                        DBdict['header'] = re.search("NAME\n(.+?)\n",
                                                     cleandata,
                                                     re.S).group(1).strip()
                    else:
                        DBdict['header'] = fname.split('.')[0][:-1]
                    DBlist.append(DBdict)

                    f.write(cleandata)
                    f.close()
                    print newstring, " done !"
                except TypeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write(
                        str("\tError " + fname + " - " + str(e) + "\n"))
                except UnicodeEncodeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write(
                        str("\t\tError " + fname + " - " + str(e) + "\n"))
                except AttributeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write(
                        str("\t\t\tError " + fname + " - " + str(e) + "\n"))
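RunConversion refers to BSoup, DBlist, DBdict and an ErrorFile handle that are defined elsewhere. A sketch of the surrounding module state it presumably expects (the error-log filename is an assumption):

import os
import re
from BeautifulSoup import BeautifulSoup as BSoup

# Module-level state the conversion loop appends to and logs into.
DBlist = []
DBdict = {}
ErrorFile = open('conversion_errors.log', 'w')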
Example #9
    def login(self):
        home_page_url = self.base_url() + '/en/'
        logging.info('fetching home page from %s', home_page_url)
        home_page_content = self.fetcher(home_page_url, deadline=10).content
        home_page = BeautifulSoup(home_page_content)

        login_url = None
        login_anchor = home_page.body.find('a', attrs={'id': 'myAccount'})
        if login_anchor:
            login_url = login_anchor['href'].strip()

        if not login_url:
            self.raise_login_error("can't find login url on home page")

        logging.info('fetching login page from %s', login_url)
        login_page_content = self.fetcher(login_url, deadline=10).content
        login_page = BeautifulSoup(login_page_content)

        login_form = login_page.body('form', attrs={'id': 'loginPageForm'})[0]

        if not login_form:
            self.raise_login_error("can't find login form on login page")

        form_fields = {}

        for input_field in login_page.findAll(name='input'):
            if input_field['type'] == 'submit':
                form_fields['submit'] = input_field['name']
            else:
                form_fields[input_field['name']] = input_field.get('value', '')

        form_fields.update({
            'j_username': self.card.number,
            'j_password': self.card.pin,
        })

        submit_login_url = urlparse.urljoin(login_url, login_form['action'])
        logging.info('submitting login information to %s', submit_login_url)

        login_response = self.fetcher(submit_login_url, form_fields)
        login_response_content = login_response.content

        redirect_to_url = re.search("RedirectAfterLogin\('([^']+)'\)",
                                    login_response_content)
        if not redirect_to_url:
            self.raise_login_error("Can't find redirect. Login failed.")

        logging.info('redirecting to %s', redirect_to_url.group(1))
        return redirect_to_url.group(1)
Example #10
    def login(self):
        home_page_url = self.base_url() + '/en/'
        logging.info('fetching home page from %s', home_page_url)
        home_page_content = self.fetcher(home_page_url, deadline=10).content
        home_page = BeautifulSoup(home_page_content)

        login_url = None
        login_anchor = home_page.body.find('a', attrs={'id': 'myAccount'})
        if login_anchor:
            login_url = login_anchor['href'].strip()

        if not login_url:
            self.raise_login_error("can't find login url on home page")

        logging.info('fetching login page from %s', login_url)
        login_page_content = self.fetcher(login_url, deadline=10).content
        login_page = BeautifulSoup(login_page_content)

        login_form = login_page.body('form', attrs={'id': 'loginPageForm'})[0]

        if not login_form:
            self.raise_login_error("can't find login form on login page")

        form_fields = {}

        for input_field in login_page.findAll(name='input'):
            if input_field['type'] == 'submit':
                form_fields['submit'] = input_field['name']
            else:
                form_fields[input_field['name']] = input_field.get('value', '')

        form_fields.update({
            'j_username': self.card.number,
            'j_password': self.card.pin,
        })

        submit_login_url = urlparse.urljoin(login_url, login_form['action'])
        logging.info('submitting login information to %s', submit_login_url)

        login_response = self.fetcher(submit_login_url, form_fields)
        login_response_content = login_response.content

        redirect_to_url = re.search("RedirectAfterLogin\('([^']+)'\)", login_response_content)
        if not redirect_to_url:
            self.raise_login_error("Can't find redirect. Login failed.")

        logging.info('redirecting to %s', redirect_to_url.group(1))
        return redirect_to_url.group(1)
Example #11
 def getgenres():
     from BeautifulSoup import BeautifulSoup
     from urllib import urlopen
     url = 'http://www.multimediasoft.com/amp3dj/help/amp3dj_00003e.htm'
     soup = BeautifulSoup(urlopen(url))
     genres = [None for _ in xrange(256)]
     for div in soup.body('div', 's0'):
         val = div.renderContents().replace('\xc2\xa0', ' ')
         val = val.replace('&nbsp;', ' ').replace('&amp;', '&')
         try:
             i, genre = val.split('-', 1)
             i = int(i)
         except ValueError:
             continue
         genres[i] = genre.strip()
     return [genre for genre in genres if genre]
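A quick usage note for getgenres: unparsable entries are dropped, so the returned list is shorter than the 256 slots it starts from. A trivial call might print the first few names:

if __name__ == '__main__':
    for genre in getgenres()[:5]:
        print genre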
Example #12
def RunConversion():
    global DBlist, DBdict
    path="manpages/"
    dirList=os.listdir(path)
    
    for fname in dirList:
        if fname.endswith(".html"):
            DBdict = dict()
            content = False
            print "\nReading",fname
            newstring='.'.join(fname.split('.')[0:-1])+'.txt'
            f = open(path+fname, 'r')
            content  =  f.read() #NAME
            f.close()
            if content:
#            if content :
                try :
                    content = (re.sub(".*[M|n]an.*converted.*","",content))    
                    content = (re.sub(".*man2html.*","",content))    
                    soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                    c =  ''.join(soup.body(text=True))
                    f = open(path+newstring, 'w')
                    towrite = c.encode('utf-8')
                    cleandata = re.search("(\w+\(.*)",towrite,re.S).group(1)
                    
                    DBdict['name'] = fname.split('.')[0][:-1] + "(" + fname.split('.')[0][-1:] + ")".strip()
                    DBdict['cleandata'] = cleandata.strip()
                    if re.search("NAME\n(.*)\n",cleandata,re.S):       
                        DBdict['header'] =  re.search("NAME\n(.+?)\n",cleandata,re.S).group(1).strip()
                    else:
                        DBdict['header'] = fname.split('.')[0][:-1]
                    DBlist.append(DBdict)
                    
                    f.write(cleandata)
                    f.close()
                    print newstring, " done !"
                except TypeError, e :
                    print "*"*100, "Error", fname
                    ErrorFile.write(str("\tError " + fname+" - "+ str(e) +"\n"))
                except UnicodeEncodeError, e :
                    print "*"*100, "Error", fname
                    ErrorFile.write(str("\t\tError " + fname+" - "+ str(e) +"\n"))
                except AttributeError, e :
                    print "*"*100, "Error", fname
                    ErrorFile.write(str("\t\t\tError " + fname+" - "+ str(e) +"\n"))
Example #13
def searchUrl(url, level, searchText): # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return

    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    c=soup.findAll('script')
    for i in c:
        i.extract() 
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts) 

    # search
    if text.find(searchText) > -1:
        print url
        print

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    searchUrl(linkUrl, level - 1, searchText)
                except:
                    pass
Example #14
def searchUrl(url, level, searchText):  # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return

    global urlList
    if url in urlList:  # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return

    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    c = soup.findAll('script')
    for i in c:
        i.extract()
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts)

    # search
    if text.find(searchText) > -1:
        print url
        print

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    searchUrl(linkUrl, level - 1, searchText)
                except:
                    pass
Example #15
def urlretrieve(url, filename, cache={}, lock=threading.Lock()):
    'Read contents of an open url, use etags and decompress if needed'    
    request = urllib2.Request(url)
    #request.add_header('Cache-Control', 'no-cache')
    # Not expecting compressed files
    #request.add_header('Accept-Encoding', 'gzip')
    with lock:
        if ('etag ' + url) in cache:
            request.add_header('If-None-Match', cache['etag ' + url])
        if ('mod ' + url) in cache:
            request.add_header('If-Modified-Since', cache['mod ' + url])

    try:
        u = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        return Response(e.code, e.msg, False, False)
    content = u.read()
    u.close()

    compressed = u.info().getheader('Content-Encoding') == 'gzip'
    #if compressed:                                       
    #    content = gzip.GzipFile(fileobj=cStringIO.StringIO(content), mode='rb').read()
    #else:

    soup = BeautifulSoup(content)
    # Let's take HTML out! soup.body(text=True) returns this as a list of **unicode**
    content = str(''.join(soup.body(text=True)))

    written = writefile(filename, content) 

    with lock:
        etag = u.info().getheader('Etag')
        if etag:
            cache['etag ' + url] = etag
        timestamp = u.info().getheader('Last-Modified')
        if timestamp:
            cache['mod ' + url] = timestamp

    return Response(u.code, u.msg, compressed, written)
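urlretrieve above leans on a Response type and a writefile helper defined elsewhere in its module. Minimal stand-ins, hypothetical but matching how the snippet uses them, could be:

import collections

# Response(code, msg, compressed, written) as returned by urlretrieve.
Response = collections.namedtuple('Response', 'code msg compressed written')

def writefile(filename, content):
    # write the extracted text to disk; report whether anything was written
    with open(filename, 'w') as f:
        f.write(content)
    return len(content) > 0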
Example #16
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()
        try:
            while True:
                changes = settings.db.changes(since=since)
                since = changes["last_seq"]
                for changeset in changes["results"]:
                    try:
                        doc = settings.db[changeset["id"]]
                    except couchdb.http.ResourceNotFound:
                        continue
                    if "type" in doc and doc["type"] == "page":
                        print "indexing", doc["url"]
                        soup = BeautifulSoup(doc["content"])
                        if soup.body is None:
                            continue

                        desc = soup.findAll('meta', attrs={"name": desc_re})

                        writer.update_document(
                            title=unicode(soup.title(
                                text=True)[0]) if soup.title is not None
                            and len(soup.title(text=True)) > 0 else doc["url"],
                            url=unicode(doc["url"]),
                            desc=unicode(desc[0]["content"]) if len(desc) > 0
                            and desc[0]["content"] is not None else u"",
                            rank=doc["rank"],
                            content=unicode(
                                soup.title(text=True)[0] + "\n" + doc["url"] +
                                "\n" + "".join(soup.body(text=True))))

                    writer.commit()
                    writer = get_writer()

                set_last_change(since)
        finally:
            set_last_change(since)
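The desc_re value used to look up the meta description tag is not shown in the handler. A plausible definition, chosen only as an assumption to fit the findAll('meta', attrs={"name": desc_re}) call, is a case-insensitive regular expression:

import re

# Matches <meta name="description" ...> regardless of case; hypothetical.
desc_re = re.compile('^description$', re.IGNORECASE)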
Example #17
def searchUrl(url, level, searchText, stayInOriginalRoot = False): # the root URL is level 0

    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return

    # if we want to stay within the root URL's path, return whenever we are not
    if stayInOriginalRoot:
        if url.find(rootUrl) == -1:
            return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return

    print "About to search " + url + " at level " + str(3-level) + "\n\n"

    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags
    c=soup.findAll('script')
    for i in c:
        i.extract()
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts)

    # search
    if text.find(searchText) > -1:
        print url
        print
        hitList.append(url)

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:

                    linkUrl = linkTag['href']  # FIXED: error when searching pages that are supposed to concatenate onto main address!!
                    # Debug part 1
                    print "\n\n --------------------------------------"
                    print 'Link url before = ' + linkUrl

                    # Concatenate onto address if extended link
                    if (linkUrl.find('/') == -1 or linkUrl.find('/') == 0) and linkUrl.find('.html') == -1: #TODO still not perfect, because should append found string to the raw domain name that indexes server's root directory
                        main_url = ''
                        for n in netloc:
                            main_url = main_url + n + '.'

                        if linkUrl.find('/') == 0 :
                            main_url = main_url[0:-1]
                        else :
                            main_url = main_url[0:-1] + '/'

                        linkUrl = 'http://' + main_url + linkUrl

                    # Debug part 2
                    print 'Link url after = ' + linkUrl
                    print 'Levels left before exit = ' + str(level)
                    print "---------------------------------------- \n\n"

                    searchUrl(linkUrl, level - 1, searchText, stayInOriginalRoot)

                except:
                    pass
Example #18
import urllib2
import urllib
import re
from BeautifulSoup import BeautifulSoup

starturl="http://videolectures.net/site/list/events/"
baseurl="http://videolectures.net/"
page = urllib2.urlopen(starturl)
soup = BeautifulSoup(page)

#get all the event names
#soup.body(name='span',attrs={"class":"search_res"})
eventTagList=soup.body('span','search_res')

numEvents=len(eventTagList)
eventInfoList= list()
#each dict will have entries for name, date, abbr, url

for eventTag in eventTagList:
    name=eventTag.next #the event name
    relurl=eventTag.parent.attrs[0][1] #the event abbreviation, and url
    absurl=urllib.basejoin(baseurl,relurl)
    abbrev=relurl.split('/')[1].upper()    
    date=eventTag.parent.parent.contents[5].contents[0].contents[0] #the event date
    eventInfoList.append({"name":name,"abbrev":abbrev,"date":date,"url":absurl})
    


#get all the event dates
soup.body(name='span',attrs={"class":"text_bold"})
soup.body('span','text_bold')
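The event link is read above positionally via eventTag.parent.attrs[0][1]. A small, equivalent variation (assuming the parent tag is the <a> that carries the link) is to look the attribute up by name, which reads more clearly and does not depend on attribute order:

import urllib
import urllib2
from BeautifulSoup import BeautifulSoup

baseurl = "http://videolectures.net/"
soup = BeautifulSoup(urllib2.urlopen("http://videolectures.net/site/list/events/"))
for eventTag in soup.body('span', 'search_res'):
    # look the href up by name instead of by position in attrs
    relurl = eventTag.parent['href']
    print urllib.basejoin(baseurl, relurl)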
Example #19
    def test_email_diff_subtitles(self):
        initial_count = len(mail.outbox)
        # set a user who can receive notification
        # make sure we have a different author, else he won't get notified
        author = User(username='******',
                      email='*****@*****.**',
                      notify_by_email=True,
                      valid_email=True)
        author.save(send_email_confirmation=False)
        # bypass logic from hell
        author.valid_email = True
        author.save()

        # this is needed for the non_editor template check
        user2 = User(username='******',
                     email='*****@*****.**',
                     notify_by_email=True,
                     valid_email=True)
        user2.save(send_email_confirmation=False)
        # bypass logic from hell
        user2.valid_email = True
        user2.save()
        # version is identical to previous one
        video, video_url = Video.add("http://wwww.example.com/video-diff.mp4",
                                     None)
        video.followers.add(author)
        video.followers.add(user2)

        language = SubtitleLanguage(video=video, language_code='en')
        language.save()
        subs_data = [
            [0, 1000, '1'],
            [1000, 2000, '2'],
        ]

        subtitles_1 = SubtitleSet.from_list('en', subs_data)
        old_version = language.add_version(subtitles=subtitles_1,
                                           author=author)

        # now we change the text on the second sub
        subs_data[1][2] = '2 changed'
        # add a regular sub
        subs_data.append([2000, 3000, 'new sub'])
        # add an unsynced sub
        subs_data.append([None, None, 'no sync'])
        subtitles_2 = SubtitleSet.from_list('en', subs_data)
        new_version = language.add_version(subtitles=subtitles_2)
        self.assertTrue(len(video.notification_list()) > 0)

        res = send_new_version_notification(new_version.pk)
        self.assertNotEqual(res, None)
        # we expect two emails, one is the new-edits-non-editor, and
        # the other for mail_notification.html
        self.assertEqual(len(mail.outbox), initial_count + 2)
        for email_number, email_msg in enumerate(mail.outbox):
            # make sure this is the right message
            self.assertIn("New edits to ", email_msg.subject)
            self.assertIn("video-diff.mp4", email_msg.subject)
            html = BeautifulSoup(email_msg.body)
            html_text = "".join(html.body(text=True)).replace("\n", "")
            if email_number == 0:
                # assert text and timing changes are correct
                self.assertIn('67% of the text', html_text)
                self.assertIn('33% of the timing was changed.', html_text)
            # find the listed text changes to make sure they match
            diff_table = html.findAll('table', attrs={'class': 'diffs'})[0]
            old_version_changes = []
            new_version_changes = []
            for i, node in enumerate(diff_table.findAll('td')):
                if i % 2 == 0:
                    old_version_changes.append(node.text)
                else:
                    new_version_changes.append(node.text)
            self.assertEqual(old_version_changes, [u'2', u'', u''])
            self.assertEqual(new_version_changes, [
                u'2 changed',
                u'new sub',
                u'no sync',
            ])
Example #20
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()
        try:
            while True:
                changes = settings.db.changes(since=since)
                since = changes["last_seq"]
                for changeset in changes["results"]:
                    try:
                        doc = settings.db[changeset["id"]]
                    except couchdb.http.ResourceNotFound:
                        continue
                    if "type" in doc and doc["type"] == "page":
                        print "indexing", doc["url"]
                        soup = BeautifulSoup(doc["content"])
                        if soup.body is None:
                            continue

                        desc = soup.findAll('meta', attrs={ "name": desc_re })

                        writer.update_document(
                                title=unicode(soup.title(text=True)[0]) if soup.title is not None and len(soup.title(text=True)) > 0 else doc["url"],
                                url=unicode(doc["url"]),
                                desc=unicode(desc[0]["content"]) if len(desc) > 0 and desc[0]["content"] is not None else u"",
                                rank=doc["rank"],
                                content=unicode(soup.title(text=True)[0] + "\n" + doc["url"] + "\n" + "".join(soup.body(text=True)))
                            )

                    writer.commit()
                    writer = get_writer()

                set_last_change(since)
        finally:
            set_last_change(since)
Example #21
import urllib2
import urllib
import re
from BeautifulSoup import BeautifulSoup

starturl="http://videolectures.net/site/list/events/"
baseurl="http://videolectures.net/"
page = urllib2.urlopen(starturl)
soup = BeautifulSoup(page)

#get all the event names
#full version: 
#soup.body(name='span',attrs={"class":"search_res"})
eventTagList=soup.body('span','search_res')

numEvents=len(eventTagList)
eventInfoList= list()
#eventInfoList will be a list of dicts
#each dict will have entries for event name, date, abbr, and url

for eventTag in eventTagList:
    relurl=eventTag.parent.attrs[0][1] #the event abbreviation, and url
    absurl=urllib.basejoin(baseurl,relurl)


    name=eventTag.next #the event name
    name=name.encode('ascii','ignore')
    abbrev=relurl.split('/')[1].upper()    
    date=eventTag.parent.parent.contents[5].contents[0].contents[0] #the event date
    eventInfoList.append({"name":name,"abbrev":abbrev,"date":date,"url":absurl})
Example #22
#!/usr/bin/env python
"""Generate a .signature file."""

__author__ = 'Jed Frechette <*****@*****.**>'
__version__ = '0.1'
__date__ = '1 June 2007'

from BeautifulSoup import BeautifulSoup
from os import path
from urllib import urlopen

if __name__ == '__main__':
    dest = '/home/jdfrechette/briefcase'
    base_sig = ['Jed Frechette\n', 'http://jdfrechette.alturl.com\n\n']

    soup = BeautifulSoup(urlopen('http://icasualties.org'))
    dead = soup.body('span', id='lblCount')[0].find('font').string
    wounded = soup.body('table', id='dgYear')[0]
    wounded = wounded.findAll('td')[-1].string
    tag = '%s Dead, %s Wounded' % (dead, wounded)

    sig = open(path.join(dest, '.signature'), 'w')
    sig.writelines(base_sig)
    sig.write(tag)
    sig.close()
Example #23
# #f.write(a.replace('\n{3,}', '\n').encode('utf-8'))
#
# f.close()
for i in range(1, 9):
    if i != 6:
        path = "/Users/fyelles/Desktop/man-html-20111120/htmlman%s/" % (
            str(i))  # insert the path to the directory of interest
        dirList = os.listdir(path)
        for fname in dirList:
            if fname.endswith(".html"):
                content = False
                print "\nReading", fname
                newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
                f = open(path + fname, 'r')
                content = f.read()
                f.close()
                soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                c = ''.join(soup.body(text=True))
                f = open(path + newstring, 'w')
                f.write((re.sub('\n{3,}', '\n\n', c)).encode('utf-8'))
                f.close()
                print newstring, " done !"


def main():
    pass


if __name__ == '__main__':
    main()
Example #24
 for script in soup.findAll('script'):
   script.extract()
 for link in soup.findAll('a', href=True):
   if len(link['href']) > 9:
     pat = re.compile(r'^http').findall(link['href'])
     if pat:
       href=re.compile(r"/$").sub('',link['href'])
       temp=re.compile(r"\.").split( href.lower())
       size = len(temp)
       size = size -1
       ext=temp[size]
       if mime.has_key(ext):
         err=1
       else:
         urls.append(href)
 body = soup.body(text=True)
 body = ' '.join(body)
 body=convertAccents(body)
 body=cleanHTML(body)
 title=convertAccents(title)
 title=cleanHTML(title)
 try:
   body=unicodedata.normalize('NFKD',body).encode('ascii', 'ignore')
 except:
   err=1
 try:
   title=unicodedata.normalize('NFKD',title).encode('ascii', 'ignore')
 except:
   err=1
 body=re.compile(r'\n').sub(' ',body)
 body=re.compile(r'[ ]+').sub(' ',body)
Example #25
import urllib2
import urllib
import re
from BeautifulSoup import BeautifulSoup

starturl="http://videolectures.net/site/list/events/"
baseurl="http://videolectures.net/"
page = urllib2.urlopen(starturl)
soup = BeautifulSoup(page)

#get all the event names
#full version: 
#soup.body(name='span',attrs={"class":"search_res"})
eventTagList=soup.body('span','search_res')

numEvents=len(eventTagList)
eventInfoList= list()
#eventInfoList will be a list of dicts
#each dict will have entries for event name, date, abbr, and url

for eventTag in eventTagList:
    relurl=eventTag.parent.attrs[0][1] #the event abbreviation, and url
    absurl=urllib.basejoin(baseurl,relurl)

    name=eventTag.next #the event name
    name=name.encode('ascii','ignore')
    abbrev=relurl.split('/')[1].upper()    
    date=eventTag.parent.parent.contents[5].contents[0].contents[0] #the event date
    eventInfoList.append({"name":name,"abbrev":abbrev,"date":date,"url":absurl})

    page=urllib2.urlopen(absurl)
Example #26
def checkUrl(url, use_proxy=True):
    if url is None or url == "":
        return {"status": 900, "reason": "BAD-URL"}

    o = urlparse(url)

    resolvable = isResolvable(o.hostname)
    if not resolvable:
        return {"status": 903, "reason": "UNRESOLVABLE"}

    try:
        if o.scheme == "https":
            if use_proxy:
                conn = httplib.HTTPSConnection("explorer.bl.uk",
                                               3127,
                                               timeout=10)
            else:
                conn = httplib.HTTPSConnection(o.netloc, timeout=10)
        else:
            if use_proxy:
                conn = httplib.HTTPConnection("explorer.bl.uk",
                                              3127,
                                              timeout=10)
            else:
                conn = httplib.HTTPConnection(o.netloc, timeout=10)

        # Rebuild the full path inc. query etc.
        fullpath = o.path
        if o.params:
            fullpath += ';' + o.params
        if o.query:
            fullpath += '?' + o.query

        # Now make the request:
        if use_proxy:
            conn.request("GET", url)
        else:
            conn.request("GET", fullpath)
        res = conn.getresponse()
    except socket.timeout:
        return {"status": 924, "reason": "TIMEOUT"}
    except Exception as e:
        if str(e) == "[Errno 65] No route to host":
            return {"status": 903, "reason": "NOROUTE"}
        elif str(e) == "[Errno 51] Network is unreachable":
            return {"status": 903, "reason": "NETWORK-UNREACHABLE"}
        elif str(e) == "[Errno 61] Connection refused" or str(
                e) == "[Errno 111] Connection refused":
            return {"status": 903, "reason": "CONNECTION-REFUSED"}
        elif str(e) == "[Errno 54] Connection reset by peer":
            return {"status": 903, "reason": "CONNECTION-RESET"}
        else:
            return {"status": 903, "reason": "CONNECTION-FAILED: " + str(e)}

    if res.status / 100 == 3:
        location = res.getheader('location')
        state = checkUrl(location)
        status = state['status']
        reason = state['reason']
        if reason.endswith("VIA-REDIRECT+"):
            return state
        else:
            state['reason'] = reason + " VIA-REDIRECT+"
            return state
    elif res.status / 100 == 2:
        # Get a copy, hash it, get the title and ssdeep the text
        try:
            payload = res.read()
        except Exception as e:
            return {"status": 903, "reason": "READ-FAILED: " + str(e)}

        # Clean up and grab the text:
        title = ""
        text = ""
        first_fragment = ""
        fh = None
        try:
            soup = BeautifulSoup(payload,
                                 convertEntities=BeautifulSoup.HTML_ENTITIES)
            if soup.title != None:
                title = normaliseText(soup.title.string)
            [
                elem.extract()
                for elem in soup.findAll(['script', 'link', 'style'])
            ]
            comments = soup.findAll(
                text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]
            if soup.body != None:
                texts = [unicode(x) for x in soup.body(text=True)]
                text = soup.title.string + normaliseText(" ".join(texts))
            # Just pull out the first bit:
            first_fragment = text[:200]
            # Fuzzy hash
            if text != "":
                fh = fuzzyHash(text)
        except:
            pass
        # And the binary hash:
        md5 = hashlib.md5(payload).hexdigest()
        # And return:
        return {
            "status": res.status,
            "reason": res.reason,
            "title": title,
            "first_fragment": first_fragment,
            "fh": fh,
            "md5": md5,
            "text": text
        }
    else:
        return {"status": res.status, "reason": res.reason}
Example #27
	def scrape_dk(self):
		''' Scrape method for Digikey. '''
		# Clear previous pricing data (in case price break keys change)
		search_url = 'http://search.digikey.com/us/en/products/' + self.manufacturer_pn
		search_page = urllib2.urlopen(search_url)
		search_soup = BeautifulSoup(search_page)
		
		# Create a list of product URLs from the search page
		prod_urls = []
		search_table = search_soup.body('table', id="productTable")
		if len(search_table) > 0:
			product_table = search_table[0]
			#print 'product_table: \n', product_table
			#print 'product_table.contents: \n', product_table.contents
			
			# Find tbody tag in table
			tbody_tag = product_table.find('tbody')
			#print 'tbody: \n', type(tbody_tag), tbody_tag
			#print 'tbody.contents: \n', type(tbody_tag.contents), tbody_tag.contents
			#print 'tbody.contents[0]: \n', type(tbody_tag.contents[0]), tbody_tag.contents[0]
			prod_rows = tbody_tag.findAll('tr')
			#print 'prod_rows: \n', type(prod_rows), prod_rows
			for row in prod_rows:
				#print "Search row in prod_rows: ", row
				anchor = row.find('a')
				# DK uses a relative path for these links
				prod_urls.append('http://search.digikey.com' + anchor['href'])
				#print 'Adding URL: ', 'http://search.digikey.com' + anchor['href']
		
		for url in prod_urls:
		
			page = urllib2.urlopen(url)
			soup = BeautifulSoup(page)
			#print "URL: %s" % url
			# Get prices
			prices = {}
			price_table = soup.body('table', id="pricing")
			#print 'price_table: ', type(price_table), price_table
			if len(price_table) == 0:
				raise ScrapeException(VENDOR_DK, self.manufacturer_pn, 4)
			# price_table.contents[x] should be the tr tags...
			for tag in price_table:
				#print 'tag: ', type(tag), tag
				for row in tag:
					#print 'row: ', type(row), row
					# row.contents should be td Tags... except the first!
					if row == '\n':
						pass
					elif row.contents[0].name == 'th':
						pass
						#print "Found row.name == th"
					else:
						new_break_str = row.contents[0].string
						# Remove commas
						if new_break_str.isdigit() == False:
							new_break_str = new_break_str.replace(",", "")
						#print "new_break_str is: %s" % new_break_str					
						new_break = int(new_break_str)
						new_unit_price = float(row.contents[1].string)
						prices[new_break] = new_unit_price
						#print 'Adding break/price to pricing dict: ', (new_break, new_unit_price)
					
			# Get inventory
			# If the item is out of stock, the <td> that normally holds the
			# quantity available will have a text input box that we need to
			# watch out for
			inv_soup = soup.body('td', id="quantityavailable")
			#print 'inv_soup: ', type(inv_soup), inv_soup
			#print "Length of form search results: %s" % len(inv_soup[0].findAll('form'))
			if len(inv_soup[0].findAll('form')) > 0:
				inventory = 0
			
			else:
				inv_str = inv_soup[0].contents[0]
				#print 'inv_str: ', type(inv_str), inv_str
				if inv_str.isdigit() == False:
					inv_str = inv_str.replace(",", "")
				inventory = int(inv_str)
				#print 'inventory: ', type(inventory), inventory
			
			vendor_pn = soup.body("th", text="Digi-Key Part Number")[0].parent.nextSibling.contents[0].string.__str__()
			# Get manufacturer and PN
			self.manufacturer = soup.body("th", text="Manufacturer")[0].parent.nextSibling.contents[0].contents[0].string.__str__()
			#print "manufacturer is: %s" % self.manufacturer
			self.manufacturer_pn = soup.body('th', text="Manufacturer Part Number")[0].parent.nextSibling.contents[0].string.__str__()
			#print "manufacturer_pn is: %s" % self.manufacturer_pn
			
			# Get datasheet filename and download
			datasheet_soup = soup.body('th', text="Datasheets")[0].parent.nextSibling
			datasheet_anchor = datasheet_soup.findAllNext('a')[0]
			#print "datasheet_soup is: %s" % datasheet_soup
			#print "datasheet_anchor is: %s" % datasheet_anchor
			self.datasheet_url = datasheet_anchor['href']
			#print "self.datasheet_url is: %s" % self.datasheet_url
			
			row = urllib2.urlopen(urllib2.Request(self.datasheet_url))
			try:
				file_name = get_filename(url,row)
				self.datasheet = file_name;
				# TODO: Do not re-download if already saved
				if DOWNLOAD_DATASHEET:
					with open(file_name, 'wb') as f:
						shutil.copyfileobj(row,f)
			finally:
				row.close()
			#print "datasheet is: %s" % self.datasheet
			# Get remaining strings (desc, category, family, series, package)
			self.description = soup.body('th', text="Description")[0].parent.nextSibling.contents[0].string.__str__()
			#print "description is: %s" % self.description
			category = soup.body('th', text="Category")[0].parent.nextSibling.contents[0].string.__str__()
			#print "category is: %s" % category
			family = soup.body('th', text="Family")[0].parent.nextSibling.contents[0].string.__str__()
			#print "family is: %s" % family
			series = soup.body('th', text="Series")[0].parent.nextSibling.contents[0].string.__str__()
			#print "series is: %s" % series
			self.package = soup.body('th', text="Package / Case")[0].parent.nextSibling.contents[0].string.__str__()
			#print "package is: %s" % self.package
			
			packaging_soup = soup.body('th', text="Packaging")[0].parent.parent.nextSibling.contents[0]
			#print "packaging_soup: ", type(packaging_soup), packaging_soup
			if type(packaging_soup) == NavigableString:
				packaging = packaging_soup.string.__str__()
				#print "packaging (from text): ", type(packaging), packaging
			elif type(packaging_soup) == Tag:
				packaging = packaging_soup.contents[0].string.__str__()
				#print "packaging (from link): ", type(packaging), packaging
			else:
				print 'Error: DK Packaging scrape failure!'
			if "Digi-Reel" in packaging:
				packaging = "Digi-Reel"	# Remove Restricted symbol
			key = VENDOR_DK + ': ' + vendor_pn + ' (' + packaging + ')'
			self.listings[key] = Listing(VENDOR_DK, vendor_pn, self.manufacturer_pn, prices, inventory, packaging)
			#v = Listing(VENDOR_DK, vendor_pn, self.manufacturer_pn, prices, inventory, pkg, reel, cat, fam, ser)
			self.listings[key].category = category
			self.listings[key].family = family
			self.listings[key].series = series
			if "Digi-Reel" in packaging:
				self.listings[key].reel_fee = 7
Example #28
    def scrape_dk(self):
        ''' Scrape method for Digikey. '''
        # Clear previous pricing data (in case price break keys change)
        search_url = 'http://search.digikey.com/us/en/products/' + self.manufacturer_pn
        search_page = urllib2.urlopen(search_url)
        search_soup = BeautifulSoup(search_page)

        # Create a list of product URLs from the search page
        prod_urls = []
        search_table = search_soup.body('table', id="productTable")
        if len(search_table) > 0:
            product_table = search_table[0]
            #print 'product_table: \n', product_table
            #print 'product_table.contents: \n', product_table.contents

            # Find tbody tag in table
            tbody_tag = product_table.find('tbody')
            #print 'tbody: \n', type(tbody_tag), tbody_tag
            #print 'tbody.contents: \n', type(tbody_tag.contents), tbody_tag.contents
            #print 'tbody.contents[0]: \n', type(tbody_tag.contents[0]), tbody_tag.contents[0]
            prod_rows = tbody_tag.findAll('tr')
            #print 'prod_rows: \n', type(prod_rows), prod_rows
            for row in prod_rows:
                #print "Search row in prod_rows: ", row
                anchor = row.find('a')
                # DK uses a relative path for these links
                prod_urls.append('http://search.digikey.com' + anchor['href'])
                #print 'Adding URL: ', 'http://search.digikey.com' + anchor['href']

        for url in prod_urls:

            page = urllib2.urlopen(url)
            soup = BeautifulSoup(page)
            #print "URL: %s" % url
            # Get prices
            prices = {}
            price_table = soup.body('table', id="pricing")
            #print 'price_table: ', type(price_table), price_table
            if len(price_table) == 0:
                raise ScrapeException(VENDOR_DK, self.manufacturer_pn, 4)
            # price_table.contents[x] should be the tr tags...
            for tag in price_table:
                #print 'tag: ', type(tag), tag
                for row in tag:
                    #print 'row: ', type(row), row
                    # row.contents should be td Tags... except the first!
                    if row == '\n':
                        pass
                    elif row.contents[0].name == 'th':
                        pass
                        #print "Found row.name == th"
                    else:
                        new_break_str = row.contents[0].string
                        # Remove commas
                        if new_break_str.isdigit() == False:
                            new_break_str = new_break_str.replace(",", "")
                        #print "new_break_str is: %s" % new_break_str
                        new_break = int(new_break_str)
                        new_unit_price = float(row.contents[1].string)
                        prices[new_break] = new_unit_price
                        #print 'Adding break/price to pricing dict: ', (new_break, new_unit_price)

            # Get inventory
            # If the item is out of stock, the <td> that normally holds the
            # quantity available will have a text input box that we need to
            # watch out for
            inv_soup = soup.body('td', id="quantityavailable")
            #print 'inv_soup: ', type(inv_soup), inv_soup
            #print "Length of form search results: %s" % len(inv_soup[0].findAll('form'))
            if len(inv_soup[0].findAll('form')) > 0:
                inventory = 0

            else:
                inv_str = inv_soup[0].contents[0]
                #print 'inv_str: ', type(inv_str), inv_str
                if inv_str.isdigit() == False:
                    inv_str = inv_str.replace(",", "")
                inventory = int(inv_str)
                #print 'inventory: ', type(inventory), inventory

            vendor_pn = soup.body(
                "th", text="Digi-Key Part Number"
            )[0].parent.nextSibling.contents[0].string.__str__()
            # Get manufacturer and PN
            self.manufacturer = soup.body(
                "th", text="Manufacturer"
            )[0].parent.nextSibling.contents[0].contents[0].string.__str__()
            #print "manufacturer is: %s" % self.manufacturer
            self.manufacturer_pn = soup.body(
                'th', text="Manufacturer Part Number"
            )[0].parent.nextSibling.contents[0].string.__str__()
            #print "manufacturer_pn is: %s" % self.manufacturer_pn

            # Get datasheet filename and download
            datasheet_soup = soup.body('th',
                                       text="Datasheets")[0].parent.nextSibling
            datasheet_anchor = datasheet_soup.findAllNext('a')[0]
            #print "datasheet_soup is: %s" % datasheet_soup
            #print "datasheet_anchor is: %s" % datasheet_anchor
            self.datasheet_url = datasheet_anchor['href']
            #print "self.datasheet_url is: %s" % self.datasheet_url

            row = urllib2.urlopen(urllib2.Request(self.datasheet_url))
            try:
                file_name = get_filename(url, row)
                self.datasheet = file_name
                # TODO: Do not re-download if already saved
                if DOWNLOAD_DATASHEET:
                    with open(file_name, 'wb') as f:
                        shutil.copyfileobj(row, f)
            finally:
                row.close()
            #print "datasheet is: %s" % self.datasheet
            # Get remaining strings (desc, category, family, series, package)
            self.description = soup.body(
                'th', text="Description"
            )[0].parent.nextSibling.contents[0].string.__str__()
            #print "description is: %s" % self.description
            category = soup.body(
                'th', text="Category"
            )[0].parent.nextSibling.contents[0].string.__str__()
            #print "category is: %s" % category
            family = soup.body(
                'th', text="Family"
            )[0].parent.nextSibling.contents[0].string.__str__()
            #print "family is: %s" % family
            series = soup.body(
                'th', text="Series"
            )[0].parent.nextSibling.contents[0].string.__str__()
            #print "series is: %s" % series
            self.package = soup.body(
                'th', text="Package / Case"
            )[0].parent.nextSibling.contents[0].string.__str__()
            #print "package is: %s" % self.package

            packaging_soup = soup.body(
                'th',
                text="Packaging")[0].parent.parent.nextSibling.contents[0]
            #print "packaging_soup: ", type(packaging_soup), packaging_soup
            if type(packaging_soup) == NavigableString:
                packaging = packaging_soup.string.__str__()
                #print "packaging (from text): ", type(packaging), packaging
            elif type(packaging_soup) == Tag:
                packaging = packaging_soup.contents[0].string.__str__()
                #print "packaging (from link): ", type(packaging), packaging
            else:
                print 'Error: DK Packaging scrape failure!'
            if "Digi-Reel" in packaging:
                packaging = "Digi-Reel"  # Remove Restricted symbol
            key = VENDOR_DK + ': ' + vendor_pn + ' (' + packaging + ')'
            self.listings[key] = Listing(VENDOR_DK, vendor_pn,
                                         self.manufacturer_pn, prices,
                                         inventory, packaging)
            #v = Listing(VENDOR_DK, vendor_pn, self.manufacturer_pn, prices, inventory, pkg, reel, cat, fam, ser)
            self.listings[key].category = category
            self.listings[key].family = family
            self.listings[key].series = series
            if "Digi-Reel" in packaging:
                self.listings[key].reel_fee = 7
Example #29
def checkUrl(url, use_proxy=True):
    if url is None or url == "":
        return { "status": 900, "reason": "BAD-URL" }

    o = urlparse(url)
    
    resolvable = isResolvable(o.hostname)
    if not resolvable:
        return { "status": 903, "reason": "UNRESOLVABLE" }

    try:
        if o.scheme == "https":
            if use_proxy:
                conn = httplib.HTTPSConnection("explorer.bl.uk", 3127, timeout=10)
            else:
                conn = httplib.HTTPSConnection(o.netloc, timeout=10)
        else:
            if use_proxy:
                conn = httplib.HTTPConnection("explorer.bl.uk", 3127, timeout=10)
            else:
                conn = httplib.HTTPConnection(o.netloc, timeout=10)

        # Rebuild the full path inc. query etc.
        fullpath = o.path
        if o.params:
            fullpath += ';'+o.params
        if o.query:
            fullpath += '?'+o.query

        # Now make the request:
        if use_proxy:
            conn.request("GET", url )
        else:
            conn.request("GET", fullpath )
        res = conn.getresponse()
    except socket.timeout:
        return { "status": 924, "reason": "TIMEOUT" }
    except Exception as e:
        if str(e) == "[Errno 65] No route to host":
            return { "status": 903, "reason": "NOROUTE" }
        elif str(e) == "[Errno 51] Network is unreachable":
            return { "status": 903, "reason": "NETWORK-UNREACHABLE" }
        elif str(e) == "[Errno 61] Connection refused" or str(e) == "[Errno 111] Connection refused":
            return { "status": 903, "reason": "CONNECTION-REFUSED" }
        elif str(e) == "[Errno 54] Connection reset by peer":
            return { "status": 903, "reason": "CONNECTION-RESET" }
        else:
            return { "status": 903, "reason": "CONNECTION-FAILED: "+str(e) }
    
    if res.status / 100 == 3:
        location = res.getheader('location')
        state = checkUrl(location)
        status = state['status']
        reason = state['reason']
        if reason.endswith("VIA-REDIRECT+"):
            return state
        else:
            state['reason'] = reason+" VIA-REDIRECT+" 
            return state
    elif res.status / 100 == 2:
        # Get a copy, hash it, get the title and ssdeep the text
        try:
            payload = res.read()
        except Exception as e:
            return { "status": 903, "reason": "READ-FAILED: "+str(e) }

        # Clean up and grab the text:
        title = ""
        text = ""
        first_fragment = ""
        fh = None
        try:
            soup = BeautifulSoup(payload,convertEntities=BeautifulSoup.HTML_ENTITIES)
            if soup.title != None:
                title = normaliseText(soup.title.string)
            [ elem.extract() for elem in soup.findAll(['script', 'link', 'style']) ]
            comments = soup.findAll(text=lambda text:isinstance(text, Comment))
            [comment.extract() for comment in comments]
            if soup.body != None:
                texts = [ unicode(x) for x in soup.body(text=True) ]
                text =  soup.title.string + normaliseText( " ".join(texts) )
            # Just pull out the first bit:
            first_fragment = text[:200]
            # Fuzzy hash
            if text != "":
                fh = fuzzyHash(text)
        except:
            pass
        # And the binary hash:
        md5 = hashlib.md5(payload).hexdigest()
        # And return:
        return { "status": res.status, "reason": res.reason, "title": title, "first_fragment": first_fragment, "fh":fh, "md5":md5, "text":text }
    else:
        return { "status": res.status, "reason": res.reason }
Example #30
for eventString in eventFile.readlines():
    eventurl=eventString.split('---')[-1].rstrip()
    eventAbbrev=eventString.split('---')[0]
    #eventurl="http://videolectures.net/epsrcws08_sheffield/"
    
    #page = urllib2.urlopen(http://videolectures.net/epsrcws08_rasmussen_lgp/)
    page = urllib2.urlopen(eventurl)
    soup = BeautifulSoup(page)
    
    
    
    
    #get all the event names
    #full version: 
    #soup.body(name='div',attrs={"class":"author"})
    vidPageList=soup.body('div','lec_thumb_click')
    
    
    numVidPages = len(vidPageList)
    vidPageInfoList = list()
    #vidPageInfoList will be a list of dicts
    #each dict will have entries for vidpage title, and vidpage url
    
    for vidPage in vidPageList:
        #authorName=authorTag.contents[0].string
        relurl = vidPage.next.attrs[0][1]
        absurl = urllib.basejoin(baseurl,relurl)
        page   = urllib2.urlopen(absurl)
        soup   = BeautifulSoup(page)
        title  = soup.head.title.string
        title  = title.encode('ascii','ignore') 
Example #31
 def get_text(self):
     bs = BeautifulSoup(self.data, convertEntities=BeautifulSoup.HTML_ENTITIES)
     return ''.join(bs.body(text=True))
Example #32
def parse_index_html(limit = 100):
    url = 'http://ascl.net/code/all/page/1/limit/{0}/order/date/listmode/compact/dir/desc'.format(limit)
    parsed_html = BeautifulSoup(urllib2.urlopen(url))
    return ((i.find('span', attrs={'class':'ascl_id'}).text,
             i.find('span', attrs={'class':'title'}).find('a')['href'][1:])
            for i in parsed_html.body('div', attrs={'class':'item'}))
Example #33
# f.write((re.sub('\n{3,}','\n\n',a)).encode('utf-8')) 
# #f.write(a.replace('\n{3,}', '\n').encode('utf-8')) 
# 
# f.close()
for i in range(1,9):
    if i != 6:
        path="/Users/fyelles/Desktop/man-html-20111120/htmlman%s/"% ( str(i))  # insert the path to the directory of interest
        dirList=os.listdir(path)
        for fname in dirList:
            if fname.endswith(".html"):
                content = False
                print "\nReading",fname
                newstring='.'.join(fname.split('.')[0:-1])+'.txt'
                f = open(path+fname, 'r')
                content  =  f.read() 
                f.close()    
                soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                c =  ''.join(soup.body(text=True))
                f = open(path+newstring, 'w')
                f.write((re.sub('\n{3,}','\n\n',c)).encode('utf-8')) 
                f.close()
                print newstring, " done !"

def main():
    pass


if __name__ == '__main__':
    main()