import re
import urllib2

from BeautifulSoup import BeautifulSoup


def scrape_scratch(query):
    '''Scrape scratch.mit.edu for the total project count.'''
    page = urllib2.urlopen(query).read()
    soup = BeautifulSoup(page)
    # The count appears in text such as "1,234,567 projects uploaded,".
    regex = re.compile(r'([0-9,]+)\s+projects\s+uploaded,')
    count_part = soup.firstText(regex)
    if count_part:
        count = regex.match(count_part.strip()).group(1)
        return int(count.replace(',', ''))
    else:
        raise Exception("No project count found at Scratch!")
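# A minimal usage sketch; the URL is an assumption (the original snippet
# does not say which Scratch page carries the count):
if __name__ == '__main__':
    print scrape_scratch('http://scratch.mit.edu/')  # hypothetical stats page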
import httplib
import re
import urllib

from BeautifulSoup import BeautifulSoup


def search(self, terms, limit=10):
    """
    @brief Searches Google Scholar using the specified terms.

    Returns a list of dictionaries. Each dictionary contains the
    information related to one article:
        "URL"         : link to the article
        "Title"       : title of the publication
        "Authors"     : authors (example: DF Easton, DT Bishop, D Ford)
        "JournalYear" : journal name & year (example: Nature, 2001)
        "JournalURL"  : link to the journal main website (example: www.nature.com)
        "Abstract"    : abstract of the publication
        "NumCited"    : number of times the publication is cited
        "Terms"       : list of search terms used in the query

    @param terms List of search terms
    @param limit Maximum number of results to be returned (default=10)
    @return List of results; this is the empty list if nothing is found
    """
    params = urllib.urlencode({'q': "+".join(terms), 'num': limit})
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    url = self.SEARCH_BASE_URL + "?" + params
    conn = httplib.HTTPConnection(self.SEARCH_HOST)
    conn.request("GET", url, {}, headers)
    resp = conn.getresponse()
    if resp.status != 200:
        print "ERROR:", resp.status, resp.reason
        return []

    html = resp.read().decode('ascii', 'ignore')
    results = []
    # Screen-scrape the result page to obtain the publication information.
    soup = BeautifulSoup(html)
    citations = 0
    for record in soup('p', {'class': 'g'}):  # includes error checking
        topPart = record.first('span', {'class': 'w'})
        pubURL = topPart.a['href']
        # Clean up the URL: make sure it contains '/' rather than '\'.
        pubURL = pubURL.replace('\\', '/')
        pubTitle = ""
        for part in topPart.a.contents:
            pubTitle += str(part)
        if pubTitle == "":
            # [CITATION] entries carry the title outside the anchor text.
            match1 = re.findall(r'<b>\[CITATION\]</b></font>(.*)- <a', str(record))
            match2 = re.split('- <a', match1[citations])
            pubTitle = re.sub(r'</?(\S)+>', "", match2[0])
            citations += 1

        authorPart = record.first('font', {'color': 'green'}).string
        if authorPart is None:
            authorPart = ''
            # Sometimes even BeautifulSoup can fail; fall back to a regex.
            m = re.findall('<font color="green">(.*)</font>', str(record))
            if len(m) > 0:
                authorPart = m[0]
        # Assume that the fields are delimited by ' - ': the first entry is
        # the list of authors, the last entry is the journal URL, and
        # anything in between is the journal name and year.
        idx_start = authorPart.find(' - ')
        idx_end = authorPart.rfind(' - ')
        pubAuthors = authorPart[:idx_start]
        pubJournalYear = authorPart[idx_start + 3:idx_end]
        pubJournalURL = authorPart[idx_end + 3:]
        # If only one ' - ' is found and the trailing field matches
        # '\d\d\d\d', that field is the journal year, not the journal URL.
        if pubJournalYear == '' and re.search(r'\d\d\d\d', pubJournalURL) is not None:
            pubJournalYear = pubJournalURL
            pubJournalURL = ''

        # This can fail if the whole abstract fits in the space provided,
        # so that no '...' is found.
        delimiter = soup.firstText("...").parent
        pubAbstract = ""
        while delimiter is not None and (str(delimiter) != '<b>...</b>' or pubAbstract == ""):
            pubAbstract += str(delimiter)
            delimiter = delimiter.nextSibling
        pubAbstract += '<b>...</b>'

        match = re.search("Cited by ([^<]*)", str(record))
        pubCitation = ''
        if match is not None:
            pubCitation = match.group(1)
        results.append({
            "URL": pubURL,
            "Title": pubTitle,
            "Authors": pubAuthors,
            "JournalYear": pubJournalYear,
            "JournalURL": pubJournalURL,
            "Abstract": pubAbstract,
            "NumCited": pubCitation,
            "Terms": terms
        })
    return results
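# A minimal usage sketch, assuming `search` is meant to live on a class
# that defines SEARCH_HOST and SEARCH_BASE_URL. The class name and both
# values below are assumptions, not taken from the original:
class ScholarSearcher(object):  # hypothetical wrapper
    SEARCH_HOST = 'scholar.google.com'   # assumed host
    SEARCH_BASE_URL = '/scholar'         # assumed path
    search = search                      # reuse the function above as a method

for pub in ScholarSearcher().search(['breast', 'cancer'], limit=5):
    print pub['Title'], '-', pub['NumCited']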
import httplib
import re
import time
import urllib

from BeautifulSoup import BeautifulSoup


def search(self, terms, limit=10):
    """Paginated variant: walks the Google Scholar results ten at a time,
    restricted to publications up to 2008 ('as_yhi'), sleeping between
    requests."""
    start = 0
    results = []
    while start + 10 <= limit:
        params = urllib.urlencode({'q': "+".join(terms), 'as_yhi': 2008, 'start': start})
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        url = self.SEARCH_BASE_URL + "?" + params
        conn = httplib.HTTPConnection(self.SEARCH_HOST)
        conn.request("GET", url, {}, headers)
        resp = conn.getresponse()
        if resp.status == 200:
            html = resp.read().decode('ascii', 'ignore')
            # Screen-scrape the result page to obtain the publication information.
            soup = BeautifulSoup(html)
            citations = 0
            for record in soup('div', {'class': 'gs_r'}):  # includes error checking
                topPart = record.first('h3')
                pubURL = topPart.a['href']
                # Clean up the URL: make sure it contains '/' rather than '\'.
                pubURL = pubURL.replace('\\', '/')
                pubTitle = ""
                for part in topPart.a.contents:
                    pubTitle += str(part.string or '')
                if pubTitle == "":
                    # [CITATION] entries carry the title outside the anchor text.
                    match1 = re.findall(r'<b>\[CITATION\]</b></font>(.*)- <a', str(record))
                    match2 = re.split('- <a', match1[citations])
                    pubTitle = re.sub(r'</?(\S)+>', "", match2[0])
                    citations += 1

                authorPart = record.first('span', {'class': 'gs_a'}).string
                if authorPart is None:
                    # Fall back to a regex when the span has nested markup.
                    authorPart = re.search('<span class="gs_a">(.*?)</span>', str(record)).group(1)
                # Assume that the fields are delimited by ' - ': the first entry is
                # the list of authors, the last entry is the journal URL, and
                # anything in between is the journal name and year.
                idx_start = authorPart.find(' - ')
                idx_end = authorPart.rfind(' - ')
                pubAuthors = authorPart[:idx_start]
                yearMatch = re.search(r'\d{4}', authorPart[idx_start + 3:idx_end])
                pubJournalYear = yearMatch.group(0) if yearMatch else ''
                pubJournalURL = authorPart[idx_end + 3:]
                # If only one ' - ' is found and the trailing field matches
                # '\d\d\d\d', that field is the journal year, not the journal URL.
                if pubJournalYear == '' and re.search(r'\d\d\d\d', pubJournalURL) is not None:
                    pubJournalYear = pubJournalURL
                    pubJournalURL = ''

                # This can fail if the whole abstract fits in the space provided,
                # so that no '...' is found.
                delimiter = soup.firstText("...").parent
                pubAbstract = ""
                while delimiter is not None and (str(delimiter) != '<b>...</b>' or pubAbstract == ""):
                    pubAbstract += str(delimiter)
                    delimiter = delimiter.nextSibling
                pubAbstract += '<b>...</b>'

                match = re.search("Cited by ([^<]*)", str(record))
                pubCitation = ''
                if match is not None:
                    pubCitation = match.group(1)
                results.append({
                    "URL": pubURL,
                    "Title": pubTitle,
                    "Authors": pubAuthors,
                    "JournalYear": pubJournalYear,
                    "JournalURL": pubJournalURL,
                    "Abstract": pubAbstract,
                    "NumCited": pubCitation,
                    "Terms": terms
                })
        else:
            print "ERROR:", resp.status, resp.reason
            return []
        start += 10
        time.sleep(3)  # be polite between result pages
    return results
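# Usage sketch for the paginated variant: with the same hypothetical
# ScholarSearcher wrapper as above, this fetches 30 results in three
# pages of ten, sleeping 3 seconds between requests:
results = ScholarSearcher().search(['p53', 'apoptosis'], limit=30)
print '%d records fetched' % len(results)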
import re

from cookielib import CookieJar
from mechanize import Browser
from BeautifulSoup import BeautifulSoup as Soup

# Assumes `args` (parsed command-line arguments), `output` (the result
# dict), verbose() and cookiejar_to_list() are defined elsewhere in the
# script.

cookiejar = CookieJar()
browser = Browser()
browser.set_cookiejar(cookiejar)

verbose('Opening CAS login page')
response = browser.open('https://login.gatech.edu/cas/login')
verbose(response.get_data())

# The CAS login form has id="fm1", so select it by predicate.
browser.select_form(predicate=lambda f: 'id' in f.attrs and f.attrs['id'] == 'fm1')
browser['username'] = args.username
browser['password'] = args.password

verbose('Logging into CAS')
response = browser.submit()
verbose(response.get_data())

# CAS announces success in page text such as "Log In Successful".
soup = Soup(response.get_data())
success = soup.firstText(re.compile('(.*)log(.*)success(.*)', re.IGNORECASE)) is not None
if not success:
    output['status'] = 401
    output['errors'] = map(lambda x: x.string, soup.findAll(attrs={'class': 'errors'}))
    raise Exception("Login failed")

verbose('Logging into T-Square')
response = browser.open('https://t-square.gatech.edu')
verbose(response.get_data())

cookies = cookiejar_to_list(cookiejar)
output['cookies'] = cookies
verbose(cookies)

verbose('Loading session info')
response = browser.open('https://t-square.gatech.edu/direct/session/current.json')
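# cookiejar_to_list() is called above but not defined in the snippet; a
# plausible sketch (an assumption, not the original helper) that turns
# each cookie into a plain dict suitable for JSON output:
def cookiejar_to_list(jar):
    return [{'name': c.name, 'value': c.value,
             'domain': c.domain, 'path': c.path} for c in jar]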
import urllib2

from BeautifulSoup import BeautifulSoup
from pymongo import Connection

connection = Connection()
db = connection.placemillDB
citiesCollection = db.places
categoriesCollection = db.categories

f = urllib2.urlopen('http://www.yelp.com/developers/documentation/neighborhood_list')
raw = f.read()
soup = BeautifulSoup(raw)

# Find the "USA" heading, then the <ul> of cities that follows it.
usa = soup.firstText('USA').findNext('ul')

# Get the list of cities in the USA (direct <li> children only).
usaCities = usa.findAll('li', recursive=False)
# Take out the <li> tags, keeping just the city names.
for i in range(len(usaCities)):
    usaCities[i] = usaCities[i].text

# Get the list of neighborhoods in each city.
places = {}
for i in usaCities:
    temp = usa.firstText(i).findNext('ul').findAll('li')
    for j in range(len(temp)):
        temp[j] = temp[j].text
    places[i] = temp
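# citiesCollection is opened above but never written to in the snippet; a
# minimal sketch of persisting the scraped neighborhoods (the document
# shape is an assumption):
for city, neighborhoods in places.items():
    citiesCollection.insert({'city': city, 'neighborhoods': neighborhoods})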