import re
import urllib2

from BeautifulSoup import BeautifulSoup


def scrape_scratch(query):
    '''Scrape scratch.mit.edu for the total project count.'''
    page = urllib2.urlopen(query).read()
    soup = BeautifulSoup(page)
    # The count appears in text such as "1,234,567 projects uploaded,".
    regex = re.compile(r'([0-9,]+)\s+projects\s+uploaded,')
    count_part = soup.firstText(regex)
    if count_part:
        count = regex.match(count_part.strip()).group(1)
        return int(count.replace(',', ''))
    else:
        raise Exception("No project count found at Scratch!")
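# A minimal usage sketch; the URL is an assumption (the original snippet
# does not say which Scratch page carries the count):
if __name__ == '__main__':
    print scrape_scratch('http://scratch.mit.edu/')  # hypothetical stats page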
import httplib
import re
import urllib

from BeautifulSoup import BeautifulSoup


def search(self, terms, limit=10):
    """
    @brief Searches Google Scholar using the specified terms.

    Returns a list of dictionaries. Each dictionary contains the
    information related to one article:
        "URL"         : link to the article
        "Title"       : title of the publication
        "Authors"     : authors (example: DF Easton, DT Bishop, D Ford)
        "JournalYear" : journal name & year (example: Nature, 2001)
        "JournalURL"  : link to the journal main website (example: www.nature.com)
        "Abstract"    : abstract of the publication
        "NumCited"    : number of times the publication is cited
        "Terms"       : list of search terms used in the query

    @param terms List of search terms
    @param limit Maximum number of results to be returned (default=10)
    @return List of results; this is the empty list if nothing is found
    """
    params = urllib.urlencode({'q': "+".join(terms), 'num': limit})
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    url = self.SEARCH_BASE_URL + "?" + params
    conn = httplib.HTTPConnection(self.SEARCH_HOST)
    conn.request("GET", url, {}, headers)
    resp = conn.getresponse()
    if resp.status != 200:
        print "ERROR:", resp.status, resp.reason
        return []

    html = resp.read().decode('ascii', 'ignore')
    results = []
    # Screen-scrape the result page to obtain the publication information.
    soup = BeautifulSoup(html)
    citations = 0
    for record in soup('p', {'class': 'g'}):  # includes error checking
        topPart = record.first('span', {'class': 'w'})
        pubURL = topPart.a['href']
        # Clean up the URL: make sure it contains '/' rather than '\'.
        pubURL = pubURL.replace('\\', '/')
        pubTitle = ""
        for part in topPart.a.contents:
            pubTitle += str(part)
        if pubTitle == "":
            # [CITATION] entries carry the title outside the anchor text.
            match1 = re.findall(r'<b>\[CITATION\]</b></font>(.*)- <a', str(record))
            match2 = re.split('- <a', match1[citations])
            pubTitle = re.sub(r'</?(\S)+>', "", match2[0])
            citations += 1

        authorPart = record.first('font', {'color': 'green'}).string
        if authorPart is None:
            authorPart = ''
            # Sometimes even BeautifulSoup can fail; fall back to a regex.
            m = re.findall('<font color="green">(.*)</font>', str(record))
            if len(m) > 0:
                authorPart = m[0]
        # Assume that the fields are delimited by ' - ': the first entry is
        # the list of authors, the last entry is the journal URL, and
        # anything in between is the journal name and year.
        idx_start = authorPart.find(' - ')
        idx_end = authorPart.rfind(' - ')
        pubAuthors = authorPart[:idx_start]
        pubJournalYear = authorPart[idx_start + 3:idx_end]
        pubJournalURL = authorPart[idx_end + 3:]
        # If only one ' - ' is found and the trailing field matches
        # '\d\d\d\d', that field is the journal year, not the journal URL.
        if pubJournalYear == '' and re.search(r'\d\d\d\d', pubJournalURL) is not None:
            pubJournalYear = pubJournalURL
            pubJournalURL = ''

        # This can fail if the whole abstract fits in the space provided,
        # so that no '...' is found.
        delimiter = soup.firstText("...").parent
        pubAbstract = ""
        while delimiter is not None and (str(delimiter) != '<b>...</b>' or pubAbstract == ""):
            pubAbstract += str(delimiter)
            delimiter = delimiter.nextSibling
        pubAbstract += '<b>...</b>'

        match = re.search("Cited by ([^<]*)", str(record))
        pubCitation = ''
        if match is not None:
            pubCitation = match.group(1)
        results.append({
            "URL": pubURL,
            "Title": pubTitle,
            "Authors": pubAuthors,
            "JournalYear": pubJournalYear,
            "JournalURL": pubJournalURL,
            "Abstract": pubAbstract,
            "NumCited": pubCitation,
            "Terms": terms
        })
    return results
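# A minimal usage sketch, assuming `search` is meant to live on a class
# that defines SEARCH_HOST and SEARCH_BASE_URL. The class name and both
# values below are assumptions, not taken from the original:
class ScholarSearcher(object):  # hypothetical wrapper
    SEARCH_HOST = 'scholar.google.com'   # assumed host
    SEARCH_BASE_URL = '/scholar'         # assumed path
    search = search                      # reuse the function above as a method

for pub in ScholarSearcher().search(['breast', 'cancer'], limit=5):
    print pub['Title'], '-', pub['NumCited']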
import httplib
import re
import time
import urllib

from BeautifulSoup import BeautifulSoup


def search(self, terms, limit=10):
    """Paginated variant: walks the Google Scholar results ten at a time,
    restricted to publications up to 2008 ('as_yhi'), sleeping between
    requests."""
    start = 0
    results = []
    while start + 10 <= limit:
        params = urllib.urlencode({'q': "+".join(terms), 'as_yhi': 2008, 'start': start})
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        url = self.SEARCH_BASE_URL + "?" + params
        conn = httplib.HTTPConnection(self.SEARCH_HOST)
        conn.request("GET", url, {}, headers)
        resp = conn.getresponse()
        if resp.status == 200:
            html = resp.read().decode('ascii', 'ignore')
            # Screen-scrape the result page to obtain the publication information.
            soup = BeautifulSoup(html)
            citations = 0
            for record in soup('div', {'class': 'gs_r'}):  # includes error checking
                topPart = record.first('h3')
                pubURL = topPart.a['href']
                # Clean up the URL: make sure it contains '/' rather than '\'.
                pubURL = pubURL.replace('\\', '/')
                pubTitle = ""
                for part in topPart.a.contents:
                    pubTitle += str(part.string or '')
                if pubTitle == "":
                    # [CITATION] entries carry the title outside the anchor text.
                    match1 = re.findall(r'<b>\[CITATION\]</b></font>(.*)- <a', str(record))
                    match2 = re.split('- <a', match1[citations])
                    pubTitle = re.sub(r'</?(\S)+>', "", match2[0])
                    citations += 1

                authorPart = record.first('span', {'class': 'gs_a'}).string
                if authorPart is None:
                    # Fall back to a regex when the span has nested markup.
                    authorPart = re.search('<span class="gs_a">(.*?)</span>', str(record)).group(1)
                # Assume that the fields are delimited by ' - ': the first entry is
                # the list of authors, the last entry is the journal URL, and
                # anything in between is the journal name and year.
                idx_start = authorPart.find(' - ')
                idx_end = authorPart.rfind(' - ')
                pubAuthors = authorPart[:idx_start]
                yearMatch = re.search(r'\d{4}', authorPart[idx_start + 3:idx_end])
                pubJournalYear = yearMatch.group(0) if yearMatch else ''
                pubJournalURL = authorPart[idx_end + 3:]
                # If only one ' - ' is found and the trailing field matches
                # '\d\d\d\d', that field is the journal year, not the journal URL.
                if pubJournalYear == '' and re.search(r'\d\d\d\d', pubJournalURL) is not None:
                    pubJournalYear = pubJournalURL
                    pubJournalURL = ''

                # This can fail if the whole abstract fits in the space provided,
                # so that no '...' is found.
                delimiter = soup.firstText("...").parent
                pubAbstract = ""
                while delimiter is not None and (str(delimiter) != '<b>...</b>' or pubAbstract == ""):
                    pubAbstract += str(delimiter)
                    delimiter = delimiter.nextSibling
                pubAbstract += '<b>...</b>'

                match = re.search("Cited by ([^<]*)", str(record))
                pubCitation = ''
                if match is not None:
                    pubCitation = match.group(1)
                results.append({
                    "URL": pubURL,
                    "Title": pubTitle,
                    "Authors": pubAuthors,
                    "JournalYear": pubJournalYear,
                    "JournalURL": pubJournalURL,
                    "Abstract": pubAbstract,
                    "NumCited": pubCitation,
                    "Terms": terms
                })
        else:
            print "ERROR:", resp.status, resp.reason
            return []
        start += 10
        time.sleep(3)  # be polite between result pages
    return results
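# Usage sketch for the paginated variant: with the same hypothetical
# ScholarSearcher wrapper as above, this fetches 30 results in three
# pages of ten, sleeping 3 seconds between requests:
results = ScholarSearcher().search(['p53', 'apoptosis'], limit=30)
print '%d records fetched' % len(results)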
import re

from cookielib import CookieJar
from mechanize import Browser
from BeautifulSoup import BeautifulSoup as Soup

# Assumes `args` (parsed command-line arguments), `output` (the result
# dict), verbose() and cookiejar_to_list() are defined elsewhere in the
# script.

cookiejar = CookieJar()
browser = Browser()
browser.set_cookiejar(cookiejar)

verbose('Opening CAS login page')
response = browser.open('https://login.gatech.edu/cas/login')
verbose(response.get_data())

# The CAS login form has id="fm1", so select it by predicate.
browser.select_form(predicate=lambda f: 'id' in f.attrs and f.attrs['id'] == 'fm1')
browser['username'] = args.username
browser['password'] = args.password

verbose('Logging into CAS')
response = browser.submit()
verbose(response.get_data())

# CAS announces success in page text such as "Log In Successful".
soup = Soup(response.get_data())
success = soup.firstText(re.compile('(.*)log(.*)success(.*)', re.IGNORECASE)) is not None
if not success:
    output['status'] = 401
    output['errors'] = map(lambda x: x.string, soup.findAll(attrs={'class': 'errors'}))
    raise Exception("Login failed")

verbose('Logging into T-Square')
response = browser.open('https://t-square.gatech.edu')
verbose(response.get_data())

cookies = cookiejar_to_list(cookiejar)
output['cookies'] = cookies
verbose(cookies)

verbose('Loading session info')
response = browser.open('https://t-square.gatech.edu/direct/session/current.json')
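# cookiejar_to_list() is called above but not defined in the snippet; a
# plausible sketch (an assumption, not the original helper) that turns
# each cookie into a plain dict suitable for JSON output:
def cookiejar_to_list(jar):
    return [{'name': c.name, 'value': c.value,
             'domain': c.domain, 'path': c.path} for c in jar]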
import urllib2

from BeautifulSoup import BeautifulSoup
from pymongo import Connection

connection = Connection()
db = connection.placemillDB
citiesCollection = db.places
categoriesCollection = db.categories

f = urllib2.urlopen('http://www.yelp.com/developers/documentation/neighborhood_list')
raw = f.read()
soup = BeautifulSoup(raw)

# Find the "USA" heading, then the <ul> of cities that follows it.
usa = soup.firstText('USA').findNext('ul')

# Get the list of cities in the USA (direct <li> children only).
usaCities = usa.findAll('li', recursive=False)
# Take out the <li> tags, keeping just the city names.
for i in range(len(usaCities)):
    usaCities[i] = usaCities[i].text

# Get the list of neighborhoods in each city.
places = {}
for i in usaCities:
    temp = usa.firstText(i).findNext('ul').findAll('li')
    for j in range(len(temp)):
        temp[j] = temp[j].text
    places[i] = temp
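# citiesCollection is opened above but never written to in the snippet; a
# minimal sketch of persisting the scraped neighborhoods (the document
# shape is an assumption):
for city, neighborhoods in places.items():
    citiesCollection.insert({'city': city, 'neighborhoods': neighborhoods})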