Example #1
0
def parseLyrics(lyricList,outlist,s,e):
	"""Scrape darklyrics.com lyrics for bands s..e (1-based) of lyricList.

	Args:
		lyricList: dict mapping band name -> band page URL on darklyrics.com.
		outlist: dict to fill with band name -> raw lyrics text.
		s, e: inclusive 1-based range of bands (in dict iteration order)
			to process on this call -- lets the work be split into chunks.

	Returns:
		outlist, with one entry per processed band ("Manual" when no
		album div was found and the band needs manual handling).
	"""
	baseURL = u'http://www.darklyrics.com' 
	i = 0 ;
	for key in lyricList :
		i = i + 1 ;
		if(i >= s and i<= e):
			#key = 'In Flames'  # REMOVE FOR 100 Bands
			# Be polite to the server: at most one request per second.
			time.sleep(1)
			turl = lyricList[key] ;
			print 'Looking up band ' + key
			#print turl
			# Fake a browser user-agent so the site serves the page.
			opener = urllib2.build_opener()
			opener.addheaders = [('User-agent', 'Mozilla/5.0')]
			page = opener.open(turl)
			soup = BeautifulSoup(page.read())
			divs = soup.findChildren('div',attrs={"class" : "album"})
			#get the sub-URL to the lyrics of the latest album and then full URL to the lyrics source
			if(len(divs)>0):
				# Last album div = latest album; its first anchor links the lyrics page.
				sub_url =  divs[len(divs)-1].findChildren('a')[0]['href']
				# Drop the '#fragment' and the leading '..' to build an absolute URL.
				lurl = baseURL + sub_url.split('#')[0][2:]
				#print lurl
				# hit the URL and get data
				page = opener.open(lurl)
				soup = BeautifulSoup(page.read())
				lydiv = soup.findChildren('div',attrs={"class" : "lyrics"})[0]
				# Strip nested divs (notes/ads) so only lyric text remains.
				[x.extract() for x in lydiv('div')]
				#lyrictext = re.sub('\'lydiv.text ;
				rly = getRawLyrics(lydiv) 
			else:
				# No album section found -- flag this band for manual lookup.
				rly = "Manual"
				print rly
			outlist[key] = rly
		#break ; # remove once started full testing
	print 'done' , s, ' to ', e	
	return outlist
Example #2
0
    def parse_sefer_file(self, f, sefer_idx=0):
        """Parse the Mechon Mamre sefer (book) HTML file pointed to by f.

        Args:
            f: file handle to a Mechon Mamre HTML file.
            sefer_idx: zero-based index of this sefer, used to build ids.

        Returns:
            A torah_model.Sefer whose stream holds Perek, PasukStart and
            PasukFragment elements reconstructed from the page.
        """

        parsed = BeautifulSoup(f)
        # The page's <h1> holds the sefer title.
        seforim = parsed.findChildren(name='h1')
        sefer_id = 'sefer_%d' % sefer_idx
        sefer = torah_model.Sefer(seforim[0].text, sefer_idx, sefer_id)

        perek_idx = -1
        pasuk_idx = -1
        current_perek = None
        # Each <b> tag is a "perek,pasuk" reference heading.
        perek_pasuk_children = parsed.findChildren(name='b')

        for i, pp in enumerate(perek_pasuk_children):
            perek_str, pasuk_str = pp.text.split(',')

            # May need to open a new perek.
            if (current_perek is None or current_perek.perek != perek_str):
                perek_idx += 1
                pasuk_idx = -1  # Reset pasuk counter
                perek_id = 'perek_%d:%d' % (sefer_idx, perek_idx)
                current_perek = torah_model.Perek(perek_str, perek_idx,
                                                  perek_id)
                sefer.append_to_stream(current_perek)

            # Mark the start of a new pasuk.
            pasuk_idx += 1
            pasuk_id = 'pasuk_%d:%d:%d' % (sefer_idx, perek_idx, pasuk_idx)
            # TODO: how do we handle the case the petuha/setumah
            # come in the middle of a pasuk?
            current_pasuk = torah_model.PasukStart(pasuk_str, pasuk_idx,
                                                   pasuk_id)
            current_perek.append_to_stream(current_pasuk)

            # Iterate over the text of the pasuk, until we hit the next one.
            sib = pp.nextSibling
            current_pasuk_fragment = torah_model.PasukFragment(pasuk_id)
            while sib:
                tag_name = self.get_tag_name(sib)
                if tag_name and tag_name.lower() in self.BREAKING_TAGS:
                    # A breaking tag ends this pasuk's text run.
                    break
                elif tag_name:
                    # Inline formatting tag: wrap its text as FormattedText.
                    child = torah_model.FormattedText.from_tag_name(
                        sib.text, tag_name)
                    current_pasuk_fragment.append_to_stream(child)
                else:
                    # Bare text node; may contain parsha delimiters to split on.
                    text = unicode(sib)
                    children = self.split_on_parsha_delimiter(text)
                    for c in children:
                        current_pasuk_fragment.append_to_stream(c)

                # Move to next sibling
                sib = sib.nextSibling
            current_perek.append_to_stream(current_pasuk_fragment)

        return sefer
Example #3
0
def parseYoutube(url):
    """Resolve a YouTube URL into a list of playable video entries.

    NOTE(review): as visible here the function only splits the URL, prints
    it, and defines (but never calls) parseYoutubePlaylist, so it implicitly
    returns None -- the dispatch on path/query appears to be truncated.
    """
    videos = []
    link = urllib2.urlparse.urlsplit(url)
    query = link.query
    netloc = link.netloc
    path = link.path

    print "youtube url : " + url

    def parseYoutubePlaylist(playlistId):
        # Fetch the GData feed for the playlist and recurse into each video.
        videos = []
        yturl = 'http://gdata.youtube.com/feeds/api/playlists/' + playlistId
        try:
            response = net.http_GET(yturl)
            html = response.content
        except urllib2.HTTPError, e:
            # Best-effort: return whatever was collected so far (empty).
            print "HTTPError : " + str(e)
            return videos

        soup = BeautifulSoup(html)

        for video in soup.findChildren('media:player'):
            videoUrl = str(video['url'])
            print "youtube video : " + videoUrl
            videos += parseYoutube(videoUrl)
        return videos
def first_paragraph_filter(t):
    """Return the first <p> element of *t* as markup, or *t* unchanged when it has none."""
    parsed = BeautifulSoup(t)
    paragraphs = parsed.findChildren('p')
    if not paragraphs:
        return t
    return unicode(paragraphs[0])
def first_paragraph_filter(t):
    """Return the first <p> element of *t* as markup, or *t* unchanged if none.

    NOTE(review): this is an exact duplicate of the definition directly
    above; the second definition shadows the first at import time.
    """
    b = BeautifulSoup(t)
    paras = b.findChildren('p')
    if paras:
        return unicode(paras[0])
    else:
        return t
    def handle(self, *args, **options):
        """Scrape the German Wikipedia list of vehicle nationality codes and
        store each country's German name, keyed by its two-letter shortcut.
        """
        response = urllib2.urlopen('http://de.wikipedia.org/wiki/Liste_der_Kfz-Nationalit%C3%A4tszeichen')
        soup = BeautifulSoup(response)
        tables = soup.findChildren('table')

        my_table = tables[0]        
        rows = my_table.findChildren('tr')

        for row in rows:
            cells = row.findChildren('td') 
            value1 = ''
            value = ''           
            # Column 1 holds the linked German country name,
            # column 2 the two-letter country shortcut.
            for idx,cell in enumerate(cells):                
                if idx == 1:                    
                    for a in cell.findChildren('a'):
                        value1 = a.string                        
                if idx ==2:                    
                    value = cell.string 
            try:                
                country = Country.objects.get(shortcut_two=value)
                country.german_name = value1
                country.save()
            except Exception, e:
                # Best-effort: rows without a matching Country are printed and skipped.
                print value
                print e               
                pass
Example #7
0
def parseYoutube( url ):
   """Resolve a YouTube URL into a list of playable video entries.

   NOTE(review): as visible here the function only splits the URL, prints
   it, and defines (but never calls) parseYoutubePlaylist, so it implicitly
   returns None -- the dispatch on path/query appears to be truncated.
   """
   videos = []
   link = urllib2.urlparse.urlsplit( url )
   query = link.query
   netloc = link.netloc
   path = link.path

   print "youtube url : " + url

   def parseYoutubePlaylist( playlistId ):
      # Fetch the GData feed for the playlist and recurse into each video.
      videos = []
      yturl = 'http://gdata.youtube.com/feeds/api/playlists/' + playlistId
      try:
         response = net.http_GET( yturl )
         html = response.content
      except urllib2.HTTPError, e:
         # Best-effort: return whatever was collected so far (empty).
         print "HTTPError : " + str( e )
         return videos

      soup = BeautifulSoup( html )

      for video in soup.findChildren( 'media:player' ):
         videoUrl = str( video[ 'url' ] )
         print "youtube video : " + videoUrl
         videos += parseYoutube( videoUrl )
      return videos
Example #8
0
def FormatHtml(f, idx):
	"""Print a numbered citation list parsed from a search-results page.

	Args:
		f: markup (file or string) of the results page.
		idx: running result counter carried over from previous pages.

	Returns:
		The updated counter after printing this page's results.
	"""
	#results = d.find(id='bodyMainResults')
	# Only parse the results div to keep the soup small.
	resultDiv = SoupStrainer('div', id='bodyMainResults')
	res = BeautifulSoup(f, parseOnlyThese=resultDiv)

	#tables = res.findChildren('table', attrs={'class':'resultRow'})
	#tables = res.contents[0]
	tables = res.findChildren('table', attrs={'cellspacing':'0','cellpadding':'10'})

	for tab in tables:
		# First anchor: result link; its span holds the article title,
		# possibly split over several child strings.
		a = tab.find('a')
		link = a['href']
		span = a.findChild('span')
		#print span.contents
		#article = span.contents[0]  
		article = ' '.join([s.string for s in span.contents if s.string])
		# The four <i> tags hold journal, volume, publication date and pages.
		iList = tab.findAll('i')
		journal = iList[0].contents[0]
		volumn = iList[1].contents[0]
		pubDate = iList[2].contents[0]
		pages = iList[3].contents[0]
		tds = [td for td in tab.contents]
		# NOTE(review): author extraction depends on the exact child layout
		# (contents[10]) of this cell -- fragile against markup changes.
		item = tds[1].find('td', attrs={'align':'left','width':'95%','colspan':'2'})
		author = item.contents[10]
		#td1 = tds[1]
		#author = td1.contents[10]
		idx += 1
		#	print author
		print "[", idx, "]", "\n\t", link, "\n\t", article, "\n\t", author, "\n\t", journal, "\n\t", volumn, "\n\t", pages, "\n"
		#print "[", idx, "]", "\n\t", article, "\n\t", journal, "\n\t", volumn, "\n\t", pages, "\n"
	print "FETCH page, to ", idx
	return idx
Example #9
0
def zemi_domejni(stranici):
    """Fetch the registered domains from Marnet.

    Args:
        stranici: up-to-date list of page links taken from the main
            Marnet index page.

    Returns:
        List of domain names, or [] when today's data was already collected.

    Raises:
        RuntimeError: when running outside GAE (no database backend).
    """
    # Skip the scrape entirely if today's data was already collected.
    if gae:
        conn = rdbms.connect(instance=_INSTANCE_NAME, database='domaininfo')
    else:
        # BUG FIX: this branch used to 'pass' with the sqlite fallback
        # commented out, so the later conn.cursor() died with an
        # UnboundLocalError. Fail loudly with a clear message instead.
        raise RuntimeError('sqlite3 fallback is disabled (GAE quirk); '
                           'zemi_domejni only runs on GAE')
        #conn = sqlite3.connect("db/domaininfo.sqlite3") #GAEquirk
    c = conn.cursor()
    c.execute('select count(*) from domaininfo where date=%s',(deneska,))
    if c.fetchone()[0] != 0:
        return []

    domejni = []
    for link in stranici:
        logging.info(u"Sobiram %s" % link)

        req = urllib2.Request(u'http://dns.marnet.net.mk/' + link)
        res = urllib2.urlopen(req)
        strana = res.read()
        soup = BeautifulSoup(strana)

        domejn_linkovi = soup.findChildren('a',{'class':'do'})

        for domejn in domejn_linkovi:
            # Only anchors pointing at a concrete domain carry 'dom=' in href.
            if domejn['href'].find('dom=') != -1:
                domejni.append(domejn['href'].replace('registar.php?dom=',''))

    return domejni
    def handle(self, *args, **options):
        """Scrape countryareacode.net and create Country rows with English
        name plus 2- and 3-letter shortcuts.

        NOTE(review): a Country row is created for EVERY table row, header
        and blank rows included -- confirm stray empty rows are acceptable.
        """
        response = urllib2.urlopen('http://www.countryareacode.net/')
        soup = BeautifulSoup(response)
        tables = soup.findChildren('table')
        my_table = tables[0]        
        rows = my_table.findChildren('tr')

        for row in rows:
            country = Country.objects.create()       
            cells = row.findChildren('td')
            for cell in cells:
                # Anchor cell holds the English country name.
                if cell.findChildren('a'):
                    for a in cell.findChildren('a'):
                        if a.string:
                            country.english_name  =  a.string
                            country.save()
                
                # Bold cells hold either a dialing code ('+...') or a
                # 2-/3-letter shortcut; dialing codes are skipped.
                if cell.findChildren('b'):
                    b = cell.findChildren('b')
                    if not b[0].string.startswith('+'):
                        value = b[0].string
                        if len(value) == 2:
                            country.shortcut_two = value
                            country.save()
                            print 'shortname '+ value
                        if len(value) ==3:
                            country.shortcut_three = value
                            country.save()
                            print 'short ' + value
                country.save()               
            print '---------------'
Example #11
0
def parseList(service, listseq, offset, limit):
    """Fetch one page of the Gom index list.

    Returns a list of dicts with 'title', 'url' and 'thumbnail' keys,
    one per anchor on the page.
    """
    query = "/ajaxInclude.gom?lib=gomclass&src=%2FindexList.gom&offset={0:d}&limit={1:d}&listseq={2:s}&service={3:s}".format(offset, limit, listseq, service)
    request = urllib2.Request(root_url + query)
    request.add_header('User-Agent', BrowserAgent)
    markup = urllib2.urlopen(request).read()
    parsed = BeautifulSoup(markup, fromEncoding='UTF-8')
    return [
        {'title': anchor.span.string,
         'url': root_url + anchor['href'],
         'thumbnail': anchor.img['src']}
        for anchor in parsed.findChildren('a')
    ]
Example #12
0
def scrapefctalt(url,day,ind):
    """Scrape hourly weighted-average prices from *url* for *day* and store
    them in the local sqlite database 'weightedAvg.db' (table WAvg).

    Args:
        url: page containing the price table.
        day: date label stored with every inserted row.
        ind: starting primary-key index for the inserted rows.
    """
    try:
        # in case of lack of data and '-' appears
        errorflag = False   # will insert the index element to have a full data set
        
        # use sqlite3 to create a database
        datab = sqlite3.connect('weightedAvg.db')
        cursor = datab.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS WAvg(id INT, spot_date TEXT, hour TEXT, weighted_avg REAL);")

        # start hour and index for sqlite
        hour = 0
        index = ind
        
        # prepare BeautifulSoup to scrape the webpage
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)

        # extract the table and the rows of the table
        tables = soup.findChildren('table')
        rows = tables[0].findChildren(['th','tr'])

        # go through every row and every cell to extract the weighted average
        for row in rows:
            cells = row.findChildren('td')
            counter = 0
            for cell in cells:
                value = cell.string
                counter += 1
                # e.g. "07 - 08" for hour 7
                hourgap = string.zfill(hour,2) + " - " + string.zfill(hour+1,2)
                if counter == 6:     #weightedaverage for 1st day
                    try:
                        # A non-ASCII placeholder (e.g. a dash) marks missing
                        # data; in that case fall back to column 7 below.
                        unicode(value).encode('ascii')
                    except UnicodeEncodeError:
                        errorflag = True
                    if errorflag == False:
                        cursor.execute("INSERT INTO WAvg VALUES(?,?,?,?)", (index, day, hourgap, float(unicode(value))))
                        index += 1
                        hour += 1
                elif counter == 7 and errorflag == True:
                    cursor.execute("INSERT INTO WAvg VALUES(?,?,?,?)", (index, day, hourgap, float(unicode(value))))
                    errorflag = False
                    index += 1
                    hour += 1
                elif value is None:
                    break
        datab.commit()          # commit the changes done to the SQL database
    
    except sqlite3.Error, e:    # in case of error loading the SQL database

        print "Error %s:" % e.args[0]
        sys.exit(1)
def getRollNo(rollNo):
    """Look up an IITK student by roll number on the OA directory and print
    their details (photo URL, name, program, contacts, addresses).

    NOTE(review): parsing relies on the fixed ordering of <p> elements and
    on character offsets in the hidden address block -- fragile by design.
    """
    roll = str(rollNo)
    req = requests.get('http://oa.cc.iitk.ac.in:8181/Oa/Jsp/OAServices/IITk_SrchRes.jsp?typ=stud&numtxt=' + roll + '&sbm=Y')
    soup = BeautifulSoup(req.text)

    image = 'http://oa.cc.iitk.ac.in:8181/Oa/Jsp/Photo/' + roll + '_0.jpg'
    print 'Image :: ' + image

    # Each <p> holds one "Label : value" line, in a fixed order.
    data = soup.findChildren('p')
    name = data[0].text.split(':')[1]
    print 'Name :: ' + name

    program = data[1].text.split(':')[1]
    print 'Program :: ' + program

    dept = data[2].text.split(':')[1]
    print 'Department :: ' + dept

    room = data[3].text.split(':')[1]
    print 'Room :: ' + room

    email = data[4].text.split(':')[1]
    print 'E-mail :: ' + email

    # Blood group and category share the sixth paragraph.
    bloodData = data[5].text.split('<b>')[0]
    blood = bloodData.split(':')[1]
    print 'Blood :: ' + blood

    categoryData = data[5].text.split('<b>')[1]
    category = re.findall(u'(?<=>).+?(?=<)', categoryData)[0]
    print 'Category :: ' + category

    genderData = data[6].text.split(':')
    gender = genderData[1][0]
    print 'Gender :: ' + gender

    country = genderData[2]
    print 'Country :: ' + country

    # The address block is hidden inside an HTML comment on the page.
    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
    addressSoup = BeautifulSoup(comments[1])
    permanentAddressData = addressSoup.findAll('p')[1].text
    phonePos = permanentAddressData.index('Phone no:')
    mobilePos = permanentAddressData.index('Mobile no:')

    address  = permanentAddressData[19:phonePos]
    print 'Permanent Address :: ' + address

    phone = permanentAddressData[(phonePos + 9):mobilePos]
    print 'Phone Number :: ' + phone

    mobile = permanentAddressData[(mobilePos + 9):]
    print 'Mobile Number :: ' + mobile
Example #14
0
    def _najdi_naslovi(self):
        """Collect news items (url, text, date, description) from the FEIT
        site's <h3> headline anchors into self.vesti_site."""
        soup = BeautifulSoup(self.stranata)
        naslovi = soup.findChildren('h3')

        for naslov in naslovi:
            a = naslov.findChild('a')
            url = u'http://www.feit.ukim.edu.mk' + a['href']
            tekst = a.string
            # The description lives two table levels up, in the next row's <td>.
            opisot = naslov.findParent().findParent().findNextSibling().td.getText()
            datum = self._najdi_datum_na_novost(url)
    
            self.vesti_site.append({'url':url,'tekst':tekst,'datum':datum, 'description':opisot})
Example #15
0
def rfacts():
    """Scrape the 'Very Interesting Facts' hubpages article and return the
    facts as a list of ASCII-transliterated strings."""
    page = requests.get("http://hubpages.com/education/Very-Interesting-Facts").content
    soup = BeautifulSoup(page)
    container = soup.findChildren('div', {'id': 'txtd_3703576'})[0]
    first_par = container.findChildren("p")[0]
    # Facts are separated by triple <br> runs inside the first paragraph.
    chunks = first_par.__str__().split("<br /><br /><br />")
    return [unidecode.unidecode(BeautifulSoup(chunk).text) for chunk in chunks]
Example #16
0
    def _najdi_vesti(self):
        """Parse the time.mk page in self.stranata: collect each story's
        title/url/published time into self.elementi, then attach the
        article-snippet descriptions in matching order."""
        soup = BeautifulSoup(self.stranata)
        vesti=soup.findChildren('div',{'class':'span-15 last'})
    
        for vest in vesti:
            alink = vest.findChild('a')
            izvor = vest.findChild('span',{'class':'source'}).findChild('strong')
            koga = vest.findChild('span',{'class':'when'})
            # Relative age, e.g. u'3 час...' (hours) or u'15 мин...' (minutes).
            chasa = re.compile(u'([0-9]+) (час|мин).*')
            if koga:
                najdov = chasa.search(koga.string)
            else:
                najdov = False

            if najdov:
                # Convert the relative age into an absolute timestamp.
                koga = najdov.groups()[0]
                edinica = najdov.groups()[1]
                if edinica==u'час':
                    objaveno=datetime.datetime.now()-datetime.timedelta(hours=int(koga))
                else:
                    objaveno=datetime.datetime.now()-datetime.timedelta(minutes=int(koga))
            else:
                # No age found: fall back to "now".
                objaveno=datetime.datetime.now()
    
            self.elementi.append({'title':u"%s: %s" % (izvor.string,alink.string),
                                'url':alink['href'],
                                'published':objaveno})

        # Snippets appear in the same order as the stories collected above.
        sodrzini = soup.findChildren('div',{'class':'span-12 last article-snippet'})
        
        i=0
        for sodrzina in sodrzini:
            self.elementi[i]['description']=cgi.escape(sodrzina.prettify())
            # Make relative story links absolute.
            self.elementi[i]['description']=self.elementi[i]['description'].replace('href="index.psp?slot=','href="http://www.time.mk/index.psp?slot=')
            self.elementi[i]['description']=self.elementi[i]['description'].decode('utf-8')
            i+=1
def links():
    """Resolve the selected links (JSON in 'elem_id') into playable video
    URLs and register each as a video directory entry."""
    import urlresolver

    links_info = json.loads(h.extract_var(args, 'elem_id'))
    for link in links_info:
        url = link['url']
        print url
        resp = None
        try:
            resp = h.make_request(url, cookie_file, cookie_jar)
            soup = BeautifulSoup(resp)
            # A single-child document is treated as a meta-refresh redirect page.
            if len(soup.findChildren()) == 1:
                meta = soup.find('meta', attrs={'http-equiv': 'refresh'})
                if meta:
                    c = dict(meta.attrs)['content']
                    idx4 = c.find('URL=')
                    if idx4 != -1:
                        # Follow the redirect target and pull the iframe src.
                        _url = c[idx4 + 4:]
                        soup = BeautifulSoup(h.make_request(_url, cookie_file, cookie_jar))
                        div = soup.find('div', {'id': 'content'})
                        url = dict(div.find('table').find('iframe').attrs)['src']
            else:
                # Regular article page: the second child div holds the iframe.
                div = h.bs_find_with_class(soup, 'div', 'entry-content')
                if div:
                    divs = div.findAll('div', recursive=False)
                    url = dict(divs[1].find('iframe').attrs)['src']
        except urllib2.HTTPError as e:
            # Hack. Avast blocks first url. Only for WatchVideo currently
            if e.code == 403 and e.msg == 'Malicious content':
                up = urlparse.urlparse(url)
                id = urlparse.parse_qs(up.query)['id'][0]
                f = 0
                if up.path == '/idowatch.php':
                    url = 'http://vidfile.xyz/embed-%s-1280x720.html' % id
                    f = 1
                elif up.path == '/watchvideo.php':
                    url = 'http://watchvideo2.us/embed-%s-1280x720.html' % id
                    f = 1
                elif up.path == '/playu.php':
                    url = 'http://playu.me/embed-%s-1280x720.html' % id
                    f = 1

                if f:
                    resp = h.make_request(url, cookie_file, cookie_jar)
        if resp:
            video_url = urlresolver.resolve(url)
            if video_url:
                h.add_dir_video(addon_handle, link['name'], video_url, '', '')
Example #18
0
    def handle(self, *args, **options):
        response = urllib2.urlopen('http://www.lab.lmnixon.org/4th/worldcapitals.html')
        soup = BeautifulSoup(response)
        tables = soup.findChildren('table')
        my_table = tables[0]        
        rows = my_table.findChildren('tr')

        for row in rows:
            cells = row.findChildren('td') 
            #print cells
            country = cells[0].string
            #print country
            # s und w negativ
            try:
                c = Country.objects.get(english_name=country)
                capitel = cells[1].string
                c.capital = capitel
                c.save()
                lat = cells[2].string
                if lat:
                    if lat.endswith('S'):
                        lat = lat[:-1]
                        c.lat = float('-' + lat)
                        c.save()
                    else:
                        lat = lat[:-1]
                        c.lat = float(lat)
                        c.save()

                try: 
                    lon = cells[3].string                    
                    if lon:
                        if lon.endswith('W'):                            
                            lon = lon[:-1]
                            c.lon = float('-' + lon)
                            c.save()
                        else:
                            lon = lon[:-1]
                            c.lon = lon
                            c.save()
                except:
                    pass
                c.save()
                
            except Exception,e:
                #print country
                pass
Example #19
0
 def getOneRow(self):
     """Collect every other cell (counting from the last) of each row of the
     first table into a flat list of utf-8 strings ('empty' for blanks)."""
     soup = BeautifulSoup(self.html )
     tables = soup.findChildren('table')
     my_table = tables[0]
     rows = my_table.findChildren(['th', 'tr'])
     res = list()
     for row in rows:
         cells = row.findChildren('td')
         # Every other cell, starting from the last one.
         listM = cells[::-2] 
         for cell in listM:
             temp = cell.string
             if temp:
                 # Strip internal whitespace before encoding.
                 res.append(temp.replace(' ','').replace('\n','').strip().encode('utf-8'))
             else:
                 res.append('empty')
                 # print 'empty'
     return res     
Example #20
0
 def getOneRow(self):
     """Collect every other cell (counting from the last) of each row of the
     first table into a flat list of utf-8 strings ('empty' for blanks)."""
     soup = BeautifulSoup(self.html)
     first_table = soup.findChildren("table")[0]
     collected = []
     for table_row in first_table.findChildren(["th", "tr"]):
         # Every other cell, starting from the last one.
         picked_cells = table_row.findChildren("td")[::-2]
         for cell in picked_cells:
             text = cell.string
             collected.append(
                 text.replace(" ", "").replace("\n", "").strip().encode("utf-8")
                 if text else "empty")
     return collected
Example #21
0
class NipperCommercialHTMLReport(dict):
    """Parsed Nipper commercial HTML report.

    Maps severity level -> {issue name: [NipperReportedIssue, ...]}.
    """
    def __init__(self,path):
        """Parse the report at *path*.

        Raises:
            ReportParserError: missing file, missing table of contents,
                unparseable title, or an unknown severity level.
        """
        self.path = path
        self.update(dict((k,{}) for k in SEVERITY_MAP.keys()))

        if not os.path.isfile(self.path):
            raise ReportParserError('No such file: %s' % self.path)
        self.parser = BeautifulSoup(markup=open(self.path,'r').read())

        contents = self.parser.find('div',{'id':'contents'})
        if contents is None:
            raise ReportParserError('No table of contents found')

        # Device type and name are embedded in the <title> text.
        self.device = None
        self.name = None
        t = self.parser.find('title').text
        for re_match in DEVICE_TITLES:
            m = re_match.match(t)  
            if m:
                self.device = m.group(1)
                self.name = m.group(2)
                break

        if self.device is None or self.name is None:
            raise ReportParserError('Could not parse device type and name')

        # Every remaining div with an id is one reported issue.
        for d in self.parser.findChildren('div'):
            d_id = d.get('id')
            if d_id is None or d_id in SKIP_DIVS:
                continue
            r = NipperReportedIssue(self,d)
            try:
                severity = filter(lambda k: 
                    r.severity in SEVERITY_MAP[k],
                    SEVERITY_MAP.keys()
                )[0]
            except IndexError:
                # BUG FIX: the exception was instantiated but never raised,
                # so an unknown severity fell through and later died with
                # an UnboundLocalError on 'severity'.
                raise ReportParserError('Unknown severity level: %s' % r.severity)
            if not self[severity].has_key(r.issue):
                self[severity][r.issue] = []
            self[severity][r.issue].append(r) 

    def __repr__(self):
        return self.path
Example #22
0
class NipperCommercialHTMLReport(dict):
    """Parsed Nipper commercial HTML report.

    Maps severity level -> {issue name: [NipperReportedIssue, ...]}.
    """
    def __init__(self, path):
        """Parse the report at *path*.

        Raises:
            ReportParserError: missing file, missing table of contents,
                unparseable title, or an unknown severity level.
        """
        self.path = path
        self.update(dict((k, {}) for k in SEVERITY_MAP.keys()))

        if not os.path.isfile(self.path):
            raise ReportParserError('No such file: %s' % self.path)
        self.parser = BeautifulSoup(markup=open(self.path, 'r').read())

        contents = self.parser.find('div', {'id': 'contents'})
        if contents is None:
            raise ReportParserError('No table of contents found')

        # Device type and name are embedded in the <title> text.
        self.device = None
        self.name = None
        t = self.parser.find('title').text
        for re_match in DEVICE_TITLES:
            m = re_match.match(t)
            if m:
                self.device = m.group(1)
                self.name = m.group(2)
                break

        if self.device is None or self.name is None:
            raise ReportParserError('Could not parse device type and name')

        # Every remaining div with an id is one reported issue.
        for d in self.parser.findChildren('div'):
            d_id = d.get('id')
            if d_id is None or d_id in SKIP_DIVS:
                continue
            r = NipperReportedIssue(self, d)
            try:
                severity = filter(lambda k: r.severity in SEVERITY_MAP[k],
                                  SEVERITY_MAP.keys())[0]
            except IndexError:
                # BUG FIX: the exception was instantiated but never raised,
                # so an unknown severity fell through and later died with
                # an UnboundLocalError on 'severity'.
                raise ReportParserError('Unknown severity level: %s' % r.severity)
            if not self[severity].has_key(r.issue):
                self[severity][r.issue] = []
            self[severity][r.issue].append(r)

    def __repr__(self):
        return self.path
Example #23
0
def getFileType(content):
    """Return [scan_date, file_type] parsed from the first table in *content*."""
    soup = BeautifulSoup(content)
    first_table = soup.findChildren('table')[0]
    rows = first_table.findChildren(['th', 'tr'])

    # The date lives in the last row (dependency: virusshare UI site design).
    date_match = re.search('[0-9]{4}-[0-9]{2}-[0-9]{2}', rows[-1].text)
    scan_date = date_match.group(0)

    # The cell after the 'File Type' label holds the value we want.
    label_seen = 0
    for table_row in rows:
        for td in table_row.findChildren('td'):
            cell_text = td.string
            if (cell_text == 'File Type'):
                label_seen = 1
                continue
            if (label_seen):
                return [scan_date, cell_text]
Example #24
0
    def get_class_description(class_number, term):
        """
        :returns: a string that is the description for CLASS_NUMBER in term TERM

        :param: class_number: String, class number
        :param: term: String, term number
        """

        url = 'http://www.courses.as.pitt.edu/detail.asp?CLASSNUM={}&TERM={}'.format(class_number, term)
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
        table = soup.findChildren('table')[0]
        rows = table.findChildren('tr')

        # The description sits in the cell right after the label cell.
        has_description = False
        for row in rows:
            cells = row.findChildren('td')
            for cell in cells:
                if has_description:
                    return cell.string.strip()
                # BUG FIX: 'is' compared object identity, which is never True
                # for a freshly built str, so the label was never matched and
                # the description was never returned.
                if len(cell.contents) > 0 and str(cell.contents[0]) == '<strong>Description</strong>':
                    has_description = True
Example #25
0
    def get_class_description(class_number, term):
        """
        :returns: a string that is the description for CLASS_NUMBER in term TERM

        :param: class_number: String, class number
        :param: term: String, term number
        """

        url = 'http://www.courses.as.pitt.edu/detail.asp?CLASSNUM={}&TERM={}'.format(class_number, term)
        markup = urllib2.urlopen(url).read()
        course_table = BeautifulSoup(markup).findChildren('table')[0]

        # The description sits in the cell immediately after the
        # '<strong>Description</strong>' label cell.
        found_label = False
        for table_row in course_table.findChildren('tr'):
            for cell in table_row.findChildren('td'):
                if found_label:
                    return cell.string.strip()
                if cell.contents and str(cell.contents[0]) == '<strong>Description</strong>':
                    found_label = True
Example #26
0
    def get_class_description(self, class_number, term):
        '''
        Returns a string that is the description for CLASS_NUMBER in term TERM

        Keyword arguments
        class_number -- String, class number
        term -- String, term number
        '''

        url= 'http://www.courses.as.pitt.edu/detail.asp?CLASSNUM=%s&TERM=%s' % (class_number, term)
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
        table = soup.findChildren('table')[0]
        rows = table.findChildren('tr')

        # The description sits in the cell right after the label cell.
        description_flag = False
        for row in rows:
            cells = row.findChildren('td')
            for cell in cells:
                # Idiom fix: test the flag directly instead of '== True'.
                if description_flag:
                    return cell.string.strip()
                if len(cell.contents) > 0 and str(cell.contents[0]) == '<strong>Description</strong>':
                    description_flag = True
Example #27
0
def stranici():
    """Letter pages on http://dns.marnet.net.mk/registar.php.

    Posts each letter to the registry page and collects the 'del=' links
    from the per-letter result pages.
    """
    bukvi = ['NUM','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']

    url = 'http://dns.marnet.net.mk/registar.php'

    linkovi = []
    for bukva in bukvi:
        logging.info(u"Proveruvam Bukva %s" % bukva)
        payload = urllib.urlencode({'bukva': bukva})
        page = urllib2.urlopen(urllib2.Request(url, payload)).read()

        # Keep only the deletion links ('del=' in href).
        for anchor in BeautifulSoup(page).findChildren('a', {'class': 'do'}):
            if anchor['href'].find('del=') != -1:
                linkovi.append(anchor['href'])

    return linkovi
import urllib2
from BeautifulSoup import BeautifulSoup

# Scrape the IMDB advanced-search results table (2010-2015 feature films).
opener = urllib2.build_opener()
opener.addheaders=[('User-agent','Mozilla/5.0')]

url = "http://www.imdb.com/search/title?release_date=2010,2015&title_type=feature"
ourUrl = opener.open(url).read()
soup = BeautifulSoup(ourUrl)

# Fetching the value present within tag table with class=results
movie = soup.findChildren('table','results')

# Changing the movie into an iterator
iterMovie = iter(movie[0].findChildren('tr'))

# Finding tr in itermovie. Every tr tag contains information of a movie.
for tr in iterMovie:
    # BUG FIX: the loop body held only a comment, which is a SyntaxError in
    # Python. 'pass' keeps the file importable until per-movie handling
    # is implemented.
    pass
Example #29
0
        url = "http://packages.qa.debian.org/%s/%s.html" % (prefix, package)
        f = urlopen(url)
    except HTTPError, e:
        if e.code == 404:
            msg = "%s: package %s does not exist" % (ievent.nick, package)
        else:
            msg = "%s: %s" % (ievent.nick, e.msg)
        bot.msg(choose_dest(ievent), msg.encode("utf-8"))
        return

    msg = "%s: maintainer for %s is " % (ievent.nick, package)
    soup = BeautifulSoup(f)
    maint = soup.find('span', {'title':'maintainer'})
    email = maint.parent['href'].split('login='******'span', {'title':'uploader'})
    if uploaders:
        msg += ", uploaders: "
        tmp = []
        for u in uploaders:
            tmp.append("%s <%s>" % (u.string, u.parent['href'].split('login='******', '.join(tmp)
    bot.msg(choose_dest(ievent), msg.encode("utf-8"))

@expose("madison", 1)
def madison(bot, ievent):
    """madison <package> [package, package, ...]"""
    package = str(ievent.args[0])
    if len(ievent.args) > 1:
        suites = ",".join(ievent.args[1:])
    else:
def process_file_updatedb(fname):
	with open(fname,"r") as foo_file:
		soup = BeautifulSoup(foo_file)  #soup is the full HTML
	
	
	#-----------------------------------------
	
	valueList = []  # this is a list of lists 
	blank=0 #flag of blank resultsheet 
	
	#--------------------------------------------
	tables = soup.findChildren('table')
	my_table=tables[11]  # we are targetting the 11th table in soup 
	
	#-------------------------------------
	# This is a special code to avoid some few results which has only two subjects 
	# logically now, we sHOULD not have blank result file as per current logic
	
	bolds= my_table.findAll('b')
	boldlen=len(bolds)
	if (boldlen>1):
		nameusn= bolds[0].string  #name and USN
		print "nameusn%s"%nameusn
	else:
		result = "resultsheet blank"
		blank=1	
	if (boldlen>2):
		semesternum= bolds[1].string + bolds[2].string
		print "semesternum%s"%semesternum
		if int(semesternum[-1]) != 6:  # THIS CHECK HAS TO BE MODIFIED for each semester 
			# There are stray cases where the student is only writing 1st semester backlog and the result sheet has only those results..program will break 
			blank=1
	if (boldlen>4):	
		result = bolds[3].string.replace('&nbsp;',' ')
	
	
	#print"Debug:result of this candidate after accessing bold elements...%s"%(result)  # will give an error unless assigned
	#---collect the subject pass/fail also -only when those fields are there------------
	#---unless you put these checks, it will bomb for few which has a different format -----------------
	
	if (boldlen>5):
		sub1res = bolds[4].string.replace('&nbsp;',' ')
		print sub1res
	if (boldlen>6):
		sub2res = bolds[5].string.replace('&nbsp;',' ')
		print sub2res
	if (boldlen>7):
		sub3res = bolds[6].string.replace('&nbsp;',' ')
		print sub3res
	if (boldlen>8):
		sub4res = bolds[7].string.replace('&nbsp;',' ')
		print sub4res
	if (boldlen>9):
		sub5res = bolds[8].string.replace('&nbsp;',' ')
		print sub5res
	if (boldlen>10):
		sub6res = bolds[9].string.replace('&nbsp;',' ')
		print sub6res
	if (boldlen>11):
		sub7res = bolds[10].string.replace('&nbsp;',' ')
		print sub7res
	#since B tech Semester 2 has 8 subjects 
	if (boldlen>12):
		sub8res = bolds[11].string.replace('&nbsp;',' ')
		print sub8res		
	#print "DBG: Here I am extracting each row of data for another student ..."
	valueList =[]	


	for row in soup.find("td", {"width" : "513"}).findAll('tr'):
		tds = row('td')  # here I am printing all the cells in a row (list of cells) and I need to extract them 
		 
		listInternal=[]
		#print tds[0].text.replace('&nbsp;',' ')
		listInternal.append(tds[0].text.replace('&nbsp;',' '))
		 
		#print tds[1].text.replace('&nbsp;',' ')
		listInternal.append(tds[1].text.replace('&nbsp;',' '))
		 
		#print tds[2].text.replace('&nbsp;',' ')
		listInternal.append(tds[2].text.replace('&nbsp;',' '))
		
		#print tds[3].text.replace('&nbsp;',' ')
		listInternal.append(tds[3].text.replace('&nbsp;',' '))
		valueList.append(listInternal)
	print "..........valueList before allocating sub1 etc..........................."	
	
		
	sizeOfVlist=len(valueList)  # this is a list of lists
	print "size of valueList %s" %(sizeOfVlist)
	if sizeOfVlist ==0 :
		blank=1
		print "blank result with size of valueList 0"
	
	if (sizeOfVlist>9):  #Otherwise this code will fail for candidates where blank result is there
		print valueList[2][0]
		print valueList[3][0]
		print valueList[4][0]
		print valueList[5][0]
		print valueList[6][0]
		print valueList[7][0]
		print valueList[8][0]
		print valueList[9][0]
		
	
	#Since the subject list itself varies from college to college, I have to
	# define the subject dynamically 
	# the code sucks .. once it works, we have to improve this !!
	
	found=0  # no of subjects found 
	
	if (blank==0):
		
		if (sizeOfVlist >2):

			if '10AL61' in valueList[2][0]:
				sub1="10AL61"
				found +=1
			elif '10CS62' in valueList[2][0]:
				sub1="10CS62"
				found +=1
			elif '10CS63' in valueList[2][0]:
				sub1="10CS63"
				found +=1
			elif '10CS64' in valueList[2][0]:
				sub1="10CS64"
				found +=1
			elif '10CS65' in valueList[2][0]:
				sub1="10CS65"
				found +=1
			elif '10CSL67' in valueList[2][0]:
				sub1="10CSL67"
				found +=1
			elif '10CSL68' in valueList[2][0]:
				sub1="10CSL68"
				found +=1
			elif '10CS661' in valueList[2][0]:
				sub1="10CS661"
				found +=1
			elif '10CS662' in valueList[2][0]:
				sub1="10CS662"
				found +=1
			elif '10CS663' in valueList[2][0]:
				sub1="10CS663"
				found +=1
			elif '10CS664' in valueList[2][0]:
				sub1="10CS664"
				found +=1
			elif '10CS665' in valueList[2][0]:
				sub1="10CS665"
				found +=1
			elif '10CS666' in valueList[2][0]:
				sub1="10CS666"
				found +=1
			
		
#for the 2nd subject 
		if (sizeOfVlist >3):
			
			if '10AL61' in valueList[3][0]:
				sub2="10AL61"
				found +=1
			elif '10CS62' in valueList[3][0]:
				sub2="10CS62"
				found +=1
			elif '10CS63' in valueList[3][0]:
				sub2="10CS63"
				found +=1
			elif '10CS64' in valueList[3][0]:
				sub2="10CS64"
				found +=1
			elif '10CS65' in valueList[3][0]:
				sub2="10CS65"
				found +=1
			elif '10CSL67' in valueList[3][0]:
				sub2="10CSL67"
				found +=1
			elif '10CSL68' in valueList[3][0]:
				sub2="10CSL68"
				found +=1
			elif '10CS661' in valueList[3][0]:
				sub2="10CS661"
				found +=1
			elif '10CS662' in valueList[3][0]:
				sub2="10CS662"
				found +=1
			elif '10CS663' in valueList[3][0]:
				sub2="10CS663"
				found +=1
			elif '10CS664' in valueList[3][0]:
				sub2="10CS664"
				found +=1
			elif '10CS665' in valueList[3][0]:
				sub2="10CS665"
				found +=1
			elif '10CS666' in valueList[3][0]:
				sub2="10CS666"
				found +=1

		
#for the 3rd subject 
		if (sizeOfVlist >4):
			
			if '10AL61' in valueList[4][0]:
				sub3="10AL61"
				found +=1
			elif '10CS62' in valueList[4][0]:
				sub3="10CS62"
				found +=1
			elif '10CS63' in valueList[4][0]:
				sub3="10CS63"
				found +=1
			elif '10CS64' in valueList[4][0]:
				sub3="10CS64"
				found +=1
			elif '10CS65' in valueList[4][0]:
				sub3="10CS65"
				found +=1
			elif '10CSL67' in valueList[4][0]:
				sub3="10CSL67"
				found +=1
			elif '10CSL68' in valueList[4][0]:
				sub3="10CSL68"
				found +=1
			elif '10CS661' in valueList[4][0]:
				sub3="10CS661"
				found +=1
			elif '10CS662' in valueList[4][0]:
				sub3="10CS662"
				found +=1
			elif '10CS663' in valueList[4][0]:
				sub3="10CS663"
				found +=1
			elif '10CS664' in valueList[4][0]:
				sub3="10CS664"
				found +=1
			elif '10CS665' in valueList[4][0]:
				sub3="10CS665"
				found +=1
			elif '10CS666' in valueList[4][0]:
				sub3="10CS666"
				found +=1

		
#for the 4th subject 
		if (sizeOfVlist>5):
			
			
			if '10AL61' in valueList[5][0]:
				sub4="10AL61"
				found +=1
			elif '10CS62' in valueList[5][0]:
				sub4="10CS62"
				found +=1
			elif '10CS63' in valueList[5][0]:
				sub4="10CS63"
				found +=1
			elif '10CS64' in valueList[5][0]:
				sub4="10CS64"
				found +=1
			elif '10CS65' in valueList[5][0]:
				sub4="10CS65"
				found +=1
			elif '10CSL67' in valueList[5][0]:
				sub4="10CSL67"
				found +=1
			elif '10CSL68' in valueList[5][0]:
				sub4="10CSL68"
				found +=1
			elif '10CS661' in valueList[5][0]:
				sub4="10CS661"
				found +=1
			elif '10CS662' in valueList[5][0]:
				sub4="10CS662"
				found +=1
			elif '10CS663' in valueList[5][0]:
				sub4="10CS663"
				found +=1
			elif '10CS664' in valueList[5][0]:
				sub4="10CS664"
				found +=1
			elif '10CS665' in valueList[5][0]:
				sub4="10CS665"
				found +=1
			elif '10CS666' in valueList[5][0]:
				sub4="10CS666"
				found +=1

		
#for the 5th subject 
		if (sizeOfVlist>6):
				
			if '10AL61' in valueList[6][0]:
				sub5="10AL61"
				found +=1
			elif '10CS62' in valueList[6][0]:
				sub5="10CS62"
				found +=1
			elif '10CS63' in valueList[6][0]:
				sub5="10CS63"
				found +=1
			elif '10CS64' in valueList[6][0]:
				sub5="10CS64"
				found +=1
			elif '10CS65' in valueList[6][0]:
				sub5="10CS65"
				found +=1
			elif '10CSL67' in valueList[6][0]:
				sub5="10CSL67"
				found +=1
			elif '10CSL68' in valueList[6][0]:
				sub5="10CSL68"
				found +=1
			elif '10CS661' in valueList[6][0]:
				sub5="10CS661"
				found +=1
			elif '10CS662' in valueList[6][0]:
				sub5="10CS662"
				found +=1
			elif '10CS663' in valueList[6][0]:
				sub5="10CS663"
				found +=1
			elif '10CS664' in valueList[6][0]:
				sub5="10CS664"
				found +=1
			elif '10CS665' in valueList[6][0]:
				sub5="10CS665"
				found +=1
			elif '10CS666' in valueList[6][0]:
				sub5="10CS666"
				found +=1

		
#for the 6th subject 
		if (sizeOfVlist > 7):
				
			if '10AL61' in valueList[7][0]:
				sub6="10AL61"
				found +=1
			elif '10CS62' in valueList[7][0]:
				sub6="10CS62"
				found +=1
			elif '10CS63' in valueList[7][0]:
				sub6="10CS63"
				found +=1
			elif '10CS64' in valueList[7][0]:
				sub6="10CS64"
				found +=1
			elif '10CS65' in valueList[7][0]:
				sub6="10CS65"
				found +=1
			elif '10CSL67' in valueList[7][0]:
				sub6="10CSL67"
				found +=1
			elif '10CSL68' in valueList[7][0]:
				sub6="10CSL68"
				found +=1
			elif '10CS661' in valueList[7][0]:
				sub6="10CS661"
				found +=1
			elif '10CS662' in valueList[7][0]:
				sub6="10CS662"
				found +=1
			elif '10CS663' in valueList[7][0]:
				sub6="10CS663"
				found +=1
			elif '10CS664' in valueList[7][0]:
				sub6="10CS664"
				found +=1
			elif '10CS665' in valueList[7][0]:
				sub6="10CS665"
				found +=1
			elif '10CS666' in valueList[7][0]:
				sub6="10CS666"
				found +=1

		
#for the 7th subject 
		if (sizeOfVlist >8):
				
			if '10AL61' in valueList[8][0]:
				sub7="10AL61"
				found +=1
			elif '10CS62' in valueList[8][0]:
				sub7="10CS62"
				found +=1
			elif '10CS63' in valueList[8][0]:
				sub7="10CS63"
				found +=1
			elif '10CS64' in valueList[8][0]:
				sub7="10CS64"
				found +=1
			elif '10CS65' in valueList[8][0]:
				sub7="10CS65"
				found +=1
			elif '10CSL67' in valueList[8][0]:
				sub7="10CSL67"
				found +=1
			elif '10CSL68' in valueList[8][0]:
				sub7="10CSL68"
				found +=1
			elif '10CS661' in valueList[8][0]:
				sub7="10CS661"
				found +=1
			elif '10CS662' in valueList[8][0]:
				sub7="10CS662"
				found +=1
			elif '10CS663' in valueList[8][0]:
				sub7="10CS663"
				found +=1
			elif '10CS664' in valueList[8][0]:
				sub7="10CS664"
				found +=1
			elif '10CS665' in valueList[8][0]:
				sub7="10CS665"
				found +=1
			elif '10CS666' in valueList[8][0]:
				sub7="10CS666"
				found +=1
				

#for the 8th subject 
		if (sizeOfVlist >9):
				
			if '10AL61' in valueList[9][0]:
				sub8="10AL61"
				found +=1
			elif '10CS62' in valueList[9][0]:
				sub8="10CS62"
				found +=1
			elif '10CS63' in valueList[9][0]:
				sub8="10CS63"
				found +=1
			elif '10CS64' in valueList[9][0]:
				sub8="10CS64"
				found +=1
			elif '10CS65' in valueList[9][0]:
				sub8="10CS65"
				found +=1
			elif '10CSL67' in valueList[9][0]:
				sub8="10CSL67"
				found +=1
			elif '10CSL68' in valueList[9][0]:
				sub8="10CSL68"
				found +=1
			elif '10CS661' in valueList[9][0]:
				sub8="10CS661"
				found +=1
			elif '10CS662' in valueList[9][0]:
				sub8="10CS662"
				found +=1
			elif '10CS663' in valueList[9][0]:
				sub8="10CS663"
				found +=1
			elif '10CS664' in valueList[9][0]:
				sub8="10CS664"
				found +=1
			elif '10CS665' in valueList[9][0]:
				sub8="10CS665"
				found +=1
			elif '10CS666' in valueList[9][0]:
				sub8="10CS666"
				found +=1
		
	#check_value(valueList)
		
	#================================================================
	
	# create a document and insert
	# we have to write code to get the college number 
	if (blank ==0):
		
		collegeCode=fname[0:3]
		
		print "college code: %s"%(collegeCode)
		#print college.items()
		
		collegeName=college[collegeCode]
		
		# uncomment below lines to test for one file  
		#collegeCode="1RG"
		#collegeName="Rajiv Gandhi Institute of Technology"
		
		
		
		sizeofVlist=len(valueList)
		print "size of valueList %s after allocating sub1,sub2 etc." %(sizeofVlist)
		
		print "blank flag signifying blank record or WRONG SEMESTER  = %s"%blank
		
		print "No of subjects found in the result for this semester : %s"%found
	
	if blank ==0 and sizeofVlist>9 :  # neglecting few cases where all 8 subjects are NOT printed ( it should never happen that way)
		
		if (found==8):
	
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
				"student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
				sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
				sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res},
				sub3:{"sub":valueList[4][0],"ext":int(valueList[4][1]),"int":int(valueList[4][2]),"tot":int(valueList[4][3]),"passfail":sub3res},
				sub4:{"sub":valueList[5][0],"ext":int(valueList[5][1]),"int":int(valueList[5][2]),"tot":int(valueList[5][3]),"passfail":sub4res},
				sub5:{"sub":valueList[6][0],"ext":int(valueList[6][1]),"int":int(valueList[6][2]),"tot":int(valueList[6][3]),"passfail":sub5res},
				sub6:{"sub":valueList[7][0],"ext":int(valueList[7][1]),"int":int(valueList[7][2]),"tot":int(valueList[7][3]),"passfail":sub6res},
				sub7:{"sub":valueList[8][0],"ext":int(valueList[8][1]),"int":int(valueList[8][2]),"tot":int(valueList[8][3]),"passfail":sub7res},
				sub8:{"sub":valueList[9][0],"ext":int(valueList[9][1]),"int":int(valueList[9][2]),"tot":int(valueList[9][3]),"passfail":sub8res}
				}
		elif (found ==7):
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
				"student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
				sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
				sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res},
				sub3:{"sub":valueList[4][0],"ext":int(valueList[4][1]),"int":int(valueList[4][2]),"tot":int(valueList[4][3]),"passfail":sub3res},
				sub4:{"sub":valueList[5][0],"ext":int(valueList[5][1]),"int":int(valueList[5][2]),"tot":int(valueList[5][3]),"passfail":sub4res},
				sub5:{"sub":valueList[6][0],"ext":int(valueList[6][1]),"int":int(valueList[6][2]),"tot":int(valueList[6][3]),"passfail":sub5res},
				sub6:{"sub":valueList[7][0],"ext":int(valueList[7][1]),"int":int(valueList[7][2]),"tot":int(valueList[7][3]),"passfail":sub6res},
				sub7:{"sub":valueList[8][0],"ext":int(valueList[8][1]),"int":int(valueList[8][2]),"tot":int(valueList[8][3]),"passfail":sub7res}
				}
		elif (found ==6):
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
			        "student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
			        sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
			        sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res},
			        sub3:{"sub":valueList[4][0],"ext":int(valueList[4][1]),"int":int(valueList[4][2]),"tot":int(valueList[4][3]),"passfail":sub3res},
			        sub4:{"sub":valueList[5][0],"ext":int(valueList[5][1]),"int":int(valueList[5][2]),"tot":int(valueList[5][3]),"passfail":sub4res},
			        sub5:{"sub":valueList[6][0],"ext":int(valueList[6][1]),"int":int(valueList[6][2]),"tot":int(valueList[6][3]),"passfail":sub5res},
			        sub6:{"sub":valueList[7][0],"ext":int(valueList[7][1]),"int":int(valueList[7][2]),"tot":int(valueList[7][3]),"passfail":sub6res}
			        }
		elif (found ==5):
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
			"student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
			sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
			sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res},
			sub3:{"sub":valueList[4][0],"ext":int(valueList[4][1]),"int":int(valueList[4][2]),"tot":int(valueList[4][3]),"passfail":sub3res},
			sub4:{"sub":valueList[5][0],"ext":int(valueList[5][1]),"int":int(valueList[5][2]),"tot":int(valueList[5][3]),"passfail":sub4res},
			sub5:{"sub":valueList[6][0],"ext":int(valueList[6][1]),"int":int(valueList[6][2]),"tot":int(valueList[6][3]),"passfail":sub5res}
			}			
		elif (found==4):
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
			"student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
			sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
			sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res},
			sub3:{"sub":valueList[4][0],"ext":int(valueList[4][1]),"int":int(valueList[4][2]),"tot":int(valueList[4][3]),"passfail":sub3res},
			sub4:{"sub":valueList[5][0],"ext":int(valueList[5][1]),"int":int(valueList[5][2]),"tot":int(valueList[5][3]),"passfail":sub4res}
		        }			
		elif (found==3):
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
		        "student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
		        sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
		        sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res},
		        sub3:{"sub":valueList[4][0],"ext":int(valueList[4][1]),"int":int(valueList[4][2]),"tot":int(valueList[4][3]),"passfail":sub3res}
		        }
		elif (found==2):
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
		        "student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
		        sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
		        sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res}
		         
		        }
		elif (found==1):
			post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
		        "student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
		        sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res}
		        
		        }			
	
		print "Inserting a document"
		collection.insert(post)
	
	#print "DBG: printing all records in the collection"
	#print "-------------------------------------------"
	#results=collection.find()
	#for record in results:
	#	print record
	client.close()	
Example #31
0
def parseWiki(wikiList):
	bandGenreInfo = {}
	bandAsActInfo = {}
	for key in wikiList:
		#key = 'Slipknot'
		turl = wikiList[key] ;
		print 'Looking up band ' + key
		opener = urllib2.build_opener()
		opener.addheaders = [('User-agent', 'Mozilla/5.0')]
		page = opener.open(turl)
		soup = BeautifulSoup(page.read())
		tables = soup.findChildren('table')
		if(len(tables)>0):
			table = tables[0] ;  # first table has the genre information
			if(len(tables[0]) < 10):   # some have some other crap, in that case select the next table
				table = tables[1]	
			rows = table.findChildren('tr') 
			# Usually 3rd/4th/5th row has genre information
			# hence loop through to find out
			assact = []
			genre = []
			if(len(rows) >= 5):
				for i in range(1,len(rows)):
					t = rows[i].findChildren('a');
					h =  rows[i].findChildren('th');
					if(len(t) >= 1 and  t[0].string == 'Genres'):
						genre_row  = rows[i] ;
						all_a = genre_row.findChildren('a') 
						all_a = all_a[1:] 
				
						
						for info in all_a:
							if(info.string is not None):
								genre.append(info.string.lower()) ;
					elif(len(h) >=1 and h[0].string == 'Associated acts'):
						origin_row = rows[i];
						all_a = origin_row.findChildren('a') ;
						
						for info in all_a:
							if(info.string is not None):
								assact.append(info.string.lower()) ;
						
				#print genre + '\n'
				if(len(genre)<1):
					genre = ['Unknown']
				if(len(assact)<1):
					assact = ['Unknown']
				
				bandGenreInfo[key]  = genre
				bandAsActInfo[key] = assact
				#print genre, assact
			else :# Unknown Issue
				bandGenreInfo[key] = ['Unknown']
				bandAsActInfo[key] = ['Unknown']
		else:
			bandGenreInfo[key] = ['Unknown']
			bandAsActInfo[key] = ['Unknown']
			
		#break;
	#return rows
	return bandGenreInfo, bandAsActInfo
__author__ = 'Vineet'

import urllib2
from BeautifulSoup import BeautifulSoup

url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=2005,2014'

test_url = urllib2.urlopen(url)
readHtml = test_url.read()
test_url.close()

soup = BeautifulSoup(readHtml)
# Using it track the number of Movie
count = 0
# Fetching the value present within tag results
movies = soup.findChildren('table', 'results')
# Changing the movie into an iterator
itermovie = iter(movies[0].findChildren('tr'))
# Skipping the first value of the iterator as it does have the required info
next(itermovie)

# Finding tr in itermovie. Every tr tag contains information of a movie
for tr in itermovie:

    # Fetching image Url for the movie
    imgSource = tr.findChildren(
        'td',
        'image')[0].find('img')['src'].split('._V1.')[0] + '._V1_SX214_AL_.jpg'
    # Fetching the title and year of the movie
    movie = tr.findChildren('td', 'title')
    title = movie[0].find('a').contents[0] + movie[0].find(
    parser.add_argument('-election', '--election', help='Election to show', required=False)
    args = vars(parser.parse_args())
    url = args['url']
    efilter = args['election']

    d = feedparser.parse(url)
    elections = []
    for item in d.entries:
        #print item
        #print item['title_detail']
        title = item['title_detail']['value']
        election = Election(title)
        #print item['summary']
        soup = BeautifulSoup(item['summary'])

        tables = soup.findChildren('table')
        # This will get the first (and only) table. Your page may have more.
        my_table = tables[0]

        # You can find children with multiple tags by passing a list of strings
        rows = my_table.findChildren(['th', 'tr'])
        i = 0
        for row in rows:
            i += 1
            cells = row.findChildren('td')
            if i == 1:
                election.Progress = cells[0].text.strip()
            else:
                candidate = CandidateIssue(cells[0].text.strip())
                candidate.Percentage = cells[1].text.strip()
                candidate.TotalNumber = cells[2].text.strip()
def process_file_updatedb(fname):
	with open(fname,"r") as foo_file:
		soup = BeautifulSoup(foo_file)  #soup is the full HTML
	
	
	#-----------------------------------------
	
	valueList = []  # this is a list of lists 
	blank=0 #flag of blank resultsheet 
	
	#--------------------------------------------
	tables = soup.findChildren('table')
	my_table=tables[11]  # we are targetting the 11th table in soup 
	
	#-------------------------------------
	'''
	bolds= my_table.findAll('b')    
	nameusn= bolds[0].string  #name and USN
	semesternum= bolds[1].string + bolds[2].string
	result = bolds[3].string.replace('&nbsp;',' ')
	'''
	bolds= my_table.findAll('b')
	boldlen=len(bolds)
	if (boldlen>1):
		nameusn= bolds[0].string  #name and USN
	if (boldlen>2):
		semesternum= bolds[1].string + bolds[2].string
	if (boldlen>4):	
		result = bolds[3].string.replace('&nbsp;',' ')
	else:
		result = "resultsheet blank"
		blank=1
	#print"Debug:result of this candidate after accessing bold elements...%s"%(result)  # will give an error unless assigned
	
	      
	
	#---collect the subject pass/fail also -only when those fields are there------------
	#---unless you put these checks, it will bomb for few which has a different format -----------------
	
	if (boldlen>5):
		sub1res = bolds[4].string.replace('&nbsp;',' ')
	#print sub1res
	if (boldlen>6):
		sub2res = bolds[5].string.replace('&nbsp;',' ')
	#print sub2res
	if (boldlen>7):
		sub3res = bolds[6].string.replace('&nbsp;',' ')
	#print sub3res
	if (boldlen>8):
		sub4res = bolds[7].string.replace('&nbsp;',' ')
	#print sub4res
	if (boldlen>9):
		sub5res = bolds[8].string.replace('&nbsp;',' ')
	#print sub5res
	if (boldlen>10):
		sub6res = bolds[9].string.replace('&nbsp;',' ')
	#print sub6res
	if (boldlen>11):
		sub7res = bolds[10].string.replace('&nbsp;',' ')
	#print sub7res	
	#print "DBG: Here I am extracting each row of data for another student ..."
	valueList =[]	


	for row in soup.find("td", {"width" : "513"}).findAll('tr'):
		tds = row('td')  # here I am printing all the cells in a row (list of cells) and I need to extract them 
		 
		listInternal=[]
		#print tds[0].text.replace('&nbsp;',' ')
		listInternal.append(tds[0].text.replace('&nbsp;',' '))
		 
		#print tds[1].text.replace('&nbsp;',' ')
		listInternal.append(tds[1].text.replace('&nbsp;',' '))
		 
		#print tds[2].text.replace('&nbsp;',' ')
		listInternal.append(tds[2].text.replace('&nbsp;',' '))
		
		#print tds[3].text.replace('&nbsp;',' ')
		listInternal.append(tds[3].text.replace('&nbsp;',' '))
		valueList.append(listInternal)
	print "..........valueList..........................."	
	
	sizeOfVlist=len(valueList)  # this is a list of lists
	print "size of valueList %s" %(sizeOfVlist)
	
	
	#Since the subject list itself varies from college to college, I have 
	# define the subject dynamically 
	# the code sucks .. once it works, we have to improve this !!
	
	if (blank==0):
		
		if (sizeOfVlist >2):

			if '14SCS21' in valueList[2][0]:
				sub1="14SCS21"
			elif '14SCS22' in valueList[2][0]:
				sub1="14SCS22"
			elif '14SCS23' in valueList[2][0]:
				sub1="14SCS23"		
			elif '14SCS24' in valueList[2][0]:
				sub1="14SCS24"
			elif '14SCS26' in valueList[2][0]:
				sub1="14SCS26"
			elif '14SCS27' in valueList[2][0]:
				sub1="14SCS27"
			elif '14SCS251' in valueList[2][0]:
				sub1="14SCS251"
			elif '14SCS251' in valueList[2][0]:
				sub1="14SCS252"		
			elif '14SCS252' in valueList[2][0]:
				sub1="14SCS252"
			elif '14SCS253' in valueList[2][0]:
				sub1="14SCS253"
			elif '14SCS254' in valueList[2][0]:
				sub1="14SCS254"
		
	#for the 2nd subject 
		if (sizeOfVlist >3):
			
			if '14SCS21' in valueList[3][0]:
				sub2="14SCS21"
			elif '14SCS22' in valueList[3][0]:
				sub2="14SCS22"
			elif '14SCS23' in valueList[3][0]:
				sub2="14SCS23"		
			elif '14SCS24' in valueList[3][0]:
				sub2="14SCS24"
			elif '14SCS26' in valueList[3][0]:
				sub2="14SCS26"
			elif '14SCS27' in valueList[3][0]:
				sub2="14SCS27"
			elif '14SCS251' in valueList[3][0]:
				sub2="14SCS251"
			elif '14SCS251' in valueList[3][0]:
				sub2="14SCS252"		
			elif '14SCS252' in valueList[3][0]:
				sub2="14SCS252"
			elif '14SCS253' in valueList[3][0]:
				sub2="14SCS253"
			elif '14SCS254' in valueList[3][0]:
				sub2="14SCS254"
		
		#for the 3rd subject 
		if (sizeOfVlist >4):
			
			if '14SCS21' in valueList[4][0]:
				sub3="14SCS21"
			elif '14SCS22' in valueList[4][0]:
				sub3="14SCS22"
			elif '14SCS23' in valueList[4][0]:
				sub3="14SCS23"		
			elif '14SCS24' in valueList[4][0]:
				sub3="14SCS24"
			elif '14SCS26' in valueList[4][0]:
				sub3="14SCS26"
			elif '14SCS27' in valueList[4][0]:
				sub3="14SCS27"
			elif '14SCS251' in valueList[4][0]:
				sub3="14SCS251"
			elif '14SCS251' in valueList[4][0]:
				sub3="14SCS252"		
			elif '14SCS252' in valueList[4][0]:
				sub3="14SCS252"
			elif '14SCS253' in valueList[4][0]:
				sub3="14SCS253"
			elif '14SCS254' in valueList[4][0]:
				sub3="14SCS254"
		
#for the 4th subject 
		if (sizeOfVlist>5):
			
			
			if '14SCS21' in valueList[5][0]:
				sub4="14SCS21"
			elif '14SCS22' in valueList[5][0]:
				sub4="14SCS22"
			elif '14SCS23' in valueList[5][0]:
				sub4="14SCS23"		
			elif '14SCS24' in valueList[5][0]:
				sub4="14SCS24"
			elif '14SCS26' in valueList[5][0]:
				sub4="14SCS26"
			elif '14SCS27' in valueList[5][0]:
				sub4="14SCS27"
			elif '14SCS251' in valueList[5][0]:
				sub4="14SCS251"
			elif '14SCS251' in valueList[5][0]:
				sub4="14SCS252"		
			elif '14SCS252' in valueList[5][0]:
				sub4="14SCS252"
			elif '14SCS253' in valueList[5][0]:
				sub4="14SCS253"
			elif '14SCS254' in valueList[5][0]:
				sub4="14SCS254"
		
#for the 5th subject 
		if (sizeOfVlist>6):
				
			if '14SCS21' in valueList[6][0]:
				sub5="14SCS21"
			elif '14SCS22' in valueList[6][0]:
				sub5="14SCS22"
			elif '14SCS23' in valueList[6][0]:
				sub5="14SCS23"		
			elif '14SCS24' in valueList[6][0]:
				sub5="14SCS24"
			elif '14SCS26' in valueList[6][0]:
				sub5="14SCS26"
			elif '14SCS27' in valueList[6][0]:
				sub5="14SCS27"
			elif '14SCS251' in valueList[6][0]:
				sub5="14SCS251"
			elif '14SCS251' in valueList[6][0]:
				sub5="14SCS252"		
			elif '14SCS252' in valueList[6][0]:
				sub5="14SCS252"
			elif '14SCS253' in valueList[6][0]:
				sub5="14SCS253"
			elif '14SCS254' in valueList[6][0]:
				sub5="14SCS254"
		
#for the 6th subject 
		if (sizeOfVlist > 7):
				
			if '14SCS21' in valueList[7][0]:
				sub6="14SCS21"
			elif '14SCS22' in valueList[7][0]:
				sub6="14SCS22"
			elif '14SCS23' in valueList[7][0]:
				sub6="14SCS23"		
			elif '14SCS24' in valueList[7][0]:
				sub6="14SCS24"
			elif '14SCS26' in valueList[7][0]:
				sub6="14SCS26"
			elif '14SCS27' in valueList[7][0]:
				sub6="14SCS27"
			elif '14SCS251' in valueList[7][0]:
				sub6="14SCS251"
			elif '14SCS251' in valueList[7][0]:
				sub6="14SCS252"		
			elif '14SCS252' in valueList[7][0]:
				sub6="14SCS252"
			elif '14SCS253' in valueList[7][0]:
				sub6="14SCS253"
			elif '14SCS254' in valueList[7][0]:
				sub6="14SCS254"
		
#for the 7th subject 
		if (sizeOfVlist >8):
				
			if '14SCS21' in valueList[8][0]:
				sub7="14SCS21"
			elif '14SCS22' in valueList[8][0]:
				sub7="14SCS22"
			elif '14SCS23' in valueList[8][0]:
				sub7="14SCS23"		
			elif '14SCS24' in valueList[8][0]:
				sub7="14SCS24"
			elif '14SCS26' in valueList[8][0]:
				sub7="14SCS26"
			elif '14SCS27' in valueList[8][0]:
				sub7="14SCS27"
			elif '14SCS251' in valueList[8][0]:
				sub7="14SCS251"
			elif '14SCS251' in valueList[8][0]:
				sub7="14SCS252"		
			elif '14SCS252' in valueList[8][0]:
				sub7="14SCS252"
			elif '14SCS253' in valueList[8][0]:
				sub7="14SCS253"
			elif '14SCS254' in valueList[8][0]:
				sub7="14SCS254"
	
	
		
	#check_value(valueList)
	
	#================================================================
	
	# create a document and insert
	# we have to write code to get the college number 
	collegeCode=fname[0:3]
	
	#print "college code: %s"%(collegeCode)
	#print college.items()
	
	collegeName=college[collegeCode]
	
	# uncomment below lines to test for one file  
	#collegeCode="1RG"
	#collegeName="Rajiv Gandhi Institute of Technology"
	
	
	
	sizeofVlist=len(valueList)
	print "size of valueList %s" %(sizeofVlist)
	if blank ==0 and sizeofVlist>8 :  # neglecting few cases
	
		post = {"college": {"collegeName": collegeName, "collegeCode":collegeCode},
			"student":{"nameusn":nameusn, "sem": valueList[0][1],"result":result},  # not taking total here 
			sub1:{"sub":valueList[2][0],"ext":int(valueList[2][1]),"int":int(valueList[2][2]),"tot":int(valueList[2][3]),"passfail":sub1res},
			sub2:{"sub":valueList[3][0],"ext":int(valueList[3][1]),"int":int(valueList[3][2]),"tot":int(valueList[3][3]),"passfail":sub2res},
			sub3:{"sub":valueList[4][0],"ext":int(valueList[4][1]),"int":int(valueList[4][2]),"tot":int(valueList[4][3]),"passfail":sub3res},
			sub4:{"sub":valueList[5][0],"ext":int(valueList[5][1]),"int":int(valueList[5][2]),"tot":int(valueList[5][3]),"passfail":sub4res},
			sub5:{"sub":valueList[6][0],"ext":int(valueList[6][1]),"int":int(valueList[6][2]),"tot":int(valueList[6][3]),"passfail":sub5res},
			sub6:{"sub":valueList[7][0],"ext":int(valueList[7][1]),"int":int(valueList[7][2]),"tot":int(valueList[7][3]),"passfail":sub6res},
			sub7:{"sub":valueList[8][0],"ext":int(valueList[8][1]),"int":int(valueList[8][2]),"tot":int(valueList[8][3]),"passfail":sub7res},
			}
		collection.insert(post)
	
	#print "DBG: printing all records in the collection"
	#print "-------------------------------------------"
	#results=collection.find()
	#for record in results:
	#	print record
	client.close()	
def _get_organization_name(response):
    """Pull the organization name out of an XML response pair.

    Args:
        response: sequence whose second element is the raw XML string.

    Returns:
        The ``value`` attribute of the last tag in the parsed document,
        or None when that attribute is absent.
    """
    document = BeautifulSoup(response[1])
    last_tag = document.findChildren()[-1]
    return last_tag.get('value')
Example #36
0
# NOTE(review): scraped fragment — `website` is a urllib-style file handle
# opened in earlier, unseen code, and the while loop below is truncated
# mid-body by the scrape.
html_contents = website.read()

# Close the network handle now that the page is fully buffered.
website.close()

# 3. Parse the html!
# BeautifulSoup 3 import style (Python 2 era library).
from BeautifulSoup import BeautifulSoup

# Regular expressions: used to match the menu item div ids below.
import re

soup = BeautifulSoup(''.join(html_contents))

# All menu entries are divs whose id starts with "item-".
items = soup.findChildren('div', id=re.compile("^item-"))

# Accumulators for scraped item names and reviews.
names = []
reviews = []
i = 0

# HTMLParser unescapes entities such as &amp; in scraped text.
import HTMLParser

html = HTMLParser.HTMLParser()
while i < len(items):
    my_table = items[i]

    # The item name lives in the first div with class "ow-check-in-mi".
    name_row = my_table.findNext('div', attrs={"class": 'ow-check-in-mi'})
# NOTE(review): duplicate of the fragment above (tab-indented variant);
# `website` comes from earlier, unseen code and the trailing `if` has its
# body cut off by the scrape.
html_contents = website.read()

# Release the network handle; the page text is already in memory.
website.close()

# 3. Parse the html!
# BeautifulSoup 3 import style (Python 2 era library).
from BeautifulSoup import BeautifulSoup

# Regular expressions: used to match the menu item div ids below.
import re

soup = BeautifulSoup(''.join(html_contents))

# All menu entries are divs whose id starts with "item-".
items = soup.findChildren('div', id=re.compile("^item-"))

# Accumulators for scraped item names and reviews.
names = []
reviews = []
i = 0;

# HTMLParser unescapes entities such as &amp; in scraped text.
import HTMLParser
html = HTMLParser.HTMLParser()
while i < len(items):
	my_table = items[i]
	
	# The item name lives in the first div with class "ow-check-in-mi".
	name_row = my_table.findNext('div', attrs={"class" : 'ow-check-in-mi'})
	if (name_row):
Example #38
0
#
# Preparation:
# pip install requests json BeautifulSoup
#
# Run:
# python country_codes.py

import os
import sys
import requests
import json
from BeautifulSoup import BeautifulSoup

# Fetch the country-code listing and parse it (BeautifulSoup 3 API).
page = requests.get('http://countrycode.org/')
soup = BeautifulSoup(page.text)
# The first <table> on the page holds the data; collect all of its rows.
rows = soup.findChildren('table')[0].findChildren(['tr'])
country_codes_json = []

# Each data row holds: name link, dialing code(s), ISO code(s), population.
for row in rows:
    cells = row.findChildren('td')

    # Header rows contain no <td> cells, so `cells` is empty for them.
    if cells:
        country_codes_json.append({
            'name': cells[0].find('a').string,
            # Keep only the first dialing code and strip grouping dashes.
            'code': int(cells[1].string.split(',', 1)[0].replace('-', '')),
            # First ISO code of e.g. "US / USA", whitespace-trimmed.
            'iso': cells[2].string.split('/', 1)[0].strip(),
            'population': int(cells[3].string.replace(',', ''))
        })

# BUGFIX: os.path.dirname(sys.argv[0]) is '' when the script is invoked by
# bare name, so the old `dirname + '/codes.json'` pointed at the filesystem
# root. os.path.join handles the empty-dirname case correctly.
with open(os.path.join(os.path.dirname(sys.argv[0]), 'codes.json'), 'w') as outfile:
    json.dump(country_codes_json, outfile, indent=2)
# Importer that scrapes country data for a Django project (BeautifulSoup 3).
from BeautifulSoup import BeautifulSoup
import os, sys


# Only run from the Django project root so `data.models` is importable.
if os.path.isfile("abschiebungen/settings.py"):
    sys.path.append(os.getcwd())
else:
    sys.exit("Error: not in the root directory of the django project.")


from data.models import Country


# NOTE(review): urllib2 is used here but never imported in this fragment —
# presumably imported in code not visible here; verify before running.
response = urllib2.urlopen("http://www.countryareacode.net/")
soup = BeautifulSoup(response)
tables = soup.findChildren("table")

# The first table on the page holds the country listing.
my_table = tables[0]
# print my_table
rows = my_table.findChildren("tr")

for row in rows:
    # print len(row)

    cells = row.findChildren("td")

    for cell in cells:

        if cell.findChildren("a"):
            for a in cell.findChildren("a"):
                if a.string:
Example #40
0
#
# Preparation:
# pip install requests json BeautifulSoup
#
# Run:
# python country_codes.py

import os
import sys
import requests
import json
from BeautifulSoup import BeautifulSoup

# Fetch the country-code listing and parse it (BeautifulSoup 3 API).
page = requests.get('http://countrycode.org/')
soup = BeautifulSoup(page.text)
# The first <table> on the page holds the data; collect all of its rows.
rows = soup.findChildren('table')[0].findChildren(['tr'])
country_codes_json = []

for row in rows:
    cells = row.findChildren('td')

    if cells:
        country_codes_json.append({
            'name':
            cells[0].find('a').string,
            'code':
            int(cells[1].string.split(',', 1)[0].replace('-', '')),
            'iso':
            cells[2].string.split('/', 1)[0].strip(),
            'population':
            int(cells[3].string.replace(',', ''))
Example #41
0
    'NUM', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
]

# NOTE(review): fragment — `bukvi`, `urllib`, `urllib2`, `pickle` and
# `BeautifulSoup` come from earlier, unseen lines of the original script.
# Registry search endpoint for .mk domains (MARnet).
url = 'http://dns.marnet.net.mk/registar.php'

# Collected result-page links, gathered letter by letter.
linkovi = []
for bukva in bukvi:
    # (debug, Macedonian: "Checking how many pages there are for letter: %s")
    #print "Глеам колку страници има на буква: %s" % bukva
    # POST the letter as form data to get that letter's result page.
    bukva = urllib.urlencode({'bukva': bukva})
    req = urllib2.Request(url, bukva)
    res = urllib2.urlopen(req)

    stranica = res.read()

    soup = BeautifulSoup(stranica)
    rawlinkovi = soup.findChildren('a', {'class': 'do'})

    # Keep only links carrying a 'del=' query parameter.
    # NOTE(review): `<>` is Python 2-only syntax (same as `!=`).
    for link in rawlinkovi:
        if link['href'].find('del=') <> -1:
            linkovi.append(link['href'])

# Persist the harvested links for a later processing step.
f = open(
    '/home/glisha/webapps/nginx_domejnotmk/domejnotmk/soberipodatoci/domejni_stranici.pckl',
    'wb')
pickle.dump(linkovi, f)
f.close()

#import pprint
#pprint.pprint(linkovi)
def getRecord(rollNo):
    """Fetch a student's public record from the IITK OA portal.

    Args:
        rollNo: roll number (int or str); converted to str for the query.

    Returns:
        dict with keys roll, image, name, program, department, room, email,
        blood, category, gender, country, address, phone, mobile — or None
        when the portal returns no name (student not found).
    """
    roll = str(rollNo)
    req = requests.get('http://oa.cc.iitk.ac.in:8181/Oa/Jsp/OAServices/IITk_SrchRes.jsp?typ=stud&numtxt=' + roll + '&sbm=Y')
    soup = BeautifulSoup(req.text)
    record = {}

    record['roll'] = roll

    # Photos live at a predictable URL derived from the roll number.
    record['image'] = 'http://oa.cc.iitk.ac.in:8181/Oa/Jsp/Photo/' + roll + '_0.jpg'

    # The page lays out "Label : value" pairs in successive <p> tags.
    data = soup.findChildren('p')
    name = data[0].text.split(':')[1]
    record['name'] = name

    # An empty name means the portal found no matching student.
    if not name:
        return None

    record['program'] = data[1].text.split(':')[1]
    record['department'] = data[2].text.split(':')[1]
    record['room'] = data[3].text.split(':')[1]
    record['email'] = data[4].text.split(':')[1]

    # The 6th paragraph packs blood group and category around a <b> tag.
    bloodData = data[5].text.split('<b>')[0]
    record['blood'] = bloodData.split(':')[1]

    categoryData = data[5].text.split('<b>')[1]
    record['category'] = re.findall(u'(?<=>).+?(?=<)', categoryData)[0]

    # "Gender : X ... : Country" — first char after the first colon is the
    # gender code; the text after the second colon is the country.
    genderData = data[6].text.split(':')
    record['gender'] = genderData[1][0]
    record['country'] = genderData[2]

    # Address details are embedded inside HTML comments on the page.
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    addressSoup = BeautifulSoup(comments[1])
    permanentAddressData = addressSoup.findAll('p')[1].text

    PHONE_LABEL = 'Phone no:'
    MOBILE_LABEL = 'Mobile no:'
    phonePos = permanentAddressData.index(PHONE_LABEL)
    mobilePos = permanentAddressData.index(MOBILE_LABEL)

    # 19 presumably skips a leading "Permanent Address:" label — TODO confirm
    # against the live page.
    record['address'] = permanentAddressData[19:phonePos]

    record['phone'] = permanentAddressData[phonePos + len(PHONE_LABEL):mobilePos]

    # BUGFIX: the original sliced from mobilePos + 9, but 'Mobile no:' is 10
    # characters long, so the mobile value kept a leading ':'. Use the label
    # length instead.
    record['mobile'] = permanentAddressData[mobilePos + len(MOBILE_LABEL):]

    return record
                                            department_ID + ' = ' +
                                            str(no_of_students) + '\n')
                break

            if k < 10:
                student_ID = department_ID + '00' + str(k)
            elif k < 100:
                student_ID = department_ID + '0' + str(k)
            else:
                student_ID = department_ID + str(k)

            data[input_filed_name] = student_ID

            response = requests.post("http://engasu.net/chepfall/Results.aspx",
                                     data=data)

            parsed_html = BeautifulSoup(response.text)
            tables = parsed_html.findChildren('table')

            # No grades table available = students grade not available
            if len(tables) >= 2:
                create_file(department_directory, student_ID + '.html',
                            response.text.encode('utf-8'))
                print 'Printed file for student ' + student_ID
                last_ID = k
                no_of_students += 1
            else:
                print 'Student ' + student_ID + ' is not available'

file_departments_data.close()
        last_ID = 0

        if os.path.exists(department_directory):
            student_files = [
                html_files for html_files in os.listdir(department_directory)
                if html_files[0] != '.'
            ]

            for file in student_files:

                with open(department_directory + '/' + file,
                          'r') as content_file:
                    html_content = content_file.read()

                parsed_html = BeautifulSoup(html_content.decode('utf-8'))
                tables = parsed_html.findChildren('table')

                student_id = file[:7]
                student_name = parsed_html.findChildren("p")[0].text.split(
                    ':')[1].strip(' ')

                table = tables[1]
                rows = table.findChildren('tr')[1:]

                number_of_courses = 0
                total_grades = 0
                for row in rows:
                    cols = row.findChildren('td')

                    course_name = str(cols[1].text)
                    if (special_credit_hours.has_key(course_name)):