def DoSoupFindAll(data, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs):
    # Thin wrapper around BeautifulSoup.findAll() that never raises:
    # any parse error is logged and an empty list is returned instead.
    try:
        soup = beautifulsoup.BeautifulSoup(data)
        return soup.findAll(name, attrs, recursive, text, limit, **kwargs)
    except Exception:
        logFile.debug("Error parsing using soup", exc_info=True)
        return []
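#~ Hedged usage sketch (not part of the original module): DoSoupFindAll()
#~ mirrors BeautifulSoup's findAll() signature, so callers can use it the
#~ same way and simply get [] back when the markup cannot be parsed.
#~ 'html_string' below is a placeholder, not a name defined in this module.
#~
#~     links = DoSoupFindAll(html_string, 'a', {'class': 'title'})
#~     for a in links:
#~         print a['href']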
def getdetailednzbinfo(self, data):
    # Parse an NZB document and return summary counters: number of par2,
    # nfo and other files, plus the total payload size in MB.
    fileinfo = {}
    fileinfo['pars'] = 0
    fileinfo['rars'] = 0
    fileinfo['nfo'] = 0
    fileinfo['nofile'] = 0
    fileinfo['nbytes'] = 0
    fileinfo['postid'] = []
    if (len(data) == 0):
        return fileinfo

    soup = beautifulsoup.BeautifulSoup(data)
    fileno = soup.findAll('file')
    for fno in fileno:
        try:
            segs = fno.findAll('segments')
            #~ there is no rar or rarparts chk, many nzb contain just uncompr.files
            val_sample = re.search(r"[\.\-]sample", fno['subject'], re.I)
            if (val_sample is not None):
                continue
            if (fno['subject'].find('.nfo') != -1):
                fileinfo['nfo'] = fileinfo['nfo'] + 1
            elif (fno['subject'].find('.par2') != -1):
                fileinfo['pars'] = fileinfo['pars'] + 1
            else:
                fileinfo['nofile'] = fileinfo['nofile'] + 1
            for s in segs:
                s_segs = s.findAll('segment')
                for s2 in s_segs:
                    fileinfo['nbytes'] += int(s2['bytes'])
        except Exception:
            #~ on a malformed <file> entry, reset the counters and keep going
            fileinfo['pars'] = 0
            fileinfo['rars'] = 0
            fileinfo['nfo'] = 0
            fileinfo['nofile'] = 0
            fileinfo['nbytes'] = 0
            fileinfo['postid'] = []
            log.critical("Error, could not parse NZB file")

    #~ convert the accumulated byte count to MB
    fileinfo['nbytes'] = int(fileinfo['nbytes'] / (1024 * 1024))
    return fileinfo
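#~ Hedged usage sketch (assumption, not taken from this repo): given a minimal
#~ NZB document shaped like the one below, getdetailednzbinfo() would report
#~ one par2 file, one "other" file and the summed segment size in MB.
#~ 'indexer' stands in for whatever instance this method belongs to.
#~
#~     _SAMPLE_NZB = """<nzb>
#~       <file subject='&quot;movie.par2&quot; (1/1)'>
#~         <segments><segment bytes="50000">part1@example</segment></segments>
#~       </file>
#~       <file subject='&quot;movie.mkv&quot; (1/2)'>
#~         <segments>
#~           <segment bytes="750000">part2@example</segment>
#~           <segment bytes="750000">part3@example</segment>
#~         </segments>
#~       </file>
#~     </nzb>"""
#~     info = indexer.getdetailednzbinfo(_SAMPLE_NZB)
#~     print info['pars'], info['nofile'], info['nbytes']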
def get_profile_info(self):
    # Scrape the provider profile page and return API/grab usage counters.
    socket.setdefaulttimeout(self.timeout)
    if (self.chkcookie() == False):
        if (self.dologin() == False):
            return []

    loginurl = self.cur_cfg['url'] + "/profile"
    try:
        socket.setdefaulttimeout(self.timeout)
        res = self.br.open(loginurl)
    except Exception as e:
        eret = self.mech_error_generic(e)
        if (eret == 302):
            self.reset_cookies()
        return []

    data = res.get_data()
    soup = beautifulsoup.BeautifulSoup(data)
    info = {}
    for row in soup.findAll("tr"):
        allTHs = row.findAll("th")
        for x in range(len(allTHs)):
            str_lowcase = str(allTHs[x]).lower()
            if (str_lowcase.find('api hits today') > -1):
                allTD = row.findAll("td")
                if (len(allTD)):
                    info['api_hits'] = ''.join(allTD[0].findAll(text=True))
            if (str_lowcase.find('grabs today') > -1):
                allTD = row.findAll("td")
                if (len(allTD)):
                    info['grabs_today'] = ''.join(allTD[0].findAll(text=True))
            #~ fall back to any 'grabs' header if 'grabs total' is not found
            if (str_lowcase.find('grabs total') > -1 or str_lowcase.find('grabs') > -1):
                allTD = row.findAll("td")
                if (len(allTD)):
                    info['grabs_total'] = ''.join(allTD[0].findAll(text=True))
    return info
def get_links(value):
    """
    Returns links found in an (X)HTML string as Python objects for iteration
    in templates.

    EXAMPLE:
    <ul>
    {% for link in blog.entry.body|get_links %}
        <li><a href="{{ link.href }}">{{ link.title }}</a></li>
    {% endfor %}
    </ul>
    """
    try:
        import beautifulsoup
    except ImportError:
        if settings.DEBUG:
            raise template.TemplateSyntaxError(
                "Error in {% get_links %} filter: the Python BeautifulSoup library isn't installed.")
        return value
    else:
        soup = beautifulsoup.BeautifulSoup(value)
        return soup.findAll('a')
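#~ Hedged registration sketch (assumption, not necessarily how this repo wires
#~ it up): in a Django templatetags module this filter would typically be
#~ registered as
#~
#~     from django import template
#~     register = template.Library()
#~     register.filter('get_links', get_links)
#~
#~ after which templates can write {{ some_html|get_links }} as in the
#~ docstring example above.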
def search(self, srchstr):
    # Query the provider browse page and parse results (title, NZB url,
    # posting age, size and category) into the common result-dict format.
    if (self.cur_cfg['valid'] == 0):
        return []
    socket.setdefaulttimeout(self.timeout)
    self.cur_cfg['retcode'] = self.default_retcode

    if (self.chkcookie() == False):
        if (self.dologin() == False):
            return []

    mainurl = self.cur_cfg['url']
    #~ the category token must be prefixed with an asterisk, otherwise it messes up the search
    srchstrnu = srchstr.split('.')
    for defcats in self.definedcat:
        if (defcats[0] == srchstrnu[-1]):
            srchstrnu[-1] = '*' + srchstrnu[-1]
    srchstr = ".".join(srchstrnu)

    loginurl = mainurl + '/nzbbrowse.php?b=2&st=1&c=0&g=0&sr=2&o=0&k=' + srchstr
    timestamp_s = time.time()
    try:
        socket.setdefaulttimeout(self.timeout)
        res = self.br.open(loginurl)
    except Exception as e:
        eret = self.mech_error_generic(e)
        if (eret == 302):
            self.reset_cookies()
        return []

    data = res.get_data()
    timestamp_e = time.time()
    log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))
    self.cur_cfg['retcode'][2] = timestamp_e - timestamp_s

    soup = beautifulsoup.BeautifulSoup(data)
    parsed_data = []
    titlesdiv = soup.findAll('div', {'class': 'pstnam'})
    nzburlsdiv = soup.findAll('div', {'class': 'dlnzb'})
    tstampdiv = soup.findAll('div', {'class': 'pstdat'})
    szdiv = soup.findAll('abbr', {'title': 'Total size of articles'})
    catdiv = soup.findAll('a', {'class': 'catimg'})

    titles = []
    rdetails = []
    nzburls = []
    tstamp = []
    bytesize = []
    categr = []

    #~ category names are embedded in the 'Show all in: ...' tooltip
    for tl in catdiv:
        fall_tt = tl['title'].find('Show all in: ')
        if (fall_tt != -1):
            categr.append(tl['title'][fall_tt + 13:])
        else:
            categr = []
            break

    for tl in titlesdiv:
        all_a = tl.findAll("a")
        titles.append(''.join(all_a[0].findAll(text=True)))
        rdetails.append(all_a[0]['href'][1:])

    for tl in nzburlsdiv:
        all_a = tl.findAll("a")
        nzburls.append(all_a[0]['href'][1:])

    #~ absolute day of posting
    for tl in tstampdiv:
        intage = int(tl.findAll(text=True)[0].split()[0].split('.')[0])
        today = datetime.datetime.now()
        dd = datetime.timedelta(days=intage)
        earlier = today - dd
        tstamp.append(time.mktime(earlier.timetuple()))

    for sz1 in szdiv:
        for sz2 in sz1.findAll(text=True):
            sz2s = sz2.split()
            if (len(sz2s) == 2):
                if (sz2s[1].lower() == 'mb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', ''))))
                if (sz2s[1].lower() == 'gb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', '')) * 1024))

    #~ all parsed columns must line up, otherwise the page layout has changed
    if (len(titles) != len(nzburls)):
        return []
    if (len(titles) != len(tstamp)):
        return []
    if (len(titles) != len(rdetails)):
        return []
    if (len(titles) != len(bytesize)):
        return []
    if (len(categr) != len(titles)):
        categr = []

    for i in xrange(len(titles)):
        category_found = {}
        if (len(categr)):
            category_found[categr[i]] = 1
        else:
            category_found['N/A'] = 1
        d1 = {
            'title': titles[i],
            'poster': 'poster',
            'size': bytesize[i],
            'url': self.baseURL + nzburls[i],
            'filelist_preview': '',
            'group': 'N/A',
            'posting_date_timestamp': tstamp[i],
            'release_comments': self.baseURL + rdetails[i],
            'categ': category_found,
            'ignore': 0,
            'req_pwd': self.typesrch,
            'provider': self.baseURL,
            'providertitle': self.name
        }
        parsed_data.append(d1)
    return parsed_data
def search_raw(self, pagestr, srchstr):
    # Generic scraper for paged result listings; pagestr is the path fragment
    # appended to the provider url before the search string.
    if (self.cur_cfg['valid'] == 0):
        return []
    socket.setdefaulttimeout(self.timeout)
    #~ WIN: it seems to have issue in win32
    # locale.setlocale( locale.LC_ALL, 'en_US.utf8' )
    self.cur_cfg['retcode'] = self.default_retcode

    if (self.chkcookie() == False):
        if (self.dologin() == False):
            return []

    mainurl = self.cur_cfg['url']
    loginurl = mainurl + pagestr + srchstr
    timestamp_s = time.time()
    try:
        socket.setdefaulttimeout(self.timeout)
        res = self.br.open(loginurl)
    except Exception as e:
        eret = self.mech_error_generic(e)
        if (eret == 302):
            self.reset_cookies()
        return []

    data = res.get_data()
    timestamp_e = time.time()
    log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))
    self.cur_cfg['retcode'][2] = timestamp_e - timestamp_s

    soup = beautifulsoup.BeautifulSoup(data)
    parsed_data = []
    titles = soup.findAll('a', {'class': 'title'})
    nzburls = soup.findAll('a', {'title': 'Download Nzb'})
    tstamp_raw = soup.findAll('td', {'class': 'less mid'})
    rdetails = soup.findAll('a', {'title': 'View details'})
    sz_raw = soup.findAll('td', {'class': 'less right'})
    catname_raw = soup.findAll('td', {'class': 'less'})

    catname = []
    for catn in catname_raw:
        catcont = catn.findAll(text=True)
        for catn1 in catcont:
            catcont_idx = catn1.find('">')
            if (catcont_idx != -1):
                catname.append(catn1[catcont_idx + 2:len(catn1)].replace('>', '-').capitalize())

    bytesize = []
    for sz1 in sz_raw:
        for sz2 in sz1.findAll(text=True):
            sz2s = sz2.split()
            if (len(sz2s) == 2):
                if (sz2s[1].lower() == 'mb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', ''))))
                if (sz2s[1].lower() == 'gb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', '')) * 1024))

    #~ timestamps are in the title attribute, e.g. 2010-05-08 18:53:09
    tstamp = []
    for tt in tstamp_raw:
        for tt2 in tt.attrs:
            if ('title' in tt2):
                tstamp.append(time.mktime(datetime.datetime.strptime(tt2[1], "%Y-%m-%d %H:%M:%S").timetuple()))
                break

    #~ several timestamp cells may be emitted per result row, so only every
    #~ skipts-th timestamp belongs to a result
    skipts = 1
    if (len(titles)):
        if (len(tstamp) % len(titles) == 0):
            skipts = len(tstamp) / len(titles)
    if (skipts < 1):
        return []

    if (len(titles) != len(nzburls)):
        return []
    if (len(titles) != len(rdetails)):
        return []
    if (len(titles) != len(bytesize)):
        return []

    for i in xrange(len(titles)):
        category_found = {}
        if (len(catname) == len(titles)):
            category_found[catname[i]] = 1
        else:
            category_found['N/A'] = 1
        d1 = {
            'title': ''.join(titles[i].findAll(text=True)),
            'poster': 'poster',
            'size': bytesize[i],
            'url': self.baseURL + '/' + nzburls[i]['href'],
            'filelist_preview': '',
            'group': 'N/A',
            'posting_date_timestamp': tstamp[i * skipts],
            'release_comments': self.baseURL + rdetails[i]['href'],
            'categ': category_found,
            'ignore': 0,
            'req_pwd': self.typesrch,
            'provider': self.baseURL,
            'providertitle': self.name
        }
        parsed_data.append(d1)
    return parsed_data
def search(self, srchstr):
    # Parse the nzbbrowse listing into the common result-dict format
    # (no category column on this provider, so 'categ' is always N/A).
    if (self.cur_cfg['valid'] == 0):
        return []
    socket.setdefaulttimeout(self.timeout)

    if (self.chkcookie() == False):
        if (self.dologin() == False):
            return []

    mainurl = self.cur_cfg['url']
    #~ https://www.gingadaddy.com/nzbbrowse.php?b=2&st=1&k=dog&c=0&g=0&sr=2&o=0
    loginurl = mainurl + '/nzbbrowse.php?b=2&st=1&c=0&g=0&sr=2&o=0&k=' + srchstr
    timestamp_s = time.time()
    try:
        socket.setdefaulttimeout(self.timeout)
        res = self.br.open(loginurl)
    except Exception as e:
        eret = self.mech_error_generic(e)
        if (eret == 302):
            self.reset_cookies()
        return []

    data = res.get_data()
    timestamp_e = time.time()
    log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))

    soup = beautifulsoup.BeautifulSoup(data)
    parsed_data = []
    titlesdiv = soup.findAll('div', {'class': 'pstnam'})
    nzburlsdiv = soup.findAll('div', {'class': 'dlnzb'})
    tstampdiv = soup.findAll('div', {'class': 'pstdat'})
    szdiv = soup.findAll('abbr', {'title': 'Total size of articles'})

    titles = []
    rdetails = []
    nzburls = []
    tstamp = []
    bytesize = []

    for tl in titlesdiv:
        all_a = tl.findAll("a")
        titles.append(''.join(all_a[0].findAll(text=True)))
        rdetails.append(all_a[0]['href'][1:])

    for tl in nzburlsdiv:
        all_a = tl.findAll("a")
        nzburls.append(all_a[0]['href'][1:])

    #~ absolute day of posting
    for tl in tstampdiv:
        intage = int(tl.findAll(text=True)[0].split()[0].split('.')[0])
        today = datetime.datetime.now()
        dd = datetime.timedelta(days=intage)
        earlier = today - dd
        tstamp.append(time.mktime(earlier.timetuple()))

    for sz1 in szdiv:
        for sz2 in sz1.findAll(text=True):
            sz2s = sz2.split()
            if (len(sz2s) == 2):
                if (sz2s[1].lower() == 'mb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', ''))))
                if (sz2s[1].lower() == 'gb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', '')) * 1024))

    if (len(titles) != len(nzburls)):
        return []
    if (len(titles) != len(tstamp)):
        return []
    if (len(titles) != len(rdetails)):
        return []
    if (len(titles) != len(bytesize)):
        return []

    for i in xrange(len(titles)):
        d1 = {
            'title': titles[i],
            'poster': 'poster',
            'size': bytesize[i],
            'url': self.baseURL + nzburls[i],
            'filelist_preview': '',
            'group': 'N/A',
            'posting_date_timestamp': tstamp[i],
            'release_comments': self.baseURL + rdetails[i],
            'categ': {'N/A': 1},
            'ignore': 0,
            'req_pwd': self.typesrch,
            'provider': self.baseURL,
            'providertitle': self.name
        }
        parsed_data.append(d1)
    return parsed_data
def nzb_getinfo(self, data):
    # Parse an NZB document into per-segment records plus summary counters,
    # classifying every file (par2 index/volume, nfo, sfv, nzb, archive).
    h = HTMLParser.HTMLParser()
    soup = beautifulsoup.BeautifulSoup(data)
    fileno = soup.findAll('file')
    filesegs = []
    fileinfo = {}
    fileinfo['pars'] = 0
    fileinfo['nfo'] = 0
    fileinfo['nofile'] = 0
    fileinfo['rar'] = 0
    fileinfo['nzb'] = 0
    fileinfo['sfv'] = 0
    fileinfo['postid'] = []
    allfiles_LUT = {}
    allfiles = {}
    nbytes = 0

    for fno in fileno:
        segs = fno.findAll('segments')
        groups = fno.findAll('groups')
        typefile = self.MSGTYPE_ARCHIVE

        #~ skip sample files
        val_sample = re.search(r"[\.\-]sample", fno['subject'], re.I)
        if (val_sample is not None):
            continue

        #~ classify the file by its subject line
        par2idx = fno['subject'].lower().find('.par2')
        if (par2idx != -1):
            typefile = self.MSGTYPE_PAR2IDX
            fileinfo['pars'] = fileinfo['pars'] + 1
            npar_vol = re.search(r".vol[0-9]{1,4}", fno['subject'][1:par2idx + 5], re.I)
            if (npar_vol is not None):
                typefile = self.MSGTYPE_PAR2VOL
        if (fno['subject'].lower().find('.nfo') != -1):
            typefile = self.MSGTYPE_NFO
            fileinfo['nfo'] = fileinfo['nfo'] + 1
        if (fno['subject'].lower().find('.sfv') != -1):
            typefile = self.MSGTYPE_SFV
            fileinfo['sfv'] = fileinfo['sfv'] + 1
        if (fno['subject'].lower().find('.nzb') != -1):
            typefile = self.MSGTYPE_NZB
            fileinfo['nzb'] = fileinfo['nzb'] + 1
        if (typefile == 0):
            #~ MSGTYPE_ARCHIVE is expected to be 0: remember plain archive files
            allfiles[h.unescape(fno['subject'])] = 1

        cur_group = []
        for g in groups:
            g_groups = g.findAll('group')
            for g2 in g_groups:
                cur_group.append(''.join(g2.findAll(text=True)))

        #~ one record per segment; the LUT maps the quoted filename to the
        #~ indices of its segments inside filesegs
        for s in segs:
            s_segs = s.findAll('segment')
            for s2 in s_segs:
                nbytes += int(s2['bytes'])
                subject = h.unescape(fno['subject'])
                keyname = re.findall(r'\"(.+?)\"', subject)[0]
                if (keyname not in allfiles_LUT):
                    allfiles_LUT[keyname] = []
                allfiles_LUT[keyname].append(len(filesegs))
                filesegs.append([
                    subject,
                    int(s2['bytes']),
                    typefile,
                    h.unescape(''.join(s2.findAll(text=True))),
                    cur_group,
                    self.STATUS_INIT,
                    -2
                ])

    #~ sorted subjects and the quoted filenames extracted from them
    allfiles_sorted = []
    allfiles_sorted_clean = []
    for key in allfiles:
        allfiles_sorted.append(key)
    allfiles_sorted = sorted(allfiles_sorted)
    for s in allfiles_sorted:
        allfiles_sorted_clean.append(re.findall(r'\"(.+?)\"', s)[0])

    self.infodata = {}
    self.infodata['summary'] = fileinfo
    self.infodata['detail'] = filesegs
    self.infodata['subject'] = allfiles_sorted
    self.infodata['filename'] = allfiles_sorted_clean
    self.infodata['filename_LUT'] = allfiles_LUT
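#~ Hedged usage sketch (assumption, not taken from this repo): after
#~ nzb_getinfo() runs, infodata['filename_LUT'] maps each quoted filename to
#~ the indices of its segments inside infodata['detail'], so per-file sizes
#~ can be recovered like this ('nzbobj' and 'some.nzb' are placeholders):
#~
#~     nzbobj.nzb_getinfo(open('some.nzb').read())
#~     for fname in nzbobj.infodata['filename']:
#~         seg_idx = nzbobj.infodata['filename_LUT'][fname]
#~         fsize = sum(nzbobj.infodata['detail'][i][1] for i in seg_idx)
#~         print fname, fsize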
def search(self, srchstr):
    # Scrape the /search/ listing and parse results into the common
    # result-dict format.
    if (self.cur_cfg['valid'] == 0):
        return []
    socket.setdefaulttimeout(self.timeout)
    #~ WIN: it seems to have issue in win32
    # locale.setlocale( locale.LC_ALL, 'en_US.utf8' )

    if (self.chkcookie() == False):
        if (self.dologin() == False):
            return []

    mainurl = self.cur_cfg['url']
    loginurl = mainurl + "/search/" + srchstr
    timestamp_s = time.time()
    try:
        socket.setdefaulttimeout(self.timeout)
        res = self.br.open(loginurl)
    except Exception as e:
        eret = self.mech_error_generic(e)
        if (eret == 302):
            self.reset_cookies()
        return []

    data = res.get_data()
    timestamp_e = time.time()
    log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))

    soup = beautifulsoup.BeautifulSoup(data)
    parsed_data = []
    titles = soup.findAll('a', {'class': 'title'})
    nzburls = soup.findAll('a', {'title': 'Download Nzb'})
    tstamp_raw = soup.findAll('td', {'class': 'less mid'})
    rdetails = soup.findAll('a', {'title': 'View details'})
    sz_raw = soup.findAll('td', {'class': 'less right'})

    bytesize = []
    for sz1 in sz_raw:
        for sz2 in sz1.findAll(text=True):
            sz2s = sz2.split()
            if (len(sz2s) == 2):
                if (sz2s[1].lower() == 'mb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', ''))))
                if (sz2s[1].lower() == 'gb'):
                    bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', '')) * 1024))

    #~ timestamps are in the title attribute, e.g. 2010-05-08 18:53:09
    tstamp = []
    for tt in tstamp_raw:
        for tt2 in tt.attrs:
            if ('title' in tt2):
                tstamp.append(time.mktime(datetime.datetime.strptime(tt2[1], "%Y-%m-%d %H:%M:%S").timetuple()))

    if (len(titles) != len(nzburls)):
        return []
    if (len(titles) != len(tstamp)):
        return []
    if (len(titles) != len(rdetails)):
        return []
    if (len(titles) != len(bytesize)):
        return []

    for i in xrange(len(titles)):
        d1 = {
            'title': ''.join(titles[i].findAll(text=True)),
            'poster': 'poster',
            'size': bytesize[i],
            'url': self.baseURL + '/' + nzburls[i]['href'],
            'filelist_preview': '',
            'group': 'N/A',
            'posting_date_timestamp': tstamp[i],
            'release_comments': self.baseURL + rdetails[i]['href'],
            'categ': {'N/A': 1},
            'ignore': 0,
            'req_pwd': self.typesrch,
            'provider': self.baseURL,
            'providertitle': self.name
        }
        parsed_data.append(d1)
    return parsed_data