def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url == None:
            return sources

        if not debridstatus == 'true':
            raise Exception()

        url = urlparse.urljoin(self.base_link, url)
        r = client.request(url)
        movie_id = re.findall('<input type="hidden" value="(\d+)" name="movie_id"', r)[0]

        download_link = urlparse.urljoin(self.base_link, self.download_link)
        p = urllib.urlencode({'movie': movie_id.encode('utf-8')})
        r = client.request(download_link, post=p, XHR=True)
        r = BeautifulSoup(r)
        r = r.findAll('tr')

        ext = ['.avi', '.mkv', '.mov', '.mp4', '.xvid', '.divx']
        locDict = [(i.rsplit('.', 1)[0], i) for i in hostprDict if not i in ext]

        for link in r:
            try:
                link = str(link)
                href = re.findall('<a href="(.+?)" class="down-btn-epd', str(link))[0]
                try:
                    # derive the host name from the link's domain
                    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)[0]
                except:
                    host = 'Dltube'

                r = client.parseDOM(link, 'span')[0]
                fmt = r.strip().lower().split()

                if '1080p' in fmt:
                    quality = '1080p'
                elif '720p' in fmt:
                    quality = 'HD'
                else:
                    quality = 'SD'  # fallback so untagged links are not dropped by the bare except

                try:
                    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', r)[-1]
                    div = 1 if size.endswith(' GB') else 1024
                    size = float(re.sub('[^0-9|/.|/,]', '', size)) / div
                    info = '%.2f GB' % size
                except:
                    info = ''

                sources.append({'source': host, 'quality': quality, 'provider': 'DLTube',
                                'url': href, 'info': info, 'direct': False, 'debridonly': True})
            except:
                pass

        return sources
    except:
        return sources
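The size-normalisation step used by both sources() variants is easiest to check in isolation. A minimal standalone sketch of that logic (the function name is illustrative, not part of the scraper):

import re

def parse_size_gb(text):
    """Return the last 'N MB' / 'N GB' token in `text`, normalised to GB."""
    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', text)[-1]
    div = 1 if size.endswith(' GB') else 1024  # MB values are scaled down to GB
    return float(re.sub('[^0-9|/.|/,]', '', size)) / div

print('%.2f GB' % parse_size_gb('Movie.2016.720p.BluRay 700 MB'))   # 0.68 GB
print('%.2f GB' % parse_size_gb('Movie.2016.1080p.BluRay 1.4 GB'))  # 1.40 GB

Note that comma-decimal sizes such as '1,4 GB' survive the regex but still fail at float(); in the scraper that case falls through the bare except and info stays empty.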
def parse_unescape(self, location, encoding=None):
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder)
    try:
        location = unescape(location)
    except:
        try:
            location = BeautifulSoup(location, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            pass
    # self.tree = parser.parse(unescape(location), encoding=encoding)
    try:
        self.tree = parser.parse(location.strip(), encoding=encoding)
    except:
        pass
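A hedged usage sketch of the html5lib path above. Recent html5lib releases renamed the encoding keyword on parse() (current versions take override_encoding), so the call in parse_unescape assumes an older API:

import html5lib

builder = html5lib.treebuilders.getTreeBuilder("dom")
parser = html5lib.HTMLParser(tree=builder)
tree = parser.parse(u'<p>Caf&eacute;</p>')          # returns an xml.dom.minidom document
print(tree.getElementsByTagName('p')[0].toxml())    # <p>Café</p>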
def _clean_record(self, record):
    ' post process a scraped record: parses dates, converts to ISO8601 format, strips spaces, tags etc '
    #self.logger.debug("Raw record to clean: %s", str(record))
    for k, v in record.items():
        if v:
            if isinstance(v, list):
                v = ' '.join(v)
            if not v or not myutils.GAPS_REGEX.sub('', v):  # always remove any empty fields
                v = None
            elif k == 'uid':  # note some Exmoor uids have internal spaces
                #v = myutils.GAPS_REGEX.sub('', v)  # strip any spaces in uids
                text = myutils.GAPS_REGEX.sub(' ', v)  # normalise any internal space
                v = text.strip()  # strip leading and trailing space
            elif k == 'url' or k.endswith('_url'):
                text = myutils.GAPS_REGEX.sub('', v)  # strip any spaces in urls
                v = scrapeutils.JSESS_REGEX.sub('', text)  # strip any jsessionid parameter
            elif k.endswith('_date') or k.startswith('date_'):
                dt = myutils.get_dt(v, self._response_date_format)
                if not dt:
                    v = None
                else:
                    v = dt.isoformat()
                    if v <= '1970-01-01':  # special processing for bad dates inserted where "N/A" appears on screen
                        v = None
            else:
                text = scrapeutils.TAGS_REGEX.sub(' ', v)  # replace any html tag content with spaces
                #try:
                # use beautiful soup to convert html entities to unicode strings
                text = BeautifulSoup(text, convertEntities="html").contents[0].string
                #except:
                #    pass
                text = myutils.GAPS_REGEX.sub(' ', text)  # normalise any internal space
                v = text.strip()  # strip leading and trailing space
        if not v:  # delete entry if the final value is empty
            del record[k]
        else:
            record[k] = v
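_clean_record leans on regexes defined in the project's helper modules. A minimal sketch of plausible definitions (only the names come from the code; the pattern bodies are assumptions for illustration):

import re

GAPS_REGEX = re.compile(r'\s+')                                # assumed: runs of whitespace
TAGS_REGEX = re.compile(r'<[^<>]+>')                           # assumed: naive HTML tag matcher
JSESS_REGEX = re.compile(r';jsessionid=[0-9A-Za-z!.]+', re.I)  # assumed: URL session ids

print(GAPS_REGEX.sub(' ', 'EX  /123\t456').strip())            # EX /123 456
print(JSESS_REGEX.sub('', 'http://x/app;jsessionid=AB12cd'))   # http://x/app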
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url == None:
            return sources

        if not debridstatus == 'true':
            raise Exception()

        url = urlparse.urljoin(self.base_link, url)
        # print("DLTUBE MOVIES SOURCES", url)
        r = client.request(url)
        movie_id = re.findall('<input type="hidden" value="(\d+)" name="movie_id"', r)[0]
        # print("DLTUBE MOVIES ID", movie_id)

        download_link = urlparse.urljoin(self.base_link, self.download_link)
        p = urllib.urlencode({'movie': movie_id.encode('utf-8')})
        r = client.request(download_link, post=p, XHR=True)
        r = BeautifulSoup(r)
        r = r.findAll('p')

        ext = ['.avi', '.mkv', '.mov', '.mp4', '.xvid', '.divx']
        locDict = [(i.rsplit('.', 1)[0], i) for i in hostprDict if not i in ext]
        # print("DLTUBE MOVIES SOURCES 2", r)

        for link in r:
            try:
                link = str(link)
                host = re.findall('Downloads-Server(.+?)(?:\'|\")\)', link)[0]
                # print("DLTUBE MOVIES SOURCES 3", locDict, host.lower())
                # host = host.strip().lower().split()[-1]
                if 'fichier' in host.lower():
                    host = '1fichier'
                # map the scraped server label onto the known premium host list
                host = [x[1] for x in locDict if x[0].lower() in host.lower()][0]
                # print("DLTUBE MOVIES SOURCES 4", host)
                host = client.replaceHTMLCodes(host)
                host = host.encode('utf-8')
                if not any(value in host for value in hostprDict):
                    raise Exception()

                url = client.parseDOM(link, 'a', ret='href')[0]
                url = client.replaceHTMLCodes(url)
                url = url.encode('utf-8')

                r = client.parseDOM(link, 'a')[0]
                fmt = r.strip().lower().split()

                if '1080p' in fmt:
                    quality = '1080p'
                elif '720p' in fmt:
                    quality = 'HD'
                else:
                    quality = 'SD'  # fallback so untagged links are not dropped by the bare except

                try:
                    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', r)[-1]
                    div = 1 if size.endswith(' GB') else 1024
                    size = float(re.sub('[^0-9|/.|/,]', '', size)) / div
                    info = '%.2f GB' % size
                except:
                    info = ''

                sources.append({'source': host, 'quality': quality, 'provider': 'DLTube',
                                'url': url, 'info': info, 'direct': False, 'debridonly': True})
            except:
                pass

        return sources
    except:
        return sources
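The locDict lookup above matches a scraped server label against the premium host list by bare domain name. A toy illustration (the host values here are made up for the example):

hostprDict = ['uploaded.net', 'rapidgator.net', '1fichier.com']
locDict = [(i.rsplit('.', 1)[0], i) for i in hostprDict]
label = 'Uploaded.net Server'
print([x[1] for x in locDict if x[0].lower() in label.lower()][0])  # uploaded.net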
def _text(self, value):
    value = BeautifulSoup(value.strip()).text
    return value.replace(u'\xa0', ' ')  # map non-breaking spaces to plain spaces
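A standalone sketch of what _text does, assuming BeautifulSoup 4 (where .text decodes entities such as &nbsp; to u'\xa0'); the function name below is hypothetical:

from bs4 import BeautifulSoup

def strip_markup(value):
    value = BeautifulSoup(value.strip(), 'html.parser').text
    return value.replace(u'\xa0', ' ')

print(strip_markup(u'<p>foo&nbsp;bar</p>'))  # foo bar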
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]

# the categorized corpus reader collects the respective words for each ThreatType
ThreatTypes = [(list(reader.words(fileid)), category)
               for category in reader.categories()
               for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
print(reader.categories())

new_train = ThreatTypes
print(new_train)

# Naive Bayes classifiers assume that the value of a particular feature is
# independent of the value of any other feature, given the class variable.
cl = NaiveBayesClassifier(train)

# update the classifier with training keywords from the categorized corpora
cl.update(new_train)

inputpath = nltk.data.find('corpora/abc/threatdescp.txt')
f = open(inputpath, encoding='latin2')
outputpath = nltk.data.find('corpora/abc/ResultNB.txt')
ResultFile = open(outputpath, 'w', encoding='latin2')

for line in f:
    line = BeautifulSoup(line.strip()).text
    Threattype = cl.classify(line)
    finalText = line + '|' + Threattype
    print(finalText)
    ResultFile.writelines(finalText)
    ResultFile.writelines("\n")
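The classifier flow above in a minimal self-contained form, assuming TextBlob's NaiveBayesClassifier (the training pairs are toy examples, not the real corpora):

from textblob.classifiers import NaiveBayesClassifier

train = [
    ('stolen credentials were used to open accounts', 'IdentityThreat'),
    ('a disgruntled employee copied the customer database', 'InsiderThreat'),
    ('the dropper installed a keylogger on the host', 'Malware'),
]
cl = NaiveBayesClassifier(train)

# update() accepts further (text, label) pairs, as done with ThreatTypes above
cl.update([('ransomware encrypted the file server', 'Malware')])

print(cl.classify('a phishing mail delivered a trojan'))  # e.g. 'Malware'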
class Book(Item):
    def __init__(self, cf_title, bid, author):
        super(Book, self).__init__(cf_title)
        self.id = bid
        self.author = author

    def generate_title_candidates(self):
        """ generate title candidates for books """
        for c in '{}[]\n.':
            self.cf_title = self.cf_title.replace(c, '')
        self.cf_title = self.cf_title.split(':')[0]
        self.cf_title = self.cf_title.split('(')[0]
        if len(self.cf_title) > 1:
            if self.cf_title[0] != self.cf_title[0].upper() or \
                    self.cf_title[1] != self.cf_title[1].lower():
                self.cf_title = self.cf_title[0].upper() + \
                    self.cf_title[1:].lower()
        ce = BeautifulSoup.HTML_ENTITIES
        self.cf_title = BeautifulSoup(self.cf_title, convertEntities=ce)
        self.cf_title = self.cf_title.contents[0]
        self.cf_title = self.cf_title.replace('reg;', '')
        self.cf_title = self.cf_title.replace(';', '')
        self.cf_title = self.cf_title.replace('(R)', '')
        self.cf_title = self.cf_title.replace('(r)', '')
        keys = {self.cf_title.strip()}

        # handle prefix/suffix swaps, e.g., "Haine, La"
        prefixes = {'The', 'A', 'An', 'La', 'Le', 'Les', 'Die', 'Das', 'Der',
                    'Ein', 'Il', "L'", 'Lo', 'I', 'El', 'Los', 'Las', 'O'}
        new_keys = set()
        for k in keys:
            parts = k.split(' ')
            if len(parts) > 1 and parts[0].strip() in prefixes:
                new_keys.add(' '.join(parts[1:]))
        keys |= new_keys

        # add "The" to the beginning, if it is not already there
        new_keys = set()
        for k in keys:
            p = k.split(' ')[0]
            if p not in prefixes:
                new_keys.add('The ' + k)
        keys |= new_keys

        # adapt capitalization to the Wikipedia Manual of Style
        # (this is only a heuristic)
        new_keys = set()
        minuscles = {'a', 'an', 'the', 'and', 'but', 'or', 'nor', 'for',
                     'yet', 'of', 'to', 'in', 'on', 'with'}
        for k in keys:
            parts = k.split(' ')
            parts = [p for p in parts if p]
            parts_new = [parts[0]]
            for p in parts[1:]:
                if p.lower() not in minuscles:
                    parts_new.append(p[0].upper() + p[1:])
                else:
                    parts_new.append(p)
            new_keys.add(' '.join(parts_new))
        keys |= new_keys

        author_last = self.author.rsplit(' ', 1)[-1]
        book = [k + ' (' + author_last + ' book)' for k in keys]
        booka = [k + ' (book)' for k in keys]
        novel = [k + ' (novel)' for k in keys]
        novela = [k + ' (' + author_last + ' novel)' for k in keys]
        keys.update(set(book), set(novel), set(booka), set(novela))
        self.title_candidates = {k: '' for k in keys}

    def select_title(self):
        """ select the title among the candidates
        and check if it's actually a book
        """
        super(Book, self).select_title(['books', 'novels', 'plays'])

        # sanity check - is this really a relevant article?
        if self.wikipedia_text:
            regex = re.compile('\[\[Category:([^#\|\]]+)', flags=re.IGNORECASE)
            data = self.title_candidates[self.wikipedia_title]
            categories = ' '.join(regex.findall(data))
            occurrences = categories.lower().count('books')
            occurrences += categories.lower().count('novels')
            occurrences += categories.lower().count('plays')
            occurrences += categories.lower().count('short story')
            if not occurrences:
                self.wikipedia_text = ''
                print('did not pass sanity check')
            if not self.author.split()[-1].lower() in self.wikipedia_text.lower():
                if DEBUG:
                    pdb.set_trace()
                self.wikipedia_text = ''
                print('author not in text')
        del self.title_candidates

    def obtain_categories(self):
        """scrape book categories from Google"""
        # sleep in-between to not get banned for too frequent requests
        if DEBUG:
            t = 1
        else:
            t = random.randint(10, 19)
        print('DEBUG')
        print('sleeping for', t, 'seconds')
        time.sleep(t)
        title = urllib.quote(urllib.unquote(self.wikipedia_title.encode()))
        query = '"' + title.replace('_', '+') + '"+' + 'genre'
        url = u"https://www.google.com/search?hl=en&biw=1195&bih=918" + \
              u"&sclient=psy-ab&q=" + query + u"&btnG=&oq=&gs_l=&pbx=1"
        try:
            request = urllib2.Request(url)
            # choose a random user agent
            ua = random.choice(Item.url_headers)
            request.add_header('User-agent', ua)
            data = Item.url_opener.open(request).read()
            data = data.decode('utf-8')
            if self.author.split()[-1].lower() not in data.lower():  # sanity check
                self.wikipedia_text = ''
                return []
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print('!+!+!+!+!+!+!+!+ URLLIB ERROR !+!+!+!+!+!+!+!+')
            print('URLError', e)
            pdb.set_trace()
        rexes = [
            # r'<span class="kno-a-v">([^</]+)',
            # r'<span class="answer_slist_item_title nonrich">([^</]+)',
            # r'<span class="answer_slist_item_title">([^</]+)',
            r'Genres\s*(?:</span>)?(?:</a>)?:\s*(?:</span>)?\s*<span class="[-\_\sa-zA-Z]+">([^</]+)',
            r'Genre</td><td(?:[^</]*)>([^</]+)',
            r'Genre</th></tr><td(?:[^</]*)>([^</]+)',
        ]
        re_cat = re.compile('|'.join(rexes))
        cats = [e for g in re.findall(re_cat, data) for e in g if e]
        # cats = [g for g in re.findall(re_cat, data) if g]
        print(self.wikipedia_title)
        print(cats)
        if DEBUG:
            pdb.set_trace()
        cats = list(set(cats))
        if not cats:  # sanity check
            self.wikipedia_text = ''
        return cats

    def write_to_database(self, db_file):
        super(Book, self).write_to_database('books', db_file)
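A hedged end-to-end usage sketch for Book (Item, DEBUG, and the url/database helpers come from the surrounding project and are assumed available; the title and id are made up):

book = Book('the name of the rose (paperback)', 42, 'Umberto Eco')
book.generate_title_candidates()       # builds variants such as 'The Name of the Rose (novel)'
book.select_title()                    # resolves candidates against Wikipedia, with sanity checks
categories = book.obtain_categories()  # scrapes genre strings from Google results
book.write_to_database('books.db')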