def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url == None:
            return sources

        if not debridstatus == 'true':
            raise Exception()

        url = urlparse.urljoin(self.base_link, url)
        r = client.request(url)
        movie_id = re.findall('<input type="hidden" value="(\d+)" name="movie_id"', r)[0]

        download_link = urlparse.urljoin(self.base_link, self.download_link)
        p = urllib.urlencode({'movie': movie_id.encode('utf-8')})
        r = client.request(download_link, post=p, XHR=True)
        r = BeautifulSoup(r)
        r = r.findAll('tr')

        ext = ['.avi', '.mkv', '.mov', '.mp4', '.xvid', '.divx']
        locDict = [(i.rsplit('.', 1)[0], i) for i in hostprDict if not i in ext]

        for link in r:
            try:
                link = str(link)
                href = re.findall('<a href="(.+?)" class="down-btn-epd', str(link))[0]
                try:
                    # derive the host name from the link's domain
                    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)[0]
                except:
                    host = 'Dltube'

                r = client.parseDOM(link, 'span')[0]
                fmt = r.strip().lower().split()

                if '1080p' in fmt:
                    quality = '1080p'
                elif '720p' in fmt:
                    quality = 'HD'
                else:
                    quality = 'SD'  # fallback so untagged links are not dropped by the bare except

                try:
                    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', r)[-1]
                    div = 1 if size.endswith(' GB') else 1024
                    size = float(re.sub('[^0-9|/.|/,]', '', size)) / div
                    info = '%.2f GB' % size
                except:
                    info = ''

                sources.append({'source': host, 'quality': quality, 'provider': 'DLTube',
                                'url': href, 'info': info, 'direct': False, 'debridonly': True})
            except:
                pass

        return sources
    except:
        return sources
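The size-normalisation step used by both sources() variants is easiest to check in isolation. A minimal standalone sketch of that logic (the function name is illustrative, not part of the scraper):

import re

def parse_size_gb(text):
    """Return the last 'N MB' / 'N GB' token in `text`, normalised to GB."""
    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', text)[-1]
    div = 1 if size.endswith(' GB') else 1024  # MB values are scaled down to GB
    return float(re.sub('[^0-9|/.|/,]', '', size)) / div

print('%.2f GB' % parse_size_gb('Movie.2016.720p.BluRay 700 MB'))   # 0.68 GB
print('%.2f GB' % parse_size_gb('Movie.2016.1080p.BluRay 1.4 GB'))  # 1.40 GB

Note that comma-decimal sizes such as '1,4 GB' survive the regex but still fail at float(); in the scraper that case falls through the bare except and info stays empty.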
def parse_unescape(self, location, encoding=None):
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder)
    try:
        location = unescape(location)
    except:
        try:
            location = BeautifulSoup(location, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            pass
    # self.tree = parser.parse(unescape(location), encoding=encoding)
    try:
        self.tree = parser.parse(location.strip(), encoding=encoding)
    except:
        pass
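A hedged usage sketch of the html5lib path above. Recent html5lib releases renamed the encoding keyword on parse() (current versions take override_encoding), so the call in parse_unescape assumes an older API:

import html5lib

builder = html5lib.treebuilders.getTreeBuilder("dom")
parser = html5lib.HTMLParser(tree=builder)
tree = parser.parse(u'<p>Caf&eacute;</p>')          # returns an xml.dom.minidom document
print(tree.getElementsByTagName('p')[0].toxml())    # <p>Café</p>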
def _clean_record(self, record):
    ' post process a scraped record: parses dates, converts to ISO8601 format, strips spaces, tags etc '
    #self.logger.debug("Raw record to clean: %s", str(record))
    for k, v in record.items():
        if v:
            if isinstance(v, list):
                v = ' '.join(v)
            if not v or not myutils.GAPS_REGEX.sub('', v):  # always remove any empty fields
                v = None
            elif k == 'uid':  # note some Exmoor uids have internal spaces
                #v = myutils.GAPS_REGEX.sub('', v)  # strip any spaces in uids
                text = myutils.GAPS_REGEX.sub(' ', v)  # normalise any internal space
                v = text.strip()  # strip leading and trailing space
            elif k == 'url' or k.endswith('_url'):
                text = myutils.GAPS_REGEX.sub('', v)  # strip any spaces in urls
                v = scrapeutils.JSESS_REGEX.sub('', text)  # strip any jsessionid parameter
            elif k.endswith('_date') or k.startswith('date_'):
                dt = myutils.get_dt(v, self._response_date_format)
                if not dt:
                    v = None
                else:
                    v = dt.isoformat()
                    if v <= '1970-01-01':  # special processing for bad dates inserted where "N/A" appears on screen
                        v = None
            else:
                text = scrapeutils.TAGS_REGEX.sub(' ', v)  # replace any html tag content with spaces
                #try:
                # use beautiful soup to convert html entities to unicode strings
                text = BeautifulSoup(text, convertEntities="html").contents[0].string
                #except:
                #    pass
                text = myutils.GAPS_REGEX.sub(' ', text)  # normalise any internal space
                v = text.strip()  # strip leading and trailing space
        if not v:  # delete entry if the final value is empty
            del record[k]
        else:
            record[k] = v
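_clean_record leans on regexes defined in the project's helper modules. A minimal sketch of plausible definitions (only the names come from the code; the pattern bodies are assumptions for illustration):

import re

GAPS_REGEX = re.compile(r'\s+')                                # assumed: runs of whitespace
TAGS_REGEX = re.compile(r'<[^<>]+>')                           # assumed: naive HTML tag matcher
JSESS_REGEX = re.compile(r';jsessionid=[0-9A-Za-z!.]+', re.I)  # assumed: URL session ids

print(GAPS_REGEX.sub(' ', 'EX  /123\t456').strip())            # EX /123 456
print(JSESS_REGEX.sub('', 'http://x/app;jsessionid=AB12cd'))   # http://x/app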
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url == None:
            return sources

        if not debridstatus == 'true':
            raise Exception()

        url = urlparse.urljoin(self.base_link, url)
        # print("DLTUBE MOVIES SOURCES", url)
        r = client.request(url)
        movie_id = re.findall('<input type="hidden" value="(\d+)" name="movie_id"', r)[0]
        # print("DLTUBE MOVIES ID", movie_id)

        download_link = urlparse.urljoin(self.base_link, self.download_link)
        p = urllib.urlencode({'movie': movie_id.encode('utf-8')})
        r = client.request(download_link, post=p, XHR=True)
        r = BeautifulSoup(r)
        r = r.findAll('p')

        ext = ['.avi', '.mkv', '.mov', '.mp4', '.xvid', '.divx']
        locDict = [(i.rsplit('.', 1)[0], i) for i in hostprDict if not i in ext]
        # print("DLTUBE MOVIES SOURCES 2", r)

        for link in r:
            try:
                link = str(link)
                host = re.findall('Downloads-Server(.+?)(?:\'|\")\)', link)[0]
                # print("DLTUBE MOVIES SOURCES 3", locDict, host.lower())
                # host = host.strip().lower().split()[-1]
                if 'fichier' in host.lower():
                    host = '1fichier'
                # map the scraped server label onto the known premium host list
                host = [x[1] for x in locDict if x[0].lower() in host.lower()][0]
                # print("DLTUBE MOVIES SOURCES 4", host)
                host = client.replaceHTMLCodes(host)
                host = host.encode('utf-8')
                if not any(value in host for value in hostprDict):
                    raise Exception()

                url = client.parseDOM(link, 'a', ret='href')[0]
                url = client.replaceHTMLCodes(url)
                url = url.encode('utf-8')

                r = client.parseDOM(link, 'a')[0]
                fmt = r.strip().lower().split()

                if '1080p' in fmt:
                    quality = '1080p'
                elif '720p' in fmt:
                    quality = 'HD'
                else:
                    quality = 'SD'  # fallback so untagged links are not dropped by the bare except

                try:
                    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', r)[-1]
                    div = 1 if size.endswith(' GB') else 1024
                    size = float(re.sub('[^0-9|/.|/,]', '', size)) / div
                    info = '%.2f GB' % size
                except:
                    info = ''

                sources.append({'source': host, 'quality': quality, 'provider': 'DLTube',
                                'url': url, 'info': info, 'direct': False, 'debridonly': True})
            except:
                pass

        return sources
    except:
        return sources
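The locDict lookup above matches a scraped server label against the premium host list by bare domain name. A toy illustration (the host values here are made up for the example):

hostprDict = ['uploaded.net', 'rapidgator.net', '1fichier.com']
locDict = [(i.rsplit('.', 1)[0], i) for i in hostprDict]
label = 'Uploaded.net Server'
print([x[1] for x in locDict if x[0].lower() in label.lower()][0])  # uploaded.net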
def _text(self, value):
    value = BeautifulSoup(value.strip()).text
    return value.replace(u'\xa0', ' ')  # map non-breaking spaces to plain spaces
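A standalone sketch of what _text does, assuming BeautifulSoup 4 (where .text decodes entities such as &nbsp; to u'\xa0'); the function name below is hypothetical:

from bs4 import BeautifulSoup

def strip_markup(value):
    value = BeautifulSoup(value.strip(), 'html.parser').text
    return value.replace(u'\xa0', ' ')

print(strip_markup(u'<p>foo&nbsp;bar</p>'))  # foo bar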
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]

# the categorized corpus reader collects the respective words for each ThreatType
ThreatTypes = [(list(reader.words(fileid)), category)
               for category in reader.categories()
               for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
print(reader.categories())

new_train = ThreatTypes
print(new_train)

# Naive Bayes classifiers assume that the value of a particular feature is
# independent of the value of any other feature, given the class variable.
cl = NaiveBayesClassifier(train)

# update the classifier with training keywords from the categorized corpora
cl.update(new_train)

inputpath = nltk.data.find('corpora/abc/threatdescp.txt')
f = open(inputpath, encoding='latin2')
outputpath = nltk.data.find('corpora/abc/ResultNB.txt')
ResultFile = open(outputpath, 'w', encoding='latin2')

for line in f:
    line = BeautifulSoup(line.strip()).text
    Threattype = cl.classify(line)
    finalText = line + '|' + Threattype
    print(finalText)
    ResultFile.writelines(finalText)
    ResultFile.writelines("\n")
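The classifier flow above in a minimal self-contained form, assuming TextBlob's NaiveBayesClassifier (the training pairs are toy examples, not the real corpora):

from textblob.classifiers import NaiveBayesClassifier

train = [
    ('stolen credentials were used to open accounts', 'IdentityThreat'),
    ('a disgruntled employee copied the customer database', 'InsiderThreat'),
    ('the dropper installed a keylogger on the host', 'Malware'),
]
cl = NaiveBayesClassifier(train)

# update() accepts further (text, label) pairs, as done with ThreatTypes above
cl.update([('ransomware encrypted the file server', 'Malware')])

print(cl.classify('a phishing mail delivered a trojan'))  # e.g. 'Malware'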
class Book(Item):
    def __init__(self, cf_title, bid, author):
        super(Book, self).__init__(cf_title)
        self.id = bid
        self.author = author

    def generate_title_candidates(self):
        """ generate title candidates for books """
        for c in '{}[]\n.':
            self.cf_title = self.cf_title.replace(c, '')
        self.cf_title = self.cf_title.split(':')[0]
        self.cf_title = self.cf_title.split('(')[0]
        if len(self.cf_title) > 1:
            if self.cf_title[0] != self.cf_title[0].upper() or \
                    self.cf_title[1] != self.cf_title[1].lower():
                self.cf_title = self.cf_title[0].upper() + \
                    self.cf_title[1:].lower()
        ce = BeautifulSoup.HTML_ENTITIES
        self.cf_title = BeautifulSoup(self.cf_title, convertEntities=ce)
        self.cf_title = self.cf_title.contents[0]
        self.cf_title = self.cf_title.replace('reg;', '')
        self.cf_title = self.cf_title.replace(';', '')
        self.cf_title = self.cf_title.replace('(R)', '')
        self.cf_title = self.cf_title.replace('(r)', '')
        keys = {self.cf_title.strip()}

        # handle prefix/suffix swaps, e.g., "Haine, La"
        prefixes = {'The', 'A', 'An', 'La', 'Le', 'Les', 'Die', 'Das', 'Der',
                    'Ein', 'Il', "L'", 'Lo', 'I', 'El', 'Los', 'Las', 'O'}
        new_keys = set()
        for k in keys:
            parts = k.split(' ')
            if len(parts) > 1 and parts[0].strip() in prefixes:
                new_keys.add(' '.join(parts[1:]))
        keys |= new_keys

        # add "The" to the beginning, if it is not already there
        new_keys = set()
        for k in keys:
            p = k.split(' ')[0]
            if p not in prefixes:
                new_keys.add('The ' + k)
        keys |= new_keys

        # adapt capitalization to the Wikipedia Manual of Style
        # (this is only a heuristic)
        new_keys = set()
        minuscles = {'a', 'an', 'the', 'and', 'but', 'or', 'nor', 'for',
                     'yet', 'of', 'to', 'in', 'on', 'with'}
        for k in keys:
            parts = k.split(' ')
            parts = [p for p in parts if p]
            parts_new = [parts[0]]
            for p in parts[1:]:
                if p.lower() not in minuscles:
                    parts_new.append(p[0].upper() + p[1:])
                else:
                    parts_new.append(p)
            new_keys.add(' '.join(parts_new))
        keys |= new_keys

        author_last = self.author.rsplit(' ', 1)[-1]
        book = [k + ' (' + author_last + ' book)' for k in keys]
        booka = [k + ' (book)' for k in keys]
        novel = [k + ' (novel)' for k in keys]
        novela = [k + ' (' + author_last + ' novel)' for k in keys]
        keys.update(set(book), set(novel), set(booka), set(novela))
        self.title_candidates = {k: '' for k in keys}

    def select_title(self):
        """ select the title among the candidates
        and check if it's actually a book
        """
        super(Book, self).select_title(['books', 'novels', 'plays'])

        # sanity check - is this really a relevant article?
        if self.wikipedia_text:
            regex = re.compile('\[\[Category:([^#\|\]]+)', flags=re.IGNORECASE)
            data = self.title_candidates[self.wikipedia_title]
            categories = ' '.join(regex.findall(data))
            occurrences = categories.lower().count('books')
            occurrences += categories.lower().count('novels')
            occurrences += categories.lower().count('plays')
            occurrences += categories.lower().count('short story')
            if not occurrences:
                self.wikipedia_text = ''
                print('did not pass sanity check')
            if not self.author.split()[-1].lower() in self.wikipedia_text.lower():
                if DEBUG:
                    pdb.set_trace()
                self.wikipedia_text = ''
                print('author not in text')
        del self.title_candidates

    def obtain_categories(self):
        """scrape book categories from Google"""
        # sleep in-between to not get banned for too frequent requests
        if DEBUG:
            t = 1
        else:
            t = random.randint(10, 19)
        print('DEBUG')
        print('sleeping for', t, 'seconds')
        time.sleep(t)
        title = urllib.quote(urllib.unquote(self.wikipedia_title.encode()))
        query = '"' + title.replace('_', '+') + '"+' + 'genre'
        url = u"https://www.google.com/search?hl=en&biw=1195&bih=918" + \
              u"&sclient=psy-ab&q=" + query + u"&btnG=&oq=&gs_l=&pbx=1"
        try:
            request = urllib2.Request(url)
            # choose a random user agent
            ua = random.choice(Item.url_headers)
            request.add_header('User-agent', ua)
            data = Item.url_opener.open(request).read()
            data = data.decode('utf-8')
            if self.author.split()[-1].lower() not in data.lower():  # sanity check
                self.wikipedia_text = ''
                return []
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print('!+!+!+!+!+!+!+!+ URLLIB ERROR !+!+!+!+!+!+!+!+')
            print('URLError', e)
            pdb.set_trace()
        rexes = [
            # r'<span class="kno-a-v">([^</]+)',
            # r'<span class="answer_slist_item_title nonrich">([^</]+)',
            # r'<span class="answer_slist_item_title">([^</]+)',
            r'Genres\s*(?:</span>)?(?:</a>)?:\s*(?:</span>)?\s*<span class="[-\_\sa-zA-Z]+">([^</]+)',
            r'Genre</td><td(?:[^</]*)>([^</]+)',
            r'Genre</th></tr><td(?:[^</]*)>([^</]+)',
        ]
        re_cat = re.compile('|'.join(rexes))
        cats = [e for g in re.findall(re_cat, data) for e in g if e]
        # cats = [g for g in re.findall(re_cat, data) if g]
        print(self.wikipedia_title)
        print(cats)
        if DEBUG:
            pdb.set_trace()
        cats = list(set(cats))
        if not cats:  # sanity check
            self.wikipedia_text = ''
        return cats

    def write_to_database(self, db_file):
        super(Book, self).write_to_database('books', db_file)
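A hedged end-to-end usage sketch for Book (Item, DEBUG, and the url/database helpers come from the surrounding project and are assumed available; the title and id are made up):

book = Book('the name of the rose (paperback)', 42, 'Umberto Eco')
book.generate_title_candidates()       # builds variants such as 'The Name of the Rose (novel)'
book.select_title()                    # resolves candidates against Wikipedia, with sanity checks
categories = book.obtain_categories()  # scrapes genre strings from Google results
book.write_to_database('books.db')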