Ejemplo n.º 1
1
def careermatch(username, password):
        browser = mechanize.Browser(factory=mechanize.RobustFactory())
        browser.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)')]
        browser.set_handle_robots(False)
        browser.open("http://training.careermatch-uk.com/eco_login.php")
        browser.select_form(nr=0)
        browser.form.set_all_readonly(False)
        browser["txtUserId"] = username
        browser["txtPassword"] = password
        browser.submit()
        browser.open("http://training.careermatch-uk.com/eco_learn.php?filter=enrolled")
        html1 = browser.response().read()
        for course in re.findall(r'<dt>(.*?)</dt>', html1):
                if "href" in course:
                    soup1 = BeautifulSoup(course, "html.parser")
                    tag1 = soup1.find("a")
                    url1 = "http://training.careermatch-uk.com/"+tag1["href"]
                    name1 = soup1.getText()
                    print name1
                    browser.open(url1)
                    html2 = browser.response().read()
                    for topic in re.findall(r'<dt>(.*?)</dt>', html2):
                        if "href" in topic:
                            soup2 = BeautifulSoup(topic, "html.parser")
                            tag2 = soup2.find("a")
                            url2 = "http://training.careermatch-uk.com/"+tag2["href"]
                            name2 = soup2.getText()
                            print "\t"+name2
                            browser.open(url2)
                            html3 = browser.response().read()
                            pattern = re.compile(r'\s+')
                            html3 = re.sub(pattern, ' ', html3)
                            for mp4 in re.findall(r'{ file: "(.*?)" , label: "SD"}', html3):
                                url3 = "http"+mp4.split('http')[-1]
                                name3 = url3.split('/')[-1]
                                print "\t\t"+name3
                                if not os.path.isfile(name3):
                                    time.sleep(1)
                                    u = urllib2.urlopen(url3)
                                    f = open(name3, 'wb')
                                    meta = u.info()
                                    file_size = int(meta.getheaders("Content-Length")[0])
                                    print "\t\t\tDownloading: %s Size: %s MB" % (name3, file_size/(1024*1024))
                                    print "\t\t\t",
                                    file_size_dl = 0
                                    block_sz = 8192
                                    status = ""
                                    time.sleep(1)
                                    while True:
                                        buffer = u.read(block_sz)
                                        if not buffer:
                                            break
                                        file_size_dl += len(buffer)
                                        f.write(buffer)
                                        if r"[%3.0f%%]" % (file_size_dl * 50. / file_size) != status:
                                            status = r"[%3.0f%%]" % (file_size_dl * 50. / file_size)
                                            print ".",
                                    f.close()
                                    print "\n",
Ejemplo n.º 2
0
def grabing(data,ipPort):
    
    soup = BeautifulSoup(data)
    a_list = soup.find_all("a",attrs={"itemprop": "url"})
    a_list.pop(0)
    content_list = []
    f=open('D:\\novel.txt','w')
    for atag in a_list:
        url = re.search(r'http://read.qidian.com/BookReader/4fknnsotQvLZ6ZDT--NUMw2,?.*\.aspx',str(atag))
        txt = re.search(r'<span itemprop="headline">.*</span>',str(atag))
        soup = BeautifulSoup(txt.group())
        file_name = soup.getText()
        print(soup.getText())
        f.write(soup.getText() + "\n")
        
        d = agentGo(url.group(),ipPort)
        soup = BeautifulSoup(d)
        
        div_content = soup.find("div",id="chaptercontent")
        txt_url = re.search(r'http://files.qidian.com/Author6/?.*\.txt', str(div_content),re.DOTALL)
        #print(txt_url.group())
        urllib.request.urlretrieve(txt_url.group(),"F:\\novel\\" + file_name + ".txt")
        
        # Save to a txt file
#         oper = urllib.request.urlopen(getReq(txt_url.group()))
#         data = oper.read()
#         content = str(data)
#         content_list.append(content)
#         print(content)
#         f.write(content + "\n")
    
    f.close()
Ejemplo n.º 3
0
def get_content(soup):
    enpcontent = re.findall('<!--enpcontent-->(.*)<!--enpcontent-->',str(soup), re.DOTALL)
    if enpcontent:
        enpcontent = enpcontent[0]
        contentsoup = BS(enpcontent)
        text = contentsoup.getText().encode('utf-8')
        return text
Ejemplo n.º 4
0
def stackoverflowESLoader(folder):
    """Loads into ES all Stackoverflow data files located in a folder and subfolders
    
    Inputs:
        folder: directory where the data is located. Must contain a subfolder
            for each stackexchange community, with a Posts.xml file with
            the community posts
    """
    
    tagdetector = re.compile("<[^>]+>")
    
    # Iterate over each subolder (community)
    for communityfolder in glob.glob(folder + '/*.com'):
        # Get community name
        communityname = communityfolder.split('/')[-1]
        # Process the post file for such community
        root = xml.etree.ElementTree.parse(communityfolder + "/Posts.xml").getroot()
        # Get all posts
        for row in root.iter('row'):
            id = row.attrib["Id"]
            body = {}
            # Get text withouth the HTML tags
            html = row.attrib["Body"]
            soup = BeautifulSoup(html, "lxml")
            body["text"] = soup.getText()
            # Get post tags
            if "Tags" in row.attrib:
                tagsraw = row.attrib["Tags"]
                body["tags"] = tagdetector.findall(tagsraw)
            # Add to Elasticsearch
            es.index(index=communityname, doc_type="post", id=id, body=body,
                     request_timeout=30)
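The Tags attribute in Posts.xml is a run of angle-bracketed names that the compiled tagdetector pattern picks apart. A tiny sketch of that step on a made-up string (this variant captures just the names, whereas the pattern above keeps the brackets):

import re

tagsraw = "<python><beautifulsoup><elasticsearch>"
print(re.findall(r"<([^>]+)>", tagsraw))  # ['python', 'beautifulsoup', 'elasticsearch']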
Ejemplo n.º 5
0
    def feed_page(feed_link=None, feed_index=None):
        import time
        import feedparser
        from bs4 import BeautifulSoup
        from unidecode import unidecode
        from string import lower

        if not feed_index:
            try:
                if not feed_link:
                    # No feed link was provided, so show the first one
                    feed_index = 0
                else:
                    # Try to extract the feed link number
                    feed_index = [x[1] for x in app.config.get('FEEDS')].index(feed_link)
            except ValueError as ve:
                abort(404)
            except Exception as e:
                print(e)
                import traceback
                print(traceback.format_exc())
                feed_index = 0

        try:
            feedDesc, feedLink, feedURL = app.config.get('FEEDS')[feed_index]
        except:
            #feedDesc, feedLink, feedURL = ('Test Feed', 'test_feed', 'http://feedparser.org/docs/examples/atom10.xml')
            abort(404)

        rss = feedparser.parse(feedURL)

        maxEntries = 12 
        rssFormatted = []
        for post in rss.entries[:maxEntries]:
            date_p = post.published_parsed
            date = time.strftime("%d.%m.%Y", date_p)
            
            catstr = unidecode(lower(post.category))
            for fr,to in [(u"tutkimus", u"research"),
                          (u"palkitut", u"honored"),
                          (u"opiskelu", u"studies"),
                          (u"yhteistyo", u"cooperation"),
                          (u"muut", u"other")]:
                catstr = catstr.replace(fr, to)
            category = catstr.split(',')

            # We use BeautifulSoup to strip html from the description
            descSoup = BeautifulSoup(post.description)
            desc = descSoup.getText()
            postDict = {'date': date,
                        'category': category,
                        'category_unformatted': post.category.replace(u",", u", "),
                        'title': post.title,
                        'description': desc,
                        'link': post.link}
            rssFormatted.append(postDict)
        try:
            return render_template('index.html', entries=rssFormatted)
        except TemplateNotFound:
            abort(404)
Ejemplo n.º 6
0
def clean_up(aLink):
    html = urllib2.urlopen(aLink).read()
    soup = BeautifulSoup(html,"html5lib")
    texts = soup.findAll(text=True)
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText().encode('utf-8').strip()
    l = visible_text.split('\n')
    l = filter(lambda element: ('â' not in element) and ('€' not in element) and (element != '') and ('\xc2\xa0' not in element), l)
   
    #Cleaning up the crawled data
    for i in range(len(l)):
        if 'for discount' in l[i]:
            deals_start = i + 1

    clean_list = []
    for i in range(deals_start, len(l)):
        
        if 'COMMENT' in l[i]:
            l[i] = '************break***************'
        elif (l[i].isupper()):
            continue
        elif 'miss a single chance' in l[i]:
            break
       
      
        clean_list.append(l[i])

       
    return clean_list
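The [s.extract() for s in soup([...])] idiom used here (and in several later examples) strips non-visible elements before getText() runs. A minimal self-contained sketch, with an illustrative HTML string:

from bs4 import BeautifulSoup

def visible_text(html):
    soup = BeautifulSoup(html, "html.parser")
    # Remove elements whose contents should not appear as visible text.
    for tag in soup(["style", "script", "head", "title"]):
        tag.extract()
    return soup.getText()

print(visible_text("<html><head><title>t</title></head>"
                   "<body><script>x()</script><p>Hello</p></body></html>"))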
Ejemplo n.º 7
0
    def _thread(self, args):
        category = args

        # browser asks question
        self._text_to_speech("What do you want to load, buddy?")

        # user gives answer
        answer = self.speech_to_text.convert()
        if not answer: return

        # get url from search engine
        url = search_engine(category, answer)
        if not url: return

        # browser tells user that content is being retrieved
        self._text_to_speech("Cool. I will get you stuff now...")

        # get web content
        request = requests.get(url)
        soup = BeautifulSoup(request.text, 'lxml') 

        # get text from web content
        [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
        text = soup.getText()
                
        # speak each line of text        
        try:
            for line in text.split('\n'):
                if self.is_stop: return
                                
                if len(line) >= self.MIN_LINE_LENGTH:
                    self._text_to_speech(line)
        except:
            print "Browser: error converting text to speech"
Ejemplo n.º 8
0
def get_cleantext(html) :
	soup = BeautifulSoup(html)
	[s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
	cleantext = soup.getText()
	cleantext = cleantext.encode('utf8', 'ignore')
	cleantext = " ".join(cleantext.split())
	return cleantext
Ejemplo n.º 9
0
def getRssInfo():
    url = "http://www.huxiu.com/rss/0.xml"
    d = feedparser.parse(url)
    #     print d.feed.title
    #     print d.feed.link
    #     print d.feed.description
    # print d.etag
    # print d.modifed
    infoList = []
    for entry in d.entries:
        info = {}
        info["url"] = entry.link
        info["newsid"] = getMd5(info["url"])
        info["title"] = entry.title
        info["description"] = entry.description
        info["ctime"] = (long)(time.mktime(entry.published_parsed))
        info["author"] = entry.source.title
        info["source"] = ctable
        info["keywords"] = ""
        #         print entry
        #         print info['url']
        #         print info['newsid']
        #         print info['title'],info['ctime']
        #         print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(info['ctime'])),info['author']
        #         print entry.published_parsed
        #         print info['description']
        soup = BeautifulSoup(entry.description, "html.parser", from_encoding="utf-8")
        img = soup.find("img")
        info["thumb"] = img.get("src") if img else ""
        info["summary"] = soup.getText()
        #         print info['thumb']
        #         print info['summary']
        infoList.append(info)
    return infoList
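A short isolation of the description-parsing step above: feedparser entries carry HTML in entry.description, and BeautifulSoup pulls out a thumbnail URL and a plain-text summary. The HTML snippet below stands in for a real entry:

from bs4 import BeautifulSoup

description = '<p><img src="http://example.com/t.jpg"/>Some article teaser.</p>'
soup = BeautifulSoup(description, "html.parser")
img = soup.find("img")
thumb = img.get("src") if img else ""
summary = soup.getText()
print(thumb, "|", summary)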
Ejemplo n.º 10
0
def getClasses():
    def getIndex(index):
        return visible_text.index(reqList[index][0]+" "+reqList[index][1])
    soup = BeautifulSoup(open('degree.htm'), 'html.parser')

    categories = soup.find_all("font")
    requirements = dict()
    reqList = list()
    #Looking for things like 1) CSE REQ or maybe > 2) Statistics - Reqd
    valid = re.compile(r".*([1-9]+\))\s([^-]+).*$")
    for req in categories:
        if req.string is not None:
            regexResult = valid.match(req.string)
            if regexResult is not None:
                reqList.append(regexResult.groups())

    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    index = 0

    takenCourses = re.compile(r"((?:FA|SP|WI)(?:15|16|17))\s([A-Z]{3,4}\s+[0-9]{1,3}[A-Z]*)")
    missingReq = re.compile(r"\nNeeds:\s+([1-9]+)\sCourses\n")
    for index in range(1,len(reqList)):
        #Going to have the requirements be (takenCourses, missingCourses)
        requirements[reqList[index-1][1]] = [[],0]
        stringSegment = visible_text[getIndex(index-1):getIndex(index)]
        regexResult = takenCourses.findall(stringSegment)
        for result in regexResult:
            #Add to the taken courses
            requirements[reqList[index-1][1]][0].append(result[0] + " " + result[1])
        regexResult = missingReq.search(stringSegment)
        if regexResult is not None:
            requirements[reqList[index-1][1]][1] = int(regexResult.groups()[0])

    return requirements
Ejemplo n.º 11
    def parse_content(self, url):
        yield from asyncio.sleep(3.0)
        logging.info('Extracting content for: %s', url)
        #extract page content
        try:
            response = urllib.request.urlopen(url)
            content = response.read()
        except URLError:
            print('Error')
            return

        yield from asyncio.sleep(5.0)
        logging.info('Start to parse content for: %s', url)
        soup = BeautifulSoup(content, 'html.parser')

        #parse and store content of pages
        for s in soup(['style', 'script',
                       '[document]', 'head', 'title']):
            s.extract()

        logging.info('Storing Content in for: %s', url)
        asyncio.Task(Page(url, soup.getText()).save())

        logging.info('Updated queue with new links: %s', url)

        asyncio.Task(self._extract_links(soup))
        logging.info('Finish to parse content for: %s', url)
Ejemplo n.º 12
0
def pull_headlines(tweet):
    ent = tweet.get('entities')
    urls = ent.get('urls')
    t = ""
    if urls:
        for u in urls:
            try:
                url = u.get('expanded_url')
                r = requests.get(url)
                headlines = BeautifulSoup(r.content).find('title')
                if not headlines:
                    headlines = BeautifulSoup(r.content).find('h1')
                # remove domain
                domain = '{uri.netloc}'.format(uri=urlparse(url)) + NEWS_DOMAINS
                hwords = [h for h in headlines.getText().split() if h.lower() not in domain]

                t = "%s %s" % (t,' '.join(hwords))
            except:
                continue

    # also pull quoted tweets
    if tweet.get('is_quote_status'):
        try:
            quote = tweet.get('quoted_status').get('text')
        except:
            quote = ''
        t+=quote
    return t
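The headline lookup above falls back from the title tag to h1 and then drops words that also appear in the article's domain. An offline sketch of the title part, with a stand-in for r.content:

from bs4 import BeautifulSoup
from urllib.parse import urlparse

html = "<html><head><title>Example Domain - News</title></head><body></body></html>"
url = "http://example.com/story"

headline = BeautifulSoup(html, "html.parser").find("title")
domain = "{uri.netloc}".format(uri=urlparse(url))
hwords = [h for h in headline.getText().split() if h.lower() not in domain]
print(" ".join(hwords))  # Domain - News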
Ejemplo n.º 13
0
def getBillText():
    bill_id_list = getBillIdList()
    txt_json = getBill(bill_id_list)
    decoded_txt = ''
    txt_json = txt_json['bill']

    #'text' is the first object in the json file, increment to the 'doc'
    txt_json = txt_json['texts']
    txt_json = txt_json[0]
    doc_id = txt_json['doc_id']

    searchId = requests.get('https://api.legiscan.com/?key=2d28553a502d7fed3b68863b2f592f19&op=getBillText&id='+str(doc_id))
    resultsId = searchId.json()
    resultsId = resultsId['text']
    resultsId = resultsId['doc']
    decodedResults = base64.b64decode(resultsId.encode('ascii'))
    bsObj2 = BeautifulSoup(decodedResults)
    bsObj2.style.decompose()
    htmlText = bsObj2.getText()
    #the bill text is MIME:txt/html and base64 encoded. So decode it
    #decoded_txt = base64.b64decode(txt_json.encode('ascii'))

    #the decoded text is an ugly html string. Use BS to parse and clean it
    #This only works when MIME is html, need to account for PDF****
        #bsObj = BeautifulSoup(decoded_txt)

    #use BS to get the text from the bsObj
        #prettyText = bsObj.getText()

    return htmlText
    ''' I've managed to parse the bill ids from the json file and can now use th
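The doc field returned by the LegiScan getBillText call is base64-encoded HTML, so the decode-and-strip step can be tried on its own. The encoded string below is a made-up stand-in for resultsId:

import base64
from bs4 import BeautifulSoup

encoded = base64.b64encode(b"<html><style>p{}</style><body><p>Section 1. Text.</p></body></html>")
decoded = base64.b64decode(encoded)
soup = BeautifulSoup(decoded, "html.parser")
soup.style.decompose()  # drop the <style> block, as above
print(soup.getText())   # Section 1. Text.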
Ejemplo n.º 14
0
def get_surname_info(last_name):
    url = ("https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=intitle:%22"+
           urllib.quote(last_name.encode('utf8'))+"%22AND%22surname%22&srlimit=2&format=json&utf8=")
    response = urllib.urlopen(url)
    data = json.loads(response.read())
    nomatches = True
    if len(data['query']['search']) > 0:
        for d in data['query']['search']:
            #print d
            if "is a" in d['snippet'] and "surname" in d['snippet']:
                nomatches = False
                #print d['snippet']
                soup = BeautifulSoup(d['snippet'].encode("utf-8"),"html.parser")
                text = soup.getText()
                sentences = text.split(".") 
                finals = ''
                for s in sentences:
                    #print s
                    if last_name in s or finals != '':
                        finals = finals + s
                    if "is a " in s or "is an " in s:
                        if len(s) > 35:
                            if last_name in s:
                                return s
                            else:
                                return finals
                        break
    if nomatches:
        return "NO ORIGIN DATA"
Ejemplo n.º 15
def web_crawling(url):
    """
    Main task is extract page content by url,
    parse html and get all links and add to redis queue.
    """
    logging.info('Extracting content for: %s', url)
    #extract page content
    try:
        page = urlopen(url)
        content = page.read()
    except (HTTPError, URLError):
        return

    logging.info('Start to parse content for: %s', url)
    soup = BeautifulSoup(content, 'html.parser')

    #parse and store content of pages
    for s in soup(['style', 'script',
                   '[document]', 'head', 'title']):
        s.extract()

    page = Page(url, soup.getText())
    page.save()

    logging.info('Stored Content in for: %s', url)

    #find all links and add to queue
    links = soup.findAll('a', attrs={'href': re.compile('^http://')})
    for link in links:
        href = link.get('href')
        q.put(href)
        logging.info('Added %s to Url Queue for processing', url)

    logging.info('Finish to parse content for: %s', url)
Ejemplo n.º 16
0
def get_debate(url):
    r = requests.get(url)
    tmp = BeautifulSoup(r.text, 'html.parser')

    debate = []
    check = []
    trump_flag = False

    for t in tmp.select('div p'):
        line = t.text
        soup = BeautifulSoup(line)
        line = soup.getText()
        line = line.encode('utf-8', 'ignore')
        line = deal_with_unicode(line)
         
        if line.startswith('TRUMP') :
            trump_flag = True
            line = ' '.join(line.split()[1:])
            line = line.strip()
            if not line.startswith("...") and len(line.split()) > 4 :
                debate.append(line)
        elif line.split()[0].isupper() and line.split()[0] != "(APPLAUSE)":
            trump_flag = False
        elif trump_flag:
            line = line.strip()
            if len(line.split()) > 4:
                debate.append(line)
        else:
            check.append(line) 

    return debate
Ejemplo n.º 17
	def parse(self, html):
		soup = BeautifulSoup(html, "html.parser")
		#clean-up invalid HTML tags
		prettyHtmlDoc = soup.prettify()
		#get title, body, etc. text
		allText = soup.getText()
		return allText
Ejemplo n.º 18
0
def getRssInfo():
    url='http://36kr.com/feed'
    d=feedparser.parse(url)
#     print d.feed.title
#     print d.feed.link
#     print d.feed.description 
#     print d.etag
#     print d.modifed
    infoList=[]
    for entry in d.entries:
        info={}
        info['url']=entry.link
        info['newsid']=getMd5(info['url'])
        info['title']=entry.title
        info['description']=entry.description
        info['ctime']=(long)(time.mktime(entry.published_parsed))
        info['author']=entry.author
        info['source']=ctable
        info['keywords']=''
        soup = BeautifulSoup(entry.description, "html.parser",from_encoding='utf-8')
        img=soup.find('img')
        info['thumb']=img.get('src') if img else ''
        info['summary']=soup.getText()
        #         print entry
#         print info['newsid'],info['url']
#         print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(info['ctime'])),info['title']
#         print info['author'],info['thumb']
#         print info['description']
#         print info['summary']
        infoList.append(info)
    return infoList
Ejemplo n.º 19
0
def get_text(url):
   html = urllib.request.urlopen(url).read()
   soup = BeautifulSoup(html, "html.parser")
   [s.extract() for s in soup(['style', 'script', '[document]',
                               'head', 'title'])]
   text = soup.getText()
   return text
Ejemplo n.º 20
0
 def privmsg(self, user, channel, msg):
     if msg == '_unmute':
         if channel in self.config['muted']:
             self.config['muted'].remove(channel)
             self.manager.wittyconf.update_plugin_config(self.plugin_name, self.config)
             logging.info('Removed channel %s from the mute list' % channel)
     elif msg == '_mute':
         if channel not in self.config['muted']:
             self.config['muted'].append(channel)
             self.manager.wittyconf.update_plugin_config(self.plugin_name, self.config)
             logging.info('Added channel %s to the mute list' % channel)
     if self.block or channel in self.config['muted']:
         return
     if msg.startswith('_feed'):
         self.block = True
         url = msg[5:].strip()
         self.manager.app.say(channel, 'Eating %s...' % url)
         try:
             html = urlopen(url)
         except URLError, e:
             self.manager.app.say(channel, e)
             return
         soup = BeautifulSoup(html)
         text = soup.getText()
         f = open(self.data_path, 'a')
         f.write(text.encode('ascii', 'ignore'))
         f.close()
         logging.info('Reloading...')
         self.post_init()
         self.manager.app.say(channel, 'Done. So much wisdom!')
         self.block = False
Ejemplo n.º 21
0
 def getArticleFromXML(self, root):
     tags = []
     #Fetch id, title and categories
     id = root.find('{http://www.w3.org/2005/Atom}id').text
     title = unicode(root.find('{http://www.w3.org/2005/Atom}title').text)
     #Check if record needs to be eliminated from zotero OR
     #resource title needs to be stripped
     if ':' in title:
         if self.isOmissible(title):
             log.debug('Omitting record with title- %s' % title)
             return None
         else:
             title = self.stripRsrc(title)
     
     categories = root.findall('{http://www.w3.org/2005/Atom}category')
     tags = self.produceTag(tags, categories, title)
     
     #Fetch HTML content and URL
     content = ''
     url = ''
     if root.find('{http://www.w3.org/2005/Atom}content').text != None:
         soup = BeautifulSoup(root.find('{http://www.w3.org/2005/Atom}content').text)
         content = soup.getText()
         if soup.find('a') != None:
             url = (soup.find('a')).get('href')
             httpObj = httplib2.Http()
             try:
                 log.debug('Trying URL:%s' % url)
                 resp, content = httpObj.request(url, 'HEAD')
                 if resp['status'] == '404':
                     tags.append({'tag':'404:'+time.strftime("%H-%M-%S")})
                     log.info('Code 404: URL %s broken.' % url)
                 elif resp['status'] == '301':
                     tags.append({'tag':'301:'+time.strftime("%H-%M-%S")})
                     tags.append({'tag':'old-url:'+url})
                     log.info('Code 301: URL %s redirecting to %s.' % (url, resp['location']))
                     url = resp['location']
             except socket_error:
                 tags.append({'tag':'111:Connection refused'})
                 log.info('Connection refused: URL %s' % url)  
             except httplib2.RedirectLimit as redir_lim:
                 tags.append({'tag':'301:'+time.strftime("%H-%M-%S")})
                 log.info('Redirect limit reached:'+url)
                 log.info(str(redir_lim))                   
             except Exception as httpex:
                 tags.append({'tag': str(httpex.__class__.__name__)})
                 log.info(str(httpex)+':'+url)
                 
     blogUrl = ''
     tmpList = root.findall("{http://www.w3.org/2005/Atom}link[@rel='alternate']")
     if len(tmpList) > 0:
         blogUrl = tmpList[0].attrib['href']
     #Eliminate the first category value(http://../kind#post) that's taken as a tag
     if 'kind#post' in tags[0]['tag']: 
         tags.pop(0)
     
     issn = self.getISSNFromXML(root) 
     if issn!=None:
         return Article(id, title, tags, content, url, blogUrl, issn, 'journalArticle')
     else:
         return Article(id, title, tags, content, url, blogUrl, issn, 'webpage')
Ejemplo n.º 22
0
def clean_html_and_save_to_file(file_directory,file_name,save_directory):
    try:
        #File location
        file_path = file_directory + file_name
        # Open file and turn into beautiful soup object
        with open(file_path, "r",encoding="utf-8",errors="ignore") as file:
            soup_object = BeautifulSoup(file,"html.parser")
        # Extract only text from document
        text_section = soup_object.getText()

        #  Convert all text to string
        str_text_section = str(text_section)

        # Trim file based on following; ALL SCRIPTS => start and "Back to IMSDb" the end of script
        start= str_text_section.find("ALL SCRIPTS")
        finish = str_text_section.find("Back to IMSDb")
        cleaned_text = str_text_section[start:finish]

        # Save text to file
        new_file_name = file_name.replace(".html",".txt") # change file extension
        save_path = save_directory + new_file_name
        with open(save_path,"w") as new_file:
            new_file.write(cleaned_text)



    except FileNotFoundError:
        print("File not found")
Ejemplo n.º 23
0
def crawl_website(website,zin,resultaten,domains,index,beschrijving,titel):
    try:
        url = urlparse(website)
        domain = url.netloc
        while domain in domains:
            time.sleep(1)
        domains[index] = domain
        html = get(website)
        domains[index] = ""
        soup = BeautifulSoup(html,"html.parser")
        if config["CRAWLER"]["CHECK_ONLY_VISIBLE_TEXT"].lower() == "true":
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title','meta'])]
        text = soup.getText()
        overeenkomstWaarde,plagiaatZin,text = algoritme.vergelijk_tekst(text,zin)  # this is the text comparison algorithm
        if overeenkomstWaarde > 3 and website not in veelGevonden:  # if the plagiarism score is high, more text may have been copied, so scan this url for every sentence.
            #veelGevonden.append(website)
            veelGevonden.append((website,beschrijving,titel))
        resultaten[index] = {
            "waarde": overeenkomstWaarde,
            "plagiaatZin": plagiaatZin,
            "text": text,
            "url":website,
        }  # this effectively acts as the return value

    except Exception as e:
        print(e)
        resultaten[index] = {
            "waarde": 0,
            "plagiaatZin": "",
            "text": zin,
            "url": website,
        }
    finally:
        domains[index] = ""
Ejemplo n.º 24
0
def index_document(id, filename):
    #we get the file from disk and initialize an html parsing library with it
    soup = BeautifulSoup(open(filename))

    #remove all the text inside every <script> tag
    #[x.extract() for x in soup.findAll('script')]
    map(lambda x: x.extract(), soup.findAll("code"))      # delete all
    map(lambda x: x.extract(), soup.findAll("style"))     # delete all
    map(lambda x: x.extract(), soup.findAll("script"))    # delete all

    #removes all the special punctuation
    text = remove_puctuation(soup.getText())
    #get title of document
    title = soup.title.contents[0]
    #get snippet of text (20 words from the middle of the text)
    snip = text.split()[160:190]
    #save the title and the snippet into the document_table (reference)
    document_table[id].title = title
    document_table[id].snip = " ".join(snip)
    #text = soup.getText()
    #tokenize the page into words separated by spaces
    terms = [i.lower() for i in text.split() if i not in stop]
    for term in terms:
        try:
            invindex[term][str(id)] = invindex[term][str(id)] + 1
        except:
            invindex[term][str(id)] = 1
Ejemplo n.º 25
0
def get_hocr_zones(processing_folder, png_filename, engine="tesseract"):
    image_filename = processing_folder + "/" + png_filename
    logging.info(image_filename)
    image = PillowImage.open(image_filename)
    if engine == "tesseract":
        engine_filename = engine + ".hocr"
    else:
        engine_filename = engine + ".hocr.html"
    hocr_filename = "{0}/{1}/{2}".format(processing_folder, engine, engine_filename)
    soup = BeautifulSoup(open(hocr_filename))
    logging.info("opened " + hocr_filename)
    logging.info(soup.getText())
    regions = []
    for zone, region in read_hocr_tesseract(soup, image):
        regions.append(region)
        # TODO page number folder
        zone.save("{0}/{1}/{2}.bin.png".format(processing_folder, engine, region["id"]))
        with io.open(
                "{0}/{1}/{2}.txt".format(processing_folder, engine, region["id"]),
                "w", encoding="utf8") as fh:
            fh.write(region["text"])
    with io.open(
            "{0}/{1}/master.json".format(processing_folder, engine),
            "w", encoding="utf8") as fh:
        fh.write(u"var regions = \n")
        fh.write(json.dumps(regions, ensure_ascii=False))
    logging.info("Done")
Ejemplo n.º 26
0
class BSTokenizer:
    """

    """

    def __init__(self, url):
        """
        Receive an url, reads it, create a beautiful object and extract sections don't wanted.

        Arguments:
            url (str): The string representation of an url to be analyzed
        """
        self.raw_content = urllib2.urlopen(url).read()
        self.soup = BeautifulSoup(self.raw_content, "html.parser")
        [s.extract() for s in self.soup(['style', 'script', '[document]', 'head', 'title', 'meta'])]

    def get_most_common_words(self, amount=100):
        """It gets all the text, split it, and count using a utility from collections standard libraries

        Arguments:
            amount (int): The amount of elements that should be returned

        Returns:
            List: A list with a list where first element is the word and second the number of
            repetitions [["hello", 10], ["bye", 4],...]
        """

        text = self.soup.getText()
        frequencies = Counter(text.split())
        return frequencies.most_common(amount)
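A compact sketch of the counting step by itself: getText() output split on whitespace and fed to collections.Counter. The HTML string is illustrative:

from collections import Counter
from bs4 import BeautifulSoup

text = BeautifulSoup("<p>to be or not to be</p>", "html.parser").getText()
print(Counter(text.split()).most_common(2))  # [('to', 2), ('be', 2)]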
Ejemplo n.º 27
0
def summarize():
    text = ""
    if shmoopurl:
        page = requests.get(shmoopurl + "summary.html")
        soup = BeautifulSoup(page.content, 'lxml')
        descript = BeautifulSoup(str(soup.find(class_ = 'content-learning-guide')), 'lxml')
        title = descript.find('h1')
        par = descript.find_all('p')
        text += "<h1>" + title.getText() + "</h1>"
        i = 0
        while i < len(par) - 4:
            text += par[i].getText()
            i += 1
        while "\n" in text:
            text = text.replace('\n', '<br>')
    elif sparkurl:
        page = requests.get(sparkurl + "summary.html")
        soup = BeautifulSoup(page.content, 'lxml')
        char = BeautifulSoup(str(soup.find(class_='studyGuideText')), 'lxml')
        ads = char.find_all(class_= 'floatingad')
        strads = []
        text = char.getText()
        for ad in ads:
            strads.append(BeautifulSoup(str(ad), 'lxml').getText())
        for ad in strads:
            text = text.replace(ad, '')
        while '\n' in text:
            text = text.replace('\n', '<br>')
    return text
Ejemplo n.º 28
0
 def getContent(self):
     # Scrape the site for the article content
     html = urllib.urlopen(self.article).read()
     soup = BeautifulSoup(html)
     [s.extract() for s in soup(["style", "script", "[document]", "head", "title"])]
     self.visible_text = soup.getText()
     return
Ejemplo n.º 29
0
def getRssInfo(url):
    d=feedparser.parse(url)
#     print d.feed.title
#     print d.feed.link
#     print d.feed.description 
#     print d.etag
#     print d.modifed
    infoList=[]
    for entry in d.entries:
        info={}
#         print entry
        info['url']=entry.link
        info['newsid']=getMd5(info['url'])
        info['title']=entry.title
        info['ctime']=(long)(time.mktime(entry.published_parsed))
        info['author']=entry.author
#         print timeFormat.getTimeStamp(info['ctime']),info['title']
        info['source']=ctable
        tags=entry.tags if 'tags' in entry else None   
        info['keywords']=','.join(tag.term for tag in tags) if tags else ''        
        info['description']=entry.summary #entry.content[0].value
        soup = BeautifulSoup(info['description'], "html.parser",from_encoding='utf-8')        
        img=soup.find('img')
        info['thumb']=img.get('src') if img else ''
        info['summary']=soup.getText() #' '.join(p.getText().strip() for p in soup.find_all('p'))
#         print info['newsid'],info['url']
#         print info['author']
#         print info['keywords'],info['thumb']
#         print info['summary']
#         print info['description']
        infoList.append(info)
    return infoList
Ejemplo n.º 30
0
def cleanHtml(i):
    
    i = str(i)    
    bs = BeautifulSoup(i)
    i = bs.getText()
    
    return i
Ejemplo n.º 31
0
def snippet_builder(doc_ID):
    if doc_ID in key_docs:
        a = search_list_of_dict(doc_ID, sample_content_chunk)
        snippet = {}
        soup = BeautifulSoup(a['html'])
        s = soup.getText()
        desc = s[1:300]
        snippet['title'] = a['title']
        snippet['href'] = a['link']
        snippet['desc'] = desc + '...'
        return snippet
Ejemplo n.º 32
def tokenize(rawHtml):
    soup = BeautifulSoup(rawHtml, 'html.parser')
    rawDocument = soup.getText().encode('utf-8').lower()
    tokens = nltk.word_tokenize(rawDocument)
    for punctuation in string.punctuation:
        tokens = filter(lambda a: a != punctuation, tokens)
    # remove `` manually
    tokens = filter(lambda a: a != "``", tokens)
    # remove '' manually
    tokens = filter(lambda a: a != "''", tokens)
    return tokens
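A Python 3 variant of the same tokenising idea, assuming the NLTK punkt tokenizer data is installed; list comprehensions stand in for the chained filter() calls:

import string
import nltk
from bs4 import BeautifulSoup

def tokenize3(raw_html):
    text = BeautifulSoup(raw_html, "html.parser").getText().lower()
    tokens = nltk.word_tokenize(text)
    # Drop punctuation plus the `` and '' tokens NLTK uses for quotes.
    return [t for t in tokens if t not in string.punctuation and t not in ("``", "''")]

print(tokenize3('<p>Hello, world. "Quoted" text.</p>'))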
Ejemplo n.º 33
0
def format_item(item):
    url = "https://intra.epitech.eu"
    soup = BeautifulSoup(item["title"], "html.parser")
    links = []
    output = u"EPITECH\n-------\n{message}\n-----\nLiens:\n{links}"

    for link in soup.find_all("a"):
        links.append("[{index}]: {url}{link}".format(index=len(links) + 1, url=url, link=link.get("href")))
        link.replace_with("{string}[{index}]".format(string=link.string, index=len(links)))
    message = soup.getText()
    return output.format(message=message, links="\n".join(links))
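The replace_with() trick above rewrites each anchor as its text plus a footnote index while the targets are collected separately. A tiny standalone sketch on made-up HTML:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>See <a href="/intra/news">the news</a> page.</p>', "html.parser")
links = []
for link in soup.find_all("a"):
    links.append("[{0}]: {1}".format(len(links) + 1, link.get("href")))
    link.replace_with("{0}[{1}]".format(link.string, len(links)))
print(soup.getText())    # See the news[1] page.
print("\n".join(links))  # [1]: /intra/news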
Ejemplo n.º 34
0
def download_page(url):
    cookies = pickle.load(open("d:/cookies.pkl", "rb"))
    print(cookies)
    cookie_jar = RequestsCookieJar()
    for c in cookies:
        cookie_jar.set(c['name'], c['value'], domain="jd.com")
    page = requests.get(url, cookies=cookie_jar)
    soup = BeautifulSoup(page.text, 'html.parser', from_encoding='utf-8')
    print(soup.getText())
    print(page)
    print('爬取成功')
Ejemplo n.º 35
def make_lexifications():
    host_name = name.get()
    port_no = port.get() + 3602
    port_string = str(port_no)
    output = "http://" + host_name + ":" + port_string + "/cgi-bin/cg?cb-start"
    print(output)
    number_of_terms = create_cyc_lex_no_var.get()
    path = create_cyc_lex_var.get()
    titlish = create_cyc_lex_name_var.get()
    progress_bar["maximum"] = 3
    if path == "":
        msg.showwarning("Utility Warning", "No path specified.")
        return ""
    if titlish == "":
        msg.showwarning("Utility Warning", "Provide a filename.")
        return ""
    if number_of_terms == "":
        msg.showwarning("Utility Warning", "Indicate number of terms.")
        return ""
    try:
        urllib.request.urlopen(output)
        lights_on()
        print("OPEN")
    except urllib.error.HTTPError:
        msg.showwarning("Utility Warning", "Check connection.")
        return ""
    except urllib.error.URLError:
        msg.showwarning("Utility Warning", "Check connection.")
        return ""
    progress_bar["value"] = 1
    progress_bar.update()
    subl_query = "(get-lexification-sets-for-n-concepts " + str(
        number_of_terms) + ")"
    uri_query = urllib.parse.quote(subl_query)
    header = "http://" + host_name + ":" + port_string + "/cgi-bin/cb-eval-subl?expression="
    request = header + uri_query
    print(request)
    progress_bar["value"] = 2
    progress_bar.update()
    content_return = urllib.request.urlopen(request).read()
    soup_return = BeautifulSoup(content_return, "html.parser")
    json_return = json.loads(soup_return.getText())
    returns = json_return["results"]
    returns_string = str(returns)
    returns_string = returns_string[3:-3]
    returns_string = returns_string.replace("\\'", "\'")
    returns_dict = eval(returns_string)
    progress_bar["value"] = 3
    progress_bar.update()
    file_name = path + "/" + titlish + '.pickle'
    with open(file_name, 'wb') as handle:
        pickle.dump(returns_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    sleep(0.75)
    progress_bar["value"] = 0
Ejemplo n.º 36
0
def get_article_text(article_url):
    if type(article_url) != str and type(article_url) != unicode:
        raise TypeError('URL must be a string')

    html = requests.get(article_url).content
    soup = BeautifulSoup(html, 'html.parser')

    for s in soup(['style', 'script', '[document]', 'head', 'title']):
        s.extract()

    return soup.getText().strip().encode("utf-8")
Ejemplo n.º 37
0
def get_file_list(course_url, cookies):
    r = requests.get(course_url, cookies=cookies)
    soup = BeautifulSoup(r.text, 'lxml')
    soup_list = soup.find_all('div', {'class': 'activityinstance'})
    files = {}
    for soup in soup_list:
        soup = soup.find('a')
        link = soup['href']
        if 'resource' in link:
            files[link.split('=')[-1]] = soup.getText()
    return files
Ejemplo n.º 38
0
    def parse_details(self, response):
        data = response.meta['data']

        try:
            d = BeautifulSoup("\n".join(
                response.xpath(
                    "//p[@style = 'text-align: justify;']").extract()))
            description = d.getText().replace("\u2019", "'")
        except:
            description = "Description indisp."

        try:
            ttb = BeautifulSoup(
                response.xpath(
                    "//div[@class='biz-hours']").extract_first().replace(
                        "</h4>", " : ").replace(" </li>", ". "))
            timetable = ttb.getText()
        except:
            timetable = "Horaires indisp."

        try:
            p = BeautifulSoup(
                response.xpath(
                    "//div[@class='fee-kind']").extract_first().replace(
                        "</h4>",
                        "\n").replace('class ="fee-conditions">',
                                      '> (').replace("</ul>", ")").replace(
                                          "</li>", ". "))
            price = normalize("NFKD", p.getText())
        except:
            price = "Tarifs indisp."

        data['url'] = response.url
        data['timetable'] = timetable
        data['reviews'] = ""
        data['rank'] = 0
        data['summary'] = description
        data['price'] = price
        data['source'] = "3-expoInTheCity"

        yield data
Ejemplo n.º 39
0
def fetch_artical(url):
    d = {}
    resp = requests.get(url)
    if resp.status_code != 200:
        return None
    content = BeautifulSoup(resp.text, 'html.parser')

    # get the information of board, artical title, author
    if content.select_one('#topbar > a.board > span') != None:
        content.select_one('#topbar > a.board > span').decompose()
    board = ''
    if content.select_one('#topbar > a.board') != None:
        board = content.select_one('#topbar > a.board').getText()
    title = ''
    if content.select_one(
            '#main-content > div:nth-child(3) > span.article-meta-value'
    ) != None:
        title = content.select_one(
            '#main-content > div:nth-child(3) > span.article-meta-value'
        ).getText()
    author = ''
    if content.select_one(
            '#main-content > div:nth-child(1) > span.article-meta-value'
    ) != None:
        author = content.select_one(
            '#main-content > div:nth-child(1) > span.article-meta-value'
        ).getText()

    # remove pushes and other unimportant information
    content = content.select_one('#main-content')
    if content == None:
        print(url)
    for metaline in content.find_all('div', class_='article-metaline'):
        metaline.decompose()
    if content.select_one(
            '#main-content > div.article-metaline-right') != None:
        content.select_one(
            '#main-content > div.article-metaline-right').decompose()
    pushes = content.find_all('div', class_='push')
    for p in pushes:
        p.decompose()

    # get the artical
    artical = content.getText()

    json_data = {
        'board': board,
        'title': title,
        'author': author,
        'artical': artical
    }

    return json_data
Ejemplo n.º 40
0
    def get_chapter_text(self, chapter):
        """Gets the chapter text from the specified chapter.
        Work.load_chapters() must be called first."""

        if chapter > 0 and chapter <= self.chapters and self.chapters > 1:
            if len(self.chapter_ids) == self.chapters:
                chapter_html = self.request(
                    "https://archiveofourown.org/works/%i/chapters/%s?view_adult=true"
                    % (self.workid, self.chapter_ids[chapter - 1]))
                div = chapter_html.find("div", {'role': 'article'})
                return str(BeautifulSoup.getText(div))
            else:
                raise utils.UnloadedError(
                    "Work.load_chapters() must be called first")

        elif chapter == 1:
            div = self.soup.find("div", {'role': 'article'})
            return str(BeautifulSoup.getText(div))
        else:
            raise utils.UnloadedError(
                "Work.load_chapters() must be called first")
Ejemplo n.º 41
0
def index_document(document):
    # Create BeautifulSoup object from html text, and ignore/remove the non-ASCII
    soup = BeautifulSoup(document.encode("ascii", errors='ignore'),
                         'html.parser')
    # Remove non-visible tags [Reference: https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text]
    [
        tag.extract()
        for tag in soup(['style', 'script', '[document]', 'head', 'title'])
    ]
    # Get visible text from html document
    visible_text = soup.getText()
    return index_string(visible_text)
Ejemplo n.º 42
0
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.getText()
    res = ''
    for line in text.split('\n'):
        while line is not None and len(line) > 2 and (line[0] == ' '
                                                      or line[0] == ' '):
            line = line[1:]
        if line is not None and len(line) > 1 and line[0] != '\n':
            # print(line)
            res = res + line + '\n'
    return res
Ejemplo n.º 43
0
def parse_json():
    parser = EmailReplyParser(language='en')
    with open('english.json', 'rb') as fl:
        messages = json.load(fl)
    parsed = []
    for text in messages:
        soup = BeautifulSoup(text, 'lxml')
        text = soup.getText('\n')
        text = parser.parse_reply(text)
        parsed.append(text)
    import code
    code.interact(local=locals())
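getText('\n') passes a separator that is inserted between text fragments from different tags, which keeps reply lines apart before EmailReplyParser sees them. A minimal illustration of the separator argument:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>Thanks!</p><p>On Monday, someone wrote:</p></div>", "html.parser")
print(repr(soup.getText()))      # 'Thanks!On Monday, someone wrote:'
print(repr(soup.getText("\n")))  # 'Thanks!\nOn Monday, someone wrote:'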
Ejemplo n.º 44
0
def verify():
    parser = EmailReplyParser(language='fi')
    texts = json.load(open('test/emails/emails.json'))
    texts = list(filter(lambda d: type(d) == str, texts))
    parsed = []
    for text in texts:
        print('-'*100)
        soup = BeautifulSoup(text, 'lxml')
        text = soup.getText('\n')
        text = parser.parse_reply(text)
        parsed.append(text)
        print(text)
Ejemplo n.º 45
0
    def get_areaCode(self, number):

        try:
            url = 'http://www.allareacodes.com/' + str(number)
            site = self.mechRead(url)
            nameSplit1 = site.split('<td>Major City:</td>')[1]
            nameSplit2 = nameSplit1.split('</td>')[0]
            soup = BeautifulSoup(nameSplit2)
            area = soup.getText()
            return area.encode("utf-8")
        except Exception:
            return 'None'
Ejemplo n.º 46
0
 def transformContent(self, dbName, collectionName):
     db = self.mongoClient.getConnection(dataBaseName=dbName)
     collection = db[collectionName]
     for i in collection.find():
         content = i['content']
         soup = BeautifulSoup(content, 'lxml')
         content = soup.getText()
         i['content'] = self.cuttingMachine.deleSpecialChar(content)
         i['keywords'] = self.cuttingMachine.doCutting(i['content'])
         i['status'] = 0  # processing status: 0 = unprocessed, 1 = pending, 2 = in progress, 3 = finished
         i['mtime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         db.article_text.insert(i)
Ejemplo n.º 47
0
    def get_Job(self, url):

        site = self.mechRead(url)
        try:
            nameSplit1 = site.split(
                '<div class="_42ef"><div><div class="_50f3">')[1]
            nameSplit2 = nameSplit1.split('<span class="_50f8">')[0]
            soup = BeautifulSoup(nameSplit2)
            job = soup.getText()
            return job.encode("utf-8")
        except Exception:
            return 'None'
Ejemplo n.º 48
0
def buildHashMap(zipped_folder):
    '''
    This function will take in a zipped folder of html pages and iterate through them, extracting all words from the pre-processed html text.
    It will output a dictionary of words occuring the collection of pages.
    The dictionary keys are words that contain a list of occurrences and locations of where the terms are found in the documents.
    '''
    with ZipFile(zipped_folder, 'r') as zip:
        # Display the directory of files in the zip folder
        zip.printdir()
        word_list = {}

        # Loop through each file in the zip container
        for name in zip.namelist():
            # Read the html file
            data = zip.read(name)

            # Parse the data with BeautifulSoup
            soup = BeautifulSoup(data, 'html.parser')

            # Extract the text from the html file and generate a list of "words" separated by spaces
            page_text = soup.getText().split()

            # Loop through words in extracted text and preprocess to exclude prohibited words
            doc_words = {}
            for id, word in enumerate(page_text):

                # Check if word contains only letters
                if word.isalpha():
                    # set word to lowercase
                    word = word.lower()

                    # Map frequencies and occurrences for each word in the corpus
                    if word in doc_words.keys():
                        doc_words[word]['freq'] += 1
                        doc_words[word]['locations'].append(id)
                    else:
                        doc_words[word] = {
                            'doc_id': name,
                            'freq': 1,
                            'tf_idf': 0,
                            'locations': [id]
                        }
            # Add document words to corpus vocabulary
            for word in doc_words.keys():
                if word in word_list.keys():
                    word_list[word].append(doc_words[word])
                else:
                    word_list[word] = [doc_words[word]]

            doc_words = {}

        return word_list
Ejemplo n.º 49
def process_set(paths, sent_dict):
    neg_but_pos = 0
    pos_but_neg = 0
    reviews = []
    for path in paths:
        fileset = [f for f in listdir(path)]
        for f in fileset:
            with open(path + "/" + f, 'r', encoding="utf8") as content_file:
                content = content_file.read()
            soup = BeautifulSoup(content.lower(), 'html.parser')
            tokens = word_tokenize(soup.getText())
            tagged_tokens = pos_tag(tokens)

            review_str = ""
            not_flag = False
            word_count = 0
            pos_score = 0
            neg_score = 0
            for (x, y) in tagged_tokens:
                if y.startswith("JJ") or y.startswith("RB") or y.startswith(
                        "NN") or y.startswith("V"):
                    review_str += x + " "

                    scores = sent_dict.get(x, [0, 0])
                    pos = scores[0]
                    neg = scores[1]

                    if not_flag:
                        pos_score += neg
                        neg_score += pos
                    else:
                        pos_score += pos
                        neg_score += neg

                elif x == "no" or x == "not":
                    not_flag = True
                else:
                    not_flag = False

            if pos_score > neg_score:
                if "neg" in path:
                    pos_but_neg += 1
            elif neg_score > pos_score:
                if "pos" in path:
                    neg_but_pos += 1
            reviews.append(
                ("processed_" + path + "/" + f.split('.')[0] + ".json",
                 review_str, pos_score, neg_score))

    tlog("Positive reviews with negative score: " + str(neg_but_pos))
    tlog("Negative reviews with positive score: " + str(pos_but_neg))
    return reviews
Ejemplo n.º 50
0
def search_google(query):
    '''Search google and determine if wikipedia is in it'''
    search_object = google.search(query)
    #Determine if a wikipedia url is in the first few search results
    urls = []
    for i in range(0, 4):
        url = search_object.__next__()
        urls.append(url)
        if "wikipedia.org/wiki" in url:
            wikipedia_search = wikipedia.search(query)[0]
            url = wikipedia.page(wikipedia_search).url
            response = wikipedia.summary(wikipedia_search) + " ({0})".format(
                url)
            return response
    #If there were no wikipedia pages
    first_url = urls[0]
    try:
        article = Article(first_url)
        article.download()
        article.parse()
        article.nlp()
        article_summary = article.summary
        article_title = article.title
        return "{0}\n{1} - ({2})".format(article_summary, article_title,
                                         first_url)

    except Exception as article_exception:
        try:
            log.debug(
                "Got error {0}, {1} while using newspaper, switching to bs4".
                format(article_exception.message, article_exception.args))
            html = requests.get(first_url).text
            #Parse the html using bs4
            soup = BeautifulSoup(html, "html.parser")
            [
                s.extract() for s in soup(
                    ['style', 'script', '[document]', 'head', 'title'])
            ]
            text = soup.getText()
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines
                      for phrase in line.split("  "))
            # drop blank lines
            soup_text = '\n'.join(chunk for chunk in chunks if " " in chunk)
            response = format(soup_text) + " ({0})".format(first_url)
            return response
        except Exception as search_exception:
            log.info("Error {0},{1} occurred while searching query {2}".format(
                search_exception.message, search_exception.args, query))
            return "Error encountered on query {0}".format(query)
Ejemplo n.º 51
0
def get_page(resp):
    """
        Argument: The object contains information about the GET request
        The function will get the text of the file and write it the page_text file line by line
    """
    txt = resp.text
    soup = BeautifulSoup(txt, "html.parser")
    text_page = [soup.getText()]
    page_txt = open("page_text", 'w')
    for line in text_page:
        line = line.encode('utf-8').strip()
        page_txt.write(line + "\n")
    page_txt.close()
Ejemplo n.º 52
0
    def get_City(self, url):

        site = self.mechRead(url)
        try:
            nameSplit1 = site.split(
                'hovercard="/ajax/hovercard/page.php?id=112222822122196">')[2]
            nameSplit2 = nameSplit1.split('</a></div>')[0]
            soup = BeautifulSoup(nameSplit2)
            city = soup.getText()
            city = city.replace(',', '')
            return city
        except Exception:
            return 'None'
Ejemplo n.º 53
0
Archivo: views.py Proyecto: yf1291/nlp4
            def get_content(url):  # if url is a web link
                try:

                    resp = urllib.request.urlopen(url)

                    html = resp.read()
                    html = (html.replace('<br>', '\n')).replace('<br/>', '\n')

                    #urllib.request.urlopen(url).read()
                    bs = BeautifulSoup(html, "html.parser")

                    return bs.get_text(), bs
                except:  # if url is a local file path
                    try:
                        htmlfile = open(url, 'r', encoding='utf-8')
                        html = htmlfile.read()
                        html = (html.replace('<br>',
                                             '\n')).replace('<br/>', '\n')
                        bs = BeautifulSoup(html, "html.parser")

                        return bs.getText(), bs
                    except:  # third case: fake html, it still needs to be downloaded first
                        import requests

                        r = requests.get(url)

                        with open('laji.html', "wb") as f:  # add the file extension here
                            f.write(r.content)
                        url = 'laji.txt'

                        htmlfile = open('laji.html', 'r', encoding='utf-8')
                        html = htmlfile.read()
                        html = (html.replace('<br>',
                                             '\n')).replace('<br/>', '\n')
                        bs = BeautifulSoup(html, "html.parser")

                        return bs.getText(), bs

                return bs.get_text(), bs
Ejemplo n.º 54
0
def get_strings(html, select=None):
    bs = BeautifulSoup(html, 'lxml')
    if select is not None:
        bs = bs.select(select)[0]
    for s in bs(['script', 'style']):
        s.extract()
    txt = bs.getText(separator=' ')
    res = []
    for line in txt.split('\n'):
        pret = prettify(line)
        if pret != '':
            res.append(pret)
    return res
Ejemplo n.º 55
0
def clean_html(html):
    if "<" in html and ">" in html:
        try:
            soup = BeautifulSoup(html, features="html.parser")
            plist = soup.find('plist')
            if plist:
                plist.decompose()  # remove plists because ugh
            text = soup.getText()
        except:
            text = remove_tags(html)
        return '. '.join(text.split("\r\n\r\n\r\n"))
    else:
        return '. '.join(html.split("\r\n\r\n\r\n"))
Ejemplo n.º 56
0
def getalltext(soup: BeautifulSoup) -> object:
    """
    Returns all text from an HTML soup
    :param soup: Beautifulsoup document
    :return: Text without HTML tags
    """
    # if isinstance(soup, NavigableString):
    #     return str(soup)
    # txt = ''.join(soup.find_all(text=True))
    # body = soup.find('body').getText()
    # title = soup.find('title').getText()
    # return re.sub(r'[\n\t\r\,]', ' ', body + title)
    return re.sub(r'[\n\t\r\,]', ' ', soup.getText())
Ejemplo n.º 57
0
def searchWikia(query):
    engine = "http://lyrics.wikia.com/wiki/Special:Search?search="
    searchlink = engine + query
    soup = BeautifulSoup(urllib.request.urlopen(searchlink), "lxml")
    soup = soup.find("a", "result-link")
    if soup != None:
        desc = soup.getText()
        link = soup["href"]
    else:
        desc = None
        link = None
    result = SearchResult(desc, link)
    return result
Ejemplo n.º 58
0
def get_text(page_source):
    """
     :param page_source:
     :return: the Text content in the page source
     """
    soup = BeautifulSoup(page_source, 'html.parser')
    texts = soup.findAll(text=True)
    [
        s.extract()
        for s in soup(['style', 'script', '[document]', 'head', 'title'])
    ]
    visible_text = soup.getText()
    return visible_text
Ejemplo n.º 59
0
    def extract_content(self, soup: BeautifulSoup) -> str:
        txt = str(soup)

        start_tag = '<div class="story_text">'
        end_tag = '<p class="autor">'

        start = txt.find(start_tag)
        end = txt.find(end_tag)

        content = txt[start:end]

        content = BeautifulSoup(str(content), features="lxml")  # remove all <>
        return content.getText()
Ejemplo n.º 60
0
 def process(self, html: BeautifulSoup, url, status_code):
     if status_code == 200:
         canonical = html.find("link", {"rel": "canonical"})
         if canonical is None:
             sep = chr(1)
             texts = [
                 t.strip(" \n\t")
                 for t in html.getText(separator=sep).split(sep)
             ]
             data = PageContent(
                 url,
                 [t for t in texts if len(t.split()) > self.min_length])
             self.content.append(data)