Example #1
    def build(self, urls, docs=None, extracted_cache=False):
        """Build an index of documents from ``urls``, optionally reusing pre-fetched HTML passed in ``docs``."""
        indexed_docs = []
        doc_id = 0
        # ---------------------------------
        if docs and len(urls) != len(docs):
            raise ValueError

        for i,URL in enumerate(urls):
            doc = StrictIndexDocument()
            doc.id, doc.url = doc_id, URL
            doc_id += 1

            if os.path.exists('./data/extracted-'+ str(doc_id) +'-cached.txt'):
                with codecs.open('./data/extracted-'+ str(doc_id) +'-cached.txt', 'r', encoding='utf-8') as f:
                    extr_text = f.read()
            else:
                if docs:
                    extractor = Extractor(extractor='ArticleExtractor', html=docs[i])
                else:
                    extractor = Extractor(extractor='ArticleExtractor', url=URL)
                extr_text = extractor.getText()
                # ---------------------------------
                if docs and len(extr_text) < len(docs[i]) * self.threshold_extractor_fails:
                    if docs:
                        exKeepAll = Extractor(extractor='KeepEverythingExtractor', html=docs[i])
                    else:
                        exKeepAll = Extractor(extractor='KeepEverythingExtractor', url=URL)
                    extr_text = exKeepAll.getText()
                # ---------------------------------
                if extracted_cache:
                    with codecs.open('./data/extracted-'+ str(doc_id) +'-cached.txt', 'w', encoding='utf-8') as f:
                        print >>f, extr_text

            doc.words = self.extract_words(extr_text)
            doc.n_words = len(doc.words)

            paragraph = self.re_set_paragraph.sub(u' ', extr_text)
            sentences, poss = self.SS.predict(paragraph)

            # with codecs.open(u'data/outs.txt', 'a',encoding='utf-8') as f_out:
            #      print >>f_out, u'\n\n-----\n', u','.join( [ str(p) for p in poss ] )
            #      # print >>f_out, u','.join( [ str(p) for p in poss2 ] )
            #      for s in sentences:
            #          print >>f_out, s, '\n'

            poss = [0] + list(poss)
            doc.sentences = [ (poss[i], poss[i+1], sentences[i]) for i in xrange(len(poss)-1) ]
            doc.n_sentences = len(doc.sentences)

            indexed_docs.append(doc)
        return indexed_docs
Example #2
File: crawler.py Project: aknirala/crawler
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor',
                                              url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(),
                                                         'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except (KeyError, IndexError):
            logging.debug('div has no width attribute')
    return articles
Example #3
def extract_blog_posts(url_string, PAGES = 48):
    blog_posts = []
    page_count = 0
    
    while(page_count<=PAGES):
        page_count+=1
        url = url_string.format(page_count) # create url
        driver.get(url)
        
        try:        
            article = driver.find_elements_by_tag_name('article')        
            articles_size = len(article)
            print 'processing ', url
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise # Not error we are looking for
            continue
            
        for i in xrange(articles_size):
            headers = article[i].find_elements_by_tag_name("header")
            for header in headers:
                article_a = header.find_elements_by_xpath("//h1/a[@title]")
                print 'extracting ...'
                for e in article_a:
                    extractor = Extractor(extractor = 'ArticleExtractor', url = e.get_attribute('href'))
                    texts = extractor.getText()

                    blog_posts.append({'title': e.text, 'content': clean_html(texts), 'link': e.get_attribute('href')})

    return blog_posts
Example #4
File: kompas.py Project: ciheul/bigcrawler
 def parse_item(self, response):
     response_news = NewsItem()
     response_news['url'] = response.url
     response_news['html'] = Binary(zlib.compress(response.body, 9))
     extractor = Extractor(extractor='ArticleExtractor', html=response.body)
     response_news['content'] = extractor.getText()
     return response_news
Example #5
 def process_text(self, text):
     if text == "":
         return text
     extractor = Extractor(extractor='ArticleExtractor',
                           html=text)
     new_val = extractor.getText()
     return new_val
Example #6
    def parse(self, response):
        hxs = Selector(response)
        
        item = ArticleItem()
        item["title"] = hxs.xpath('//title/text()').extract()
        item["link"] = response.url
        item["source"] = hxs.xpath('//p').extract()
        
        extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
        
        source = extractor.getHTML()
        item["text"] = extractor.getText()
        item["html"] = source
        
        page = html.fromstring(source)
        links = page.xpath("//p//a/@href")

        linkPattern = re.compile(r"^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")
        
        for link in links:
            if linkPattern.match(link) and not link in self.crawled_links:
                self.crawled_links.append(link)
                yield Request(link, self.parse)
        

        yield item
Example #7
	def run(self):
		count = 0
		docCount = self.doc_cursor.count()
		for doc in self.doc_cursor:
			url = doc['url']
			if (self.keepText(url)):
				try:
					extractor = Extractor(extractor='ArticleExtractor', url=url)
					extracted_text = extractor.getText()
				
					if (len(extracted_text) > 0):
						title = extractor.getTitle()
						
						if title != None:
							doc['title'] = title
							doc['extracted_text'] = title + " " + extracted_text
						else:
							doc['extracted_text'] = extracted_text
						self.db_collection.save(doc)
						print 'OK -' + url
				except IOError, err:
					print "IOError with url " + url
					print str(err)
				except (LookupError):
					print "LookupError - Maybe not text or weird encoding " + url
				except (UnicodeDecodeError, UnicodeEncodeError):
					print "UnicodeDecodeError or UnicodeEncodeError- " + url
Example #8
def get_text(url):
    from boilerpipe.extract import Extractor
    try :
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "",""
Example #9
def extract_article(url):
  r = requests.get(url)
  
  # if the url exists, continue
  if r.status_code == 200:
    
    # extract and parse response url
    url = parse_url(r.url)

    # extract html
    html = r.content.decode('utf-8', errors='ignore')

    # run boilerpipe
    BP = Extractor(html=html)

    # run readability
    Rdb = Document(html)

    html = Rdb.summary()
    # return article data
    return {
      'extracted_title': Rdb.short_title().strip(),
      'extracted_content': strip_tags(BP.getText()),
    }

  # otherwise return an empty dict
  else:
    return {}
Example #10
def GOOGLE_get_data(company):

    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)

    content_list = list()

    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except:
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                            "article": content,
                            "link": link,
                            "source": "GOOGLE",
                            "target": company,
                            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
                            

    DBOperation.save_db(content_list)
Example #11
def extract_and_save(url, path):
	try:
		handle = urllib2.urlopen(url)
		html_content = handle.read()
		extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
		text = extractor.getText()
		if text:
			if detect_english(text):
				links = get_all_urls(html_content, url)
				for link in links:
					try:
						handle = urllib2.urlopen(link)
						html_content = handle.read()
						#extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)		
						#text_content = extractor.getText()
						#if text_content:
						#	if detect_english(text_content):
						encoded_url = encode(link)
						f = open(path + "/" + encoded_url, "w")
						f.write(html_content)
						f.close()
					except:
						print url
						traceback.print_exc()
						return None
	except:
		print url
		traceback.print_exc()
		return None
Example #12
def extractor(URL):

    extractor = Extractor(extractor='ArticleExtractor', url=URL)

    data = extractor.getText()

    file = open("data.txt", "w")
    file.write(data.encode('UTF-8'))
    file.close()

    # Split the content into sentences
    with open('data.txt', 'r') as f:
        s = f.read()
        sentences = s.split('.')

    # Empty list of words
    w=[]

    # Split the sentences into words
    for sentence in sentences :
        w.extend(sentence.split(' '))

    print w

    # Return the list of words
    return w
def download_article_file(articleURL, articleFileDirectory, code):
	articleFilePath = articleFileDirectory + code
				
	# Download the article and save as file
	if (articleURL == ""):
		print "ERROR: Empty URL detected! File not created"
		return None
	else:
		# If a directory for files doesn't exist, create it
		dir = os.path.dirname(articleFileDirectory)

		if not os.path.isdir(dir):
			#print "Created directory: " + dir
			os.makedirs(dir)
		
		try:
			#fullArticle = urllib2.urlopen(articleURL)
			#fullArticleText = fullArticle.read()

			# Use boilerpipe to remove boilerplate and formatting
			extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
			fullArticleText = extractor.getText()

			# Test to see if article is in English. If not, then return None
			top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
			if (top_language != 'ENGLISH'):
				print "SKIPPED: Article is in " + top_language
				return None

			outfile = open(articleFilePath, 'w+')			
			outfile.write(fullArticleText.encode('ascii', 'ignore'))
			outfile.close()

			# Use lxml's HTML cleaner to remove markup
			#htmltree = lxml.html.fromstring(fullArticleText)		
			#cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
			#cleaned_tree = cleaner.clean_html(htmltree)
			#return cleaned_tree.text_content()
			return fullArticleText
	

		except urllib2.HTTPError:
			print "ERROR: HTTPError. Article file download skipped: " + articleURL	
			return None

		except urllib2.URLError:
			print "ERROR: URLError. Article file download skipped: " + articleURL	
			return None

		except LookupError:
			print "ERROR: LookupError. Article file download skipped: " + articleURL	
			return None
		
		except UnicodeDecodeError:
			print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
			return None

		except:
	                print "ERROR: ", sys.exc_info()[0]
        	        return None
Example #14
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,) # boilerplate seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://"+tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n");
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")

        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass

        return detag_html_file_bs(infile, outfile, id)
Example #15
def post_index(post):
    extractor = Extractor(extractor='ArticleExtractor', url=post['href'])
    post_text = extractor.getText().replace('\n', ' ')
    url = 'http://localhost:9200/bookmarks/bookmark/%s/_create' % post['hash']
    data = '{"title":"%s", "url":"%s", "text":"%s"}' % (post['description'], post['href'], post_text.replace('"', '\\"'))
    r = requests.put(url, data=data)
    print r.status_code
Example #16
	def parse_page(self, response):
		if response.meta.has_key('crawldepth'):
			depth = response.meta['crawldepth']
		else:
		#       Set search depth here
			depth = 1
		log.msg('Depth = %s' % str(depth), level=log.INFO)
		if not isinstance(response, HtmlResponse):
		    log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
		    return

		log.msg('Response from: %s' % response.url, level=log.INFO)
		url_bf.add(response.url)
	
		# TODO: Extract page title
	
		extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
		cleaned_text = extractor.getText()

		# Eliminate duplicates
		keywordset = set(keywordlist)

		found_list = []
		for keyword in keywordset: # TODO: Is there a more efficient way to do this?
			# Look at word boundaries to match entire words only
			if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
				found_list.append(keyword)

		# Parse this page		
		item = BiffleItem()
		if (len(found_list) > 0):
			item['url'] = response.url
			item['body'] = cleaned_text
			item['keywords'] = ', '.join(found_list)
			item['process_date'] = datetime.today()
			log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
			self.map_keyword_count(found_list)
			yield item

		if (depth > 0):	
			# Find the next requests and yield those
			hxs = HtmlXPathSelector(response)
			links = hxs.select('//a/@href').extract()
			log.msg('Links on page: %s' % len(links), level=log.INFO)
			depth -= 1
			log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
			for l in links:
				l = urlparse.urljoin(response.url, l)
				if (l in url_bf):
					pass
					#log.msg('Duplicate URL found: %s' % l, level=log.INFO)
				else:
					url_bf.add(l)
					#log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
					# Decrement depth for next layer of links
					#callback = lambda response, depth = depth: self.parse_page(response, depth)			
					callback = lambda response: self.parse_page(response)
					request = Request(l, callback=callback)
					request.meta['crawldepth'] = depth
					yield request
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("raw_dir_path")
    parser.add_argument("out_file_path")
    args = parser.parse_args()

    f_names = [(int(f), f) for f in listdir(args.raw_dir_path)]
    f_names = sorted(f_names)
    fout = open(args.out_file_path, 'w')

    for int_f_name, f_name in f_names:
        trec_reader = TrecReader(join(args.raw_dir_path, f_name))
        empty_cnt = 0
        err_cnt = 0

        for docno, html_text in trec_reader:
            if not html_text:
                empty_cnt += 1
                continue
            try:
                extractor = Extractor(extractor='ArticleExtractor', html=html_text)
                text = extractor.getText()
                text = text.replace('\n', ' ').replace('\t', ' ')
                text = text.encode('ascii', 'ignore')
                text = text_clean(text)
                if text:
                    fout.write(docno + '\t' + text + '\n')
                else:
                    empty_cnt += 1
            except Exception as e:
                err_cnt += 1

    fout.close()
    print empty_cnt, err_cnt
Example #18
File: extract.py Project: ViDA-NYU/memex
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print "Exception"
        return None
Example #19
def Process(DocIn,OutName):
    out = open(OutName,'w')
    
    logging.info('reading [%s]', DocIn)
    ErrCnt = 0
    EmptyCnt = 0
    for cnt,line in enumerate(open(DocIn)):
        vCol = line.strip().split('\t')
        DocNo = vCol[0]
        RawHtml = ' '.join(vCol[1:])
        RawHtml = DiscardHTMLHeader(RawHtml)
        if "" == RawHtml:
            EmptyCnt += 1
            continue
        try:
            extractor = Extractor(extractor='ArticleExtractor',html=RawHtml)
            text = extractor.getText()
            text = text.replace('\n',' ').replace('\t',' ')
            text = text.encode('ascii','ignore')
            text = TextClean(text)
            if "" != text:
                print >>out, DocNo + '\t' + text
            else:
                EmptyCnt += 1
#             print DocNo + '\t' + text.encode('ascii','ignore')
        
        except Exception as e:
            ErrCnt += 1
            
        if 0 == (cnt % 100):
            logging.info('parsed [%d] doc [%d] Err [%d] Empty', cnt,ErrCnt,EmptyCnt)

    out.close()
    logging.info('finished [%d] doc [%d] Err', cnt,ErrCnt)
Example #20
File: api.py Project: ciheul/ciheul
    def dehydrate(self, bundle):
        """GET Method"""
        
        #print bundle.data['content']
        if bundle.data['content']:
            extractor = Extractor(extractor='ArticleExtractor', html=bundle.data['content'])
            bundle.data['content'] = extractor.getText()

        try:
            article_stats = ArticleStat.objects.filter(article_id=bundle.obj.id)
            bundle.data['stat'] = {
                'reads': sum(map(lambda x: x.reads, article_stats)),
                'likes': sum(map(lambda x: x.likes, article_stats)),
                'dislikes': sum(map(lambda x: x.dislikes, article_stats)),
                'shares': sum(map(lambda x: x.shares, article_stats)),
            }
        except ObjectDoesNotExist:
            bundle.data['stat'] = {
                'reads': 0, 
                'likes': 0, 
                'dislikes': 0,
                'shares': 0,
            }

        # no cookies or no sessionid field in cookies, then just send normal
        # newsfeed to anonymous user
        #if not bundle.request.COOKIES or not bundle.request.COOKIES['sessionid']:
        if not bundle.request.COOKIES or not 'sessionid' in bundle.request.COOKIES:
            return bundle

        try:
            # even if there is a cookie, sessionid field might be not exist,
            # then it is also anonymous user
            s = get_current_session(bundle.request.COOKIES['sessionid'])
            if s is None or 'user_id' not in s:
                return bundle

            # get activity information whether user has already
            # read/liked/shared
            activity = Activities.objects.get(user_id=s['user_id'], \
                article_id=bundle.obj.id)

            # assign information 
            bundle.data['activity'] = {
                'read': activity.like or activity.share,
                'like': activity.like,
                'dislike': activity.dislike,
                'share': activity.share
            }
        except ObjectDoesNotExist:
            # assign False if the news has never been opened
            bundle.data['activity'] = {
                'read': False, 
                'like': False, 
                'dislike': False, 
                'share': False
            }

        return bundle
Example #21
File: scrap.py Project: ViDA-NYU/memex
def extract_text(html_content):
  try:
    extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
    #print extractor.getText()
    return extractor.getText()
  except:
    print "Exception in html extraction"
    return None
Example #22
File: main.py Project: sysofwan/zapfeeds
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
    except Exception:
        logger.error('Error extracting article html')
        text_string = ''
    return text_string
Example #23
def get_news_by_url(url):
    print "Come to get_news_by_url"
    article = {}
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))
        "Get the title of News"
        title = ""
        titleElements = soup.findAll(id="disqus_title")
        for ele in titleElements:
            title = ele.getText().encode('utf-8')
        article["title"] = title 
        print title
        
        "Get the posttime of News,Timezone ET"
        postTime = ""
        postTimeElements = soup.findAll(attrs={'class':"datestamp"})
        for ele in postTimeElements:
            timeStamp = float(ele["epoch"])
        postTime = datetime.fromtimestamp(timeStamp/1000)
        article["post_time"] = postTime
        
        "Initiate the post date"
        postDay = postTime.date()
        article["post_date"] = postDay;
        
        "Get the author information "
        author = ""
        authorElements = soup.findAll(attrs={'class':"byline"})
        for ele in authorElements:
            author = ele.contents[0].strip().replace("By","").replace("-","").replace("and", ",").strip();
        article["author"] = author
        
        "Get the content of article"
        extractor=Extractor(extractor='ArticleExtractor',url=url)
        content = extractor.getText().encode("utf-8")
        article["content"] =  content
        
        "Initiate the Sources"
        source = "Bloomberg News"
        article["source"] = source
        
        "Initiate the update_time"
        updateTime = datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
        article["update_time"] = updateTime
        
        "Initiate the embers_id"
        embersId = hashlib.sha1(content).hexdigest()
        article["embers_id"] =  embersId

        "settup URL"
        article["url"] =  url
    except:
        print "Error: %s" %sys.exc_info()[0]
        article = {}
    finally:
        return article
Example #24
File: miner.py Project: lidsky/alienknows
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
        text_string = unicodedata.normalize('NFKD', text_string).encode('ascii','ignore')
    except Exception:
        print 'Error extracting article html'
        text_string = ''
    return text_string
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()

    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted  html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''
    n.assert_greater(len(extracted_text), min_str_length)
Example #26
def html_to_text(html):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html)
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s',
                         e.__class__.__name__, e)
        return ''
    text = extractor.getText()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return text
 def fetch_articles(self):
     greq_gen = (grequests.get(u, headers=self.header,) for u in self.urls)
     responses = grequests.map(greq_gen)
     for i,res in enumerate(responses):
         if res is not None:
             extractor = Extractor(html=res.text)
             self.entries[i]['text'] = extractor.getText()
             if '...' in self.entries[i]['title']:
                 self.entries[i]['title'] = extractor.getTitle()
     
     return True
Example #28
File: article.py Project: lasoren/hivemind
	def extract(self, article):
		try:
			extractor = Extractor(extractor='ArticleSentencesExtractor', url=article.url)
		except Exception as e:
			return ''
		article_text = ''
		try:
			article_text = extractor.getText()
		except Exception:
			pass
		return article_text.encode('utf-8')
def main():
  contents = sys.argv[1]
  for url in listdir(contents):
    print url
    with codecs.open(url, "w", encoding="utf-8") as out:
      try:
        html = urlopen(url.replace("{", "/")).read()
        extracted = Extractor(html=html)
        out.write(extracted.getText())
      except HTTPError:
        out.write("")
Example #30
def boiler():
    from boilerpipe.extract import Extractor
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        output_filename = 'boilerpipe/' + str(i) + '.txt'
        input_file = open(input_filename, 'r')
        s = input_file.read()
        input_file.close()
        extractor = Extractor(extractor='ArticleExtractor', html=s.decode('GBK', 'ignore'))
        output_file = open(output_filename, 'wb')
        output_file.write(extractor.getText().encode('utf-8'))
        output_file.close()
Example #31
def summarize(url=None, html=None, n=100, cluster_threshold=5, top_sentences=5):

    # Adapted from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
    #
    # Parameters:
    # * n  - Number of words to consider
    # * cluster_threshold - Distance between words to consider
    # * top_sentences - Number of sentences to return for a "top n" summary
            
    # Begin - nested helper function
    def score_sentences(sentences, important_words):
        scores = []
        sentence_idx = -1
    
        for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
    
            sentence_idx += 1
            word_idx = []
    
            # For each word in the word list...
            for w in important_words:
                try:
                    # Compute an index for important words in each sentence
    
                    word_idx.append(s.index(w))
                except ValueError as e: # w not in this particular sentence
                    pass
    
            word_idx.sort()
    
            # It is possible that some sentences may not contain any important words
            if len(word_idx)== 0: continue
    
            # Using the word index, compute clusters with a max distance threshold
            # for any two consecutive words
    
            clusters = []
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < cluster_threshold:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)
    
            # Score each cluster. The max score for any given cluster is the score 
            # for the sentence.
    
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster * significant_words_in_cluster / total_words_in_cluster
    
                if score > max_cluster_score:
                    max_cluster_score = score
    
            scores.append((sentence_idx, max_cluster_score))
    
        return scores    
    
    # End - nested helper function
    
    extractor = Extractor(extractor='ArticleExtractor', url=url, html=html)

    # It's entirely possible that this "clean page" will be a big mess. YMMV.
    # The good news is that the summarize algorithm inherently accounts for handling
    # a lot of this noise.

    txt = extractor.getText()
    
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items() 
            if w[0] not in nltk.corpus.stopwords.words('english')][:n]

    scored_sentences = score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter

    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-top_sentences:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries

    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
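
A hedged usage sketch for the summarize() helper above (the URL below is an illustrative placeholder, and nltk's 'punkt' and 'stopwords' data are assumed to be downloaded; this driver is not part of the original snippet):

if __name__ == '__main__':
    # Hypothetical article URL used only for illustration.
    post_url = 'http://example.com/some-article.html'
    summaries = summarize(url=post_url, n=100, cluster_threshold=5, top_sentences=5)
    print('--- Top N sentence summary ---')
    print(' '.join(summaries['top_n_summary']))
    print('--- Mean-scored summary ---')
    print(' '.join(summaries['mean_scored_summary']))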
Example #32
def bp_extract(url):
    extr = Extractor(extractor='ArticleExtractor', url=url)
    text = extr.getText()
    print(text)
Example #33
def textify(html_text, extractor="raw", encoding="UTF8"):

    if not isinstance(html_text, unicode):
        try:
            html_text_unicode = unicode(html_text, encoding)
        except UnicodeDecodeError:
            try:
                html_text_unicode = unicode(html_text, 'utf-8')
            except UnicodeDecodeError:
                try:
                    html_text_unicode = unicode(html_text, 'iso-8859-1')
                except UnicodeDecodeError:
                    try:
                        html_text_unicode = unicode(html_text, 'cp1252')
                    except UnicodeDecodeError as e:
                        print "ERROR conv to unicode", e
                        html_text_unicode = ""
    else:
        html_text_unicode = html_text
    if not html_text_unicode:
        return ""

    if extractor.lower() != "raw":
        try:
            from boilerpipe.extract import Extractor
            bp = Extractor(extractor=extractor, html=html_text_unicode)
            return bp.getText()
        except Exception:
            try:
                bp = Extractor(extractor=extractor, html=html_text_unicode)
                return bp.getText()
            except Exception as e:
                sys.stderr.write(
                    "ERROR running %s boilerpipe on %s:\n%s: %s\n" %
                    (extractor, html_text, type(e), e))
                return ""
        del bp
    else:
        text = html_text_unicode

    ### Entity Nonsense from A. Swartz's html2text http://www.aaronsw.com/2002/html2text/html2text.py ###

    def name2cp(k):
        if k == 'apos': return ord("'")
        if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
            return htmlentitydefs.name2codepoint[k]
        else:
            k = htmlentitydefs.entitydefs[k]
            if k.startswith("&#") and k.endswith(";"):
                return int(k[2:-1])  # not in latin-1
            return ord(codecs.latin_1_decode(k)[0])

    def charref(name):
        if name[0] in ['x', 'X']:
            c = int(name[1:], 16)
        else:
            c = int(name)
        try:
            return unichr(c)
        except NameError:  #Python3
            return chr(c)

    def entityref(c):
        try:
            name2cp(c)
        except KeyError:
            return "&" + c + ';'
        else:
            try:
                return unichr(name2cp(c))
            except NameError:  #Python3
                return chr(name2cp(c))

    def replaceEntities(s):
        s = s.group(1)
        if s[0] == "#":
            return charref(s[1:])
        else:
            return entityref(s)

    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

    def unescape(s):
        s = s.replace('&nbsp;', ' ')
        return r_unescape.sub(replaceEntities, s)

    ### End Entity Nonsense ###

    re_clean_comments = re.compile(r'<!--.*?-->', re.I | re.DOTALL)
    re_clean_javascript = re.compile(r'<script[^>]*/?>.*?</script>',
                                     re.I | re.DOTALL)
    re_clean_style = re.compile(r'<style[^>]*/?>.*?</style>', re.I | re.DOTALL)
    re_clean_balises = re.compile(r'<[/!?]?\[?[a-z0-9\-]+[^>]*>',
                                  re.I | re.DOTALL)
    #re_clean_blanks = re.compile(r'[ \s]+')
    re_clean_blanks = re.compile(r'[ \t\f\v]+')
    re_clean_multiCR = re.compile(r'( ?[\n\r]+)+', re.M)
    try:
        text = unescape(text)
        text = re_clean_blanks.sub(' ', text)
        text = re_clean_comments.sub(' ', text)
        text = re_clean_javascript.sub(' ', text)
        text = re_clean_style.sub(' ', text)
        text = re_clean_balises.sub(' ', text)
        text = re_clean_blanks.sub(' ', text).strip()
        text = re_clean_multiCR.sub('\n\r', text)
    except:
        pass

    return text
Example #34
def unpack_line(line):
    line = string.replace(line, " ", " ")
    els = string.split(line, " ")
    url = els[2]
    categories = els[1]
    number = els[0]
    return number, categories, url


file_base = open('Main_base.txt', 'r')
Line = file_base.readlines()
file_extract = open('Dbase_exctract.txt', 'w')

for line in Line:
    try:
        number1, adress1, sites1 = unpack_line(line)
        number = number1.strip("\n")
        adress = adress1.strip("\n")
        sites = sites1.strip("\n")
        extractor = Extractor(extractor='ArticleExtractor', url=sites)
        file_ex = open(
            'ExtractSites/' + adress + '/' + number + 'forclass.txt', 'w')
        file_ex.write(extractor.getText().encode("UTF-8"))
        file_ex.close()
        file_extract.write(line)
    except:
        print line

file_base.close()
file_extract.close()
with open('finalurls.txt') as fp:
    createDirectories()
    with open('output/hashedUrls.csv', 'w') as csvfile:
        fieldnames = ['url', 'key']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for line in fp:
            md5value = hashlib.md5(line.strip().encode('utf-8')).hexdigest()
            writer.writerow({'url': line.strip(), 'key': md5value})
            print(count, ":", md5value)
            count = count + 1
            try:
                rawhtml = urllib.request.urlopen(line.strip()).read()
                with open('output/rawHtml/%s.html' % md5value,
                          'w+',
                          encoding='utf-8') as rawf:
                    print(rawhtml, file=rawf)

                extractor = Extractor(extractor='ArticleExtractor',
                                      html=rawhtml)
                htmlText = extractor.getText()
                with open('output/processedHtml/%s.txt' % md5value,
                          'w+',
                          encoding='utf-8') as processedf:
                    print(htmlText, file=processedf)
                    # print(htmlText)
            except KeyboardInterrupt:
                exit()
            except:
                pass
Example #36
def extract_rss_articles(rss):

    new_entries_inserted = 0
    try:
        #rss parser
        rss_feed = feedparser.parse(rss)

    except:
        logging.warn('Warn: Parsing failed for rss source={}'.format(rss))
        return 0

    for entry in rss_feed['entries']:

        #title extracted
        if 'title' in entry.keys():
            title = entry.title
        else:
            continue

        #link extracted
        if 'link' in entry.keys():
            link = entry.link
            source = link.split("//")[-1].split("/")[0]
        else:
            continue
        id = hashlib.md5((title + link).encode("utf-8")).hexdigest()

        if new_id(id):

            #date of publish of article extracted
            if 'published_parsed' in entry.keys():
                published_date = entry.published_parsed
                published_date = datetime.fromtimestamp(
                    mktime(published_date)).isoformat()
                published_date = published_date.split("T")[0]
            else:
                published_date = "0000-00-00"
            #print(published_date)

            #summary of article extracted
            if 'summary' in entry.keys():
                summary = entry['summary']
            else:
                summary = ""
            TAG_RE = re.compile(r'<[^>]+>')
            summary = TAG_RE.sub('', summary)

            #extract full content of article
            content = ""
            if rss != "https://services.india.gov.in/feed/rss?cat_id=12&ln=en":
                if rss == "http://goidirectory.nic.in/rss/minstry_rss.php?categ_id=1":
                    try:
                        response = requests.get(link)
                        paragraphs = justext.justext(
                            response.content, justext.get_stoplist("English"))
                        for paragraph in paragraphs:
                            if not paragraph.is_boilerplate:
                                content = content + paragraph.text
                    except:
                        content = ""
                else:
                    try:
                        extractor = Extractor(
                            extractor='ArticleSentencesExtractor', url=link)
                        content = extractor.getText()
                    except:
                        content = ""

            else:
                content = summary

            if content == "" or content == "unknown":
                continue

            #insert article into database
            try:
                cursor.execute('use main_database')
                cursor.execute(
                    'insert english_database values (%s,%s,%s,%s,%s,%s,%s)',
                    (id, published_date, title, link, source, summary,
                     content))
                logging.info(
                    'Info: New Article pushed into database from {}'.format(
                        source))
                conn.commit()
                print("Article Fetched")

            except Exception as error:
                logging.info(
                    'Warn: Article cannot be pushed from source {}, error={}'.
                    format(source, error))
                continue

            #insert the link of this article into viewed_links.txt, since it has been viewed
            with open('viewed_articles_ids.txt', 'a') as f:
                f.write('{}\n'.format(id))
            new_entries_inserted = new_entries_inserted + 1
    print("rss source processed")
    #return count of new entries inserted into database
    return new_entries_inserted
Example #37
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement

import sys
import os

from boilerpipe.extract import Extractor

sys.path.insert(0, os.path.abspath('..'))

from clint import args

if __name__ == '__main__':

    html_file = args.get(0)
    html = open(html_file).read()
    extractor = Extractor(extractor='ArticleExtractor', html=html)
    print extractor.getText().encode('utf-8')
Example #38
def parse_readings():
    """
    Reads from list generated by parse_course().
    Reads each readings page from scrape_readings().
    Parses the HTML.
    Writes to JSON.
    """

    # Use the json list of readings if it exists
    try:
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    # otherwise, generate it
    except FileNotFoundError:
        parse_course()
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())

    # Create lists to hold readings
    reading_list = []
    pdf_list = []
    error_list = []

    for reading in readings:
        # Skip pdf files
        if '.pdf' in reading['url']:
            pdf_list.append(reading)

        else:
            # Container for parsed data
            reading_item = {}

            # Use goldfinch to make a valid filename from the URL
            filename = vfn(reading['url'], initCap=False).decode()

            # Initialize a newspaper article
            # url is  empty because we don't need newspaper to do any scraping
            # but it's a required property
            article = Article(url='')

            # Open the local version of the HTML file
            try:
                with open('%s/%s/%s' % (data_dir, readings_html_dir, filename),
                          'r') as htmlfile:
                    # Save both the raw html and add it to the article
                    raw_html = htmlfile.read()
                    article.set_html(raw_html)
            except FileNotFoundError:
                print('Error reading saved html file')

            # Use newspaper to do the parsing
            article.parse()

            reading_item['title'] = article.title
            reading_item['authors'] = article.authors

            # Set iso string version of date if it exists.
            # It needs to be a string because we'll be exporting to JSON
            reading_item['pub_date'] = article.publish_date.isoformat() \
                if article.publish_date else None

            # Usually newspaper's extractor works best
            reading_item['n_text'] = article.text

            # But when it fails, we may want to use boilerpipe extraction as
            # a fallback
            extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
            reading_item['b_text'] = extractor.getText()

            # print('Newspaper words: %s' % len(reading_item['n_text'].split()))
            # print('Boilerpipe words: %s' % len(reading_item['b_text'].split()))

            # if(reading_item['text'] == ''):
            #     extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
            #     reading_item['text'] = extractor.getText()

            # Add the parsed data to our existing reading data
            reading['page'] = reading_item

            # Note failed parses
            if (reading_item['n_text'] == '' and reading_item['b_text'] == ''):
                print('Could not parse text for %s' % reading['url'])
                error_list.append(reading)
            else:
                reading_list.append(reading)

    print('Successfully parsed readings: %s' % len(reading_list))

    print('Skipped PDF readings: %s' % len(pdf_list))

    print('Articles without parseable text: %s' % len(error_list))

    # print('Articles without authors: %s' % len([
    #     reading for reading in reading_list
    #     if reading['page']['authors'] == []]))

    # print('Articles without dates: %s' % len([
    #     reading for reading in reading_list
    #     if reading['page']['pub_date'] is None]))

    # Write to json file
    with open('%s/%s' % (data_dir, readings_file), 'w') as jsonfile:
        jsonfile.write(json.dumps(reading_list))
class WebStatic:
    def __init__(self):
        self.URL = ''
        self.extractor = ''

    def setUrl(self, URL):
        self.URL = URL

    def getTextWeb(self):
        self.extractor = Extractor(extractor='KeepEverythingExtractor',
                                   url=self.URL)
        return self.extractor.getText()

    def getArticleText(self):
        self.extractor = Extractor(extractor='ArticleExtractor', url=self.URL)
        return self.extractor.getText()

    def getNews(self):
        self.extractor = Extractor(extractor='KeepEverythingExtractor',
                                   url=self.URL)
        buffer = list(self.extractor.getText().split(' '))
        buffer_two = []
        isnews = False
        pattern = '.\s\d\d.\d\d.\d{4}'
        for item in list(buffer):
            item = str(item).split()
            item = ' '.join(item)
            if re.search(pattern, item):
                isnews = True
                item = str(item).split(' ')
                buffer_two.append(item[0])
                item = item[1]
            if item == '':
                isnews = False
            if isnews:
                buffer_two.append(item)
        buffer_two.pop(0)
        pattern_year = '\d\d.\d\d.\d{4}'
        self.news = []
        newses = ''
        isnew = False
        for item in buffer_two:
            if re.search(pattern_year, item):
                newses = newses.replace('!', '').replace(',', '').replace(
                    '«', '').replace('»', '').replace(':',
                                                      '').replace('–', ' ')
                self.news.append(newses)
                newses = ''
                isnew = True
                continue
            if isnew:
                newses = newses + '' + item
                isnew = False
            else:
                newses = newses + ' ' + item
        self.news.pop(0)
        return self.news

    def getRelevantNews(self):
        # Define your query here
        QUERY_TERMS = ['стол', 'кубка', 'регион']
        # get the list of news items
        self.news = self.getNews()
        # TextCollection defines the tf, idf and tf_idf abstractions,
        # so we do not need to define our own versions
        tc = nltk.TextCollection(self.news)
        relevant = []
        for idx in range(len(self.news)):
            score = 1
            for term in [t.lower() for t in QUERY_TERMS]:
                score += tc.tf_idf(term, self.news[idx])
            if score > 0:
                relevant.append({'score': score, 'title': self.news[idx]})
        # Sort the results by relevance and print them
        relevants = sorted(relevant, key=lambda p: p['score'], reverse=True)
        for post in relevants:
            print('{0}'.format(post['title']))
        return relevants

    def getCollocation(self):
        # Number of collocations to look for
        N = 10
        all_tokens = [
            token for post in self.news for token in post.lower().split()
        ]
        for word in self.news:
            all_tokens.append(word.lower())
        finder = nltk.BigramCollocationFinder.from_words(all_tokens)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(
            lambda w: w in nltk.corpus.stopwords.words('english'))
        scorer = association.BigramAssocMeasures.jaccard
        collocations = finder.nbest(scorer, N)
        for collocation in collocations:
            c = ' '.join(collocation)
            print(c)

    def getMatrixDiag(self):
        vector = TfidfVectorizer(analyzer='word',
                                 norm=None,
                                 use_idf=True,
                                 smooth_idf=True)
        tfIdf = vector.fit_transform(self.news)
        sim = cosine_similarity(tfIdf, tfIdf)
        newsList = []
        x = 1
        for i in self.news:
            newsList.append(str(x))
            x = x + 1
        simDf = pd.DataFrame(sim,
                             index=sorted(newsList),
                             columns=sorted(newsList))
        f = plt.figure(figsize=(19, 15))
        plt.matshow(simDf.corr(), fignum=f.number)
        plt.xticks(range(simDf.shape[1]),
                   simDf.columns,
                   fontsize=14,
                   rotation=45)
        plt.yticks(range(simDf.shape[1]), simDf.columns, fontsize=14)
        cb = plt.colorbar()
        cb.ax.tick_params(labelsize=14)
        plt.title('Косинусное сравнение новостей', fontsize=16)
        plt.show()
        print(simDf)
Example #40
 def bpLargGetText(self):
     extractor = Extractor(extractor='LargestContentExtractor',
                           url=self.url)
     extracted = extractor.getText()
     return extracted
Example #41
 def bpArtGetText(self):
     extractor = Extractor(extractor='ArticleExtractor', url=self.url)
     extracted = extractor.getText()
     return extracted
Example #42
linksDict = {}
linksFile = open('1000TwitterLinks.txt','r')
for link in linksFile:
	if(link == ''):
		pass
	else:
		try:
			curlCommand = 'curl ' + link
			hash_object = hashlib.md5(link)
			print(hash_object.hexdigest() + '.html')
			htmlFile = hash_object.hexdigest() + ':htmlFile'
			textFile = hash_object.hexdigest() + ':txt'
			f = open(htmlFile, "w")
			raw_html = subprocess.call(curlCommand, shell=True, stdout=f)
			extractor = Extractor(extractor='ArticleExtractor', url=link)
			with open(textFile, 'w') as the_file:
				the_file.write(str(extractor.getText()))
				linksDict[textFile] = link
				print str(extractor.getText())

		except KeyboardInterrupt:
			exit()		
		except:
			pass


with open('textURLFile', 'w') as file:
	for key,value in linksDict.items():
		file.write('%s:%s\n' % (key, value))	
		
		
Example #43
def extract_main_text(html_text):
    extractor=Extractor(extractor='ArticleExtractor',html=html_text)
    extracted_text=extractor.getText()
    return extracted_text
Example #44
			print(hash_object.hexdigest() + '.html')



			htmlFile = os.path.join("sourceHTML_data", hash_object.hexdigest() + ':html')
			textFile = os.path.join("sourceTXT_data", hash_object.hexdigest() + ':txt')
			#file_to_open = os.path.join(data_folder, "raw_data.txt")

			#htmlFile = hash_object.hexdigest() + ':html'
			#textFile = hash_object.hexdigest() + ':txt'
			
			extractor = Extractor(extractor='ArticleExtractor', url=link)
				#print (str(extractor.getText()))
			
			
			if (len(str(extractor.getText())) > 0):
				#open(htmlFile, "w")
				f = open(htmlFile, "w")
				raw_html = subprocess.call(curlCommand, shell=True, stdout=f)
				#htmlFile.write(str(extractor.getHTML()))
				with open(textFile, 'w') as the_file:
					the_file.write(str(extractor.getText()))
					linksDict[textFile] = link
					print (str(extractor.getText()))
				#linksDict[html] = link
			else:
				print("yes")


		except KeyboardInterrupt:
			exit()		
Example #45
def main():
    data = get_train() + get_test()

    f = file('generated/extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        #  parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()

                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)

            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue

            data['meta-' + prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >> f, json.dumps(data)

    f.close()
Example #46
from boilerpipe.extract import Extractor
from urllib.parse import urlparse
import glob
import os

path = 'C:\\RawHtmls/*.txt'
urlCounter = 0
files = glob.glob(path)
for file in files:
    try:
        urlCounter = urlCounter + 1
        f = open(file, 'r')
        fileName = "C:\\ProcessedText\\" + os.path.basename(f.name)
        currentHtml = f.read()
        f.close()
        extractor = Extractor(extractor='ArticleExtractor', html=currentHtml)
        currentText = extractor.getText()
        output_file = open(fileName, "w")
        output_file.write(str(currentText.encode("utf-8")))
        output_file.close()
        print("Download Completed : " + fileName)
    except:
        print('Error :', urlCounter)
Example #47
	if count == END:
		break
	if line[0] == "=":
		if found == False and count >= START:
			print("Critical Error:"+title+" has no url that passed the filter!")
			log.write(title+"\n")
		title = line.strip('\n').strip("=")
		count += 1
		found = False
	else:
		if not found and count >= START:
			if check(line,KEY_WORDS,NEGATIVE_KEY_WORDS):
				output = open(OUTPUT_KEYWORD+title+".txt",'w')
				try:
					extractor = Extractor(extractor='DefaultExtractor', url=line)
					txt = extractor.getText().encode('utf-8')
					print(len(txt))
					if len(txt) > 2500:
						output.write(txt)
						output.close()
						if len(txt) < 4000:
							print("Succeed,collecting another policy:"+title)
							title += "*"
						else:
							found = True
							print("Succeeded:"+title)
				except:
					print("Error:"+title+" request failed")


Example #48
#f = open(html, 'r')
#html = f.read()
#
#print html
#
#DefaultExtractor = Extractor(extractor='DefaultExtractor', html=html)
#print "DefaultExtractor:\n" + DefaultExtractor.getText() + "\n"
#
#ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor', html=html)
#print "ArticleSentencesExtractor:\n" + ArticleSentencesExtractor.getText() + "\n"

#DefaultExtractor = Extractor(extractor='DefaultExtractor', url=url)
#print "DefaultExtractor:\n" + DefaultExtractor.getText() + "\n"
#
ArticleExtractor = Extractor(extractor='ArticleExtractor', url=url)
print "ArticleExtractor:\n" + ArticleExtractor.getText() + "\n"

ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor',
                                      url=url)
print "ArticleSentencesExtractor:\n" + ArticleSentencesExtractor.getText(
) + "\n"

#KeepEverythingExtractor = Extractor(extractor='KeepEverythingExtractor', url=url)
#print "KeepEverythingExtractor:\n" + KeepEverythingExtractor.getText() + "\n"
#
##KeepEverythingWithMinKWordsExtractor = Extractor(extractor='KeepEverythingWithMinKWordsExtractor', url=url)
##print "KeepEverythingWithMinKWordsExtractor:\n" + KeepEverythingWithMinKWordsExtractor.getText() + "\n"
#
#LargestContentExtractor = Extractor(extractor='LargestContentExtractor', url=url)
#print "LargestContentExtractor:\n" + LargestContentExtractor.getText() + "\n"
#
Example #49
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


path = 'content/'
texts = {}
print('Start: ' + parser_type)
train_data = pd.read_csv('train_groups.csv', dtype=np.int16)
for filename in tqdm(listdir(path)):
    doc_id = int(filename.strip('.dat'))
    if doc_id not in train_data.doc_id.values:
        continue
    with codecs.open(path + filename, 'r', 'utf-8') as f:
        url = f.readline().strip()
        html = f.read()
        extractor = Extractor(extractor=parser_type, html=html)
        s = extractor.getText()
        s = s.replace('\n', " ")
        s = s.replace('\t', " ")
        s = s.replace('\r', " ")
        texts[doc_id] = s

train_data['text'] = train_data.apply(lambda row: texts[row.doc_id], axis=1)

save_obj(train_data, 'train_data' + parser_type)
Example #50
def test_extraction():
    extractor = Extractor(extractor='ArticleExtractor', url='http://paulgraham.com/startupideas.html')
    print 'extractor created'
    print extractor.getText()
Example #51
def extraction(link):
    extractor = Extractor(extractor='ArticleExtractor', url=link)
    extracted_text = extractor.getText()
    if extracted_text != "" or extracted_text != None:
        news_text.append(extracted_text)
Example #52
from boilerpipe.extract import Extractor

if __name__ == '__main__':
    URL = 'http://programmingisterrible.com/post/112612689998/san-francisco-for-londoners'
    extractor = Extractor(extractor='ArticleExtractor', url=URL)
    print extractor.getText()
Example #53
import os
from boilerpipe.extract import Extractor

# creating directory using os library
os.mkdir("processed")

count = 1

while (count < 1001):
    with open('raw_html/%s.html' % count, 'r+', encoding='utf-8') as fp:
        #reading the collected html files from previous step
        extractor = Extractor(extractor='ArticleExtractor', html=fp.read())
        #extracting non-html content
        processed = extractor.getText()
        with open('processed/%s.txt' % count, 'w',
                  encoding='utf-8') as outfile1:
            outfile1.write(processed)

    count = count + 1
Example #54
    - DefaultExtractor
    - ArticleExtractor
    - ArticleSentencesExtractor
    - KeepEverythingExtractor
    - KeepEverythingWithMinKWordsExtractor
    - LargestContentExtractor
    - NumWordsRulesExtractor
    - CanolaExtractor
"""

url = 'https://techcrunch.com/2017/02/13/mit-speech-chip/'  # BadStatusLine from boilerpipe for this url

url = "http://www.forbes.com/sites/trevorclawson/2017/02/23/finding-a-voice-can-a-uk-startup-compete-with-its-heavy-hitters-in-the-speech-recognition-market/"

url = "https://nakedsecurity.sophos.com/2017/03/03/researcher-uses-googles-speech-tools-to-skewer-google-recaptcha/"

url = "http://www.natureworldnews.com/articles/32595/20161123/microsoft-officially-makes-first-humanly-accurate-speech-recognition-tech.htm"

url = "http://www.businessinsider.com/ibm-edges-closer-to-human-speech-recognition-2017-3"
#ArticleExtractor = Extractor(extractor='ArticleExtractor', url=url)
#print "ArticleExtractor:\n" + ArticleExtractor.getText() + "\n"

ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor',
                                      url=url)
print ArticleSentencesExtractor.getText()

article = Goose().extract(url=url)
print article.cleaned_text

document = Document(requests.get(url).content)
document.content()
Example #55
File: test6.py Project: ayiis/coding
import q
import requests
from readability import Document
from boilerpipe.extract import Extractor

url = 'https://news.cnblogs.com/n/624615/'
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'
url = 'http://forthxu.com/blog/article/73.html'
url = 'http://forthxu.com/blog/article/91.html'
url = 'http://forthxu.com/blog/article/gmail-sub-account.html'

response = requests.get(url)

doc = Document(response.content)

print(doc.title())

s_html = doc.summary(True)

print("s_html:", s_html)

extractor = Extractor(extractor='ArticleExtractor', html=s_html)
# extractor = Extractor(extractor='ArticleExtractor', url=url)

extracted_text = extractor.getText()

print("extracted_text:", extracted_text)

# extracted_html = extractor.getHTML()

q.d()
Example #56
import asyncio
from boilerpipe.extract import Extractor
from helpers.extractors import extractors

url = "https://dpstele.com"
extractor = extractors["article_sentences"]

ext = Extractor(extractor=extractor, url=url)

print(ext.getText())
successful_text_list = []
successful_url_list = []
unsuccessful_url_list = []
for url in url_list:
    try:
        r = requests.get(url, timeout=timeout)
        if r.status_code != 200:
            unsuccessful_url_list.append(url)
            continue
        html = r.text
    except:
        unsuccessful_url_list.append(url)
        continue

    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html)
        text = extractor.getText().replace('\\', '').strip().replace(
            '\r', ' ').replace('\n', ' ')
    except:
        unsuccessful_url_list.append(url)
        continue

    successful_text_list.append(text)
    successful_url_list.append(url)

#
# write files
#
print()
print('There were ' + str(len(unsuccessful_url_list)) +
      ' unsuccessful webpage downloads. These URLs are listed in ' +
      output_directory + '/unsuccessful_url_list.txt')
f = open(output_directory + '/unsuccessful_url_list.txt', 'w')
Example #58
 def update_content_by_url(self):
     from boilerpipe.extract import Extractor
     extractor = Extractor(extractor='ArticleExtractor', url=self.url)
     self.content_html = extractor.getHTML()
     self.content_text = extractor.getText()
Example #59
File: feeds.py Project: stephegn/rss
def getArticleProcItem(link):
    #request the url
    extractor = Extractor(extractor='ArticleExtractor', url=link)
    text = extractor.getText()
    return ProcessingItem(text)
Example #60
def remove_boiler(htmlD):
    extractor = Extractor(extractor='DefaultExtractor', html=htmlD)
    text = extractor.getText().encode('ascii', 'ignore').decode('ascii')
    return text