def download_article_file(articleURL, articleFileDirectory, code):
    articleFilePath = articleFileDirectory + code

    # Download the article and save as file
    if (articleURL == ""):
        print "ERROR: Empty URL detected! File not created"
        return None
    else:
        # If a directory for files doesn't exist, create it
        dir = os.path.dirname(articleFileDirectory)
        if not os.path.isdir(dir):
            #print "Created directory: " + dir
            os.makedirs(dir)

        try:
            #fullArticle = urllib2.urlopen(articleURL)
            #fullArticleText = fullArticle.read()

            # Use boilerpipe to remove boilerplate and formatting
            extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
            fullArticleText = extractor.getText()

            # Test to see if article is in English. If not, then return None
            top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
            if (top_language != 'ENGLISH'):
                print "SKIPPED: Article is in " + top_language
                return None

            outfile = open(articleFilePath, 'w+')
            outfile.write(fullArticleText.encode('ascii', 'ignore'))
            outfile.close()

            # Use lxml's HTML cleaner to remove markup
            #htmltree = lxml.html.fromstring(fullArticleText)
            #cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
            #cleaned_tree = cleaner.clean_html(htmltree)
            #return cleaned_tree.text_content()
            return fullArticleText

        except urllib2.HTTPError:
            print "ERROR: HTTPError. Article file download skipped: " + articleURL
            return None
        except urllib2.URLError:
            print "ERROR: URLError. Article file download skipped: " + articleURL
            return None
        except LookupError:
            print "ERROR: LookupError. Article file download skipped: " + articleURL
            return None
        except UnicodeDecodeError:
            print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
            return None
        except:
            print "ERROR: ", sys.exc_info()[0]
            return None
def process_item(self, html_page):
    try:
        publish_date = examine(html_page['html'])

        from boilerpipe.extract import Extractor
        extractor = Extractor(extractor='ArticleExtractor', html=html_page['html'])
        body = str(extractor.getText())
        title = str(extractor.source.getTitle())

        art = {
            'title': title,
            'body': body,
            'lang': self.lang,
            'source': html_page['source'],
            'url': html_page['url'],
            'crawl_date': html_page['timestamp'],
            'publish_date': publish_date,
            'article_id': sha1(html_page['url'].encode('utf-8')).hexdigest(),
            'sentences': []
        }

        if self.art_ok(art['body']):
            content = art['body']
            content = content.replace(u'\xa0', u' ')
            content = content.replace('\\n', '\n')

            sents = []
            if self.lang == 'en':
                sents = sent_tokenize(content)
            else:
                for para in content.split('\n'):
                    sents += sentence_split(para, self.lang)

            sents = [sent for sent in sents if self.check_sent(sent)]
            art['sentences'] = sents

            if len(sents) >= 3:
                self.output_corpus.add_instance(art)
    except Exception as e:
        pass
def get_text(url):
    from boilerpipe.extract import Extractor
    try:
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "", ""
def scrape(file, split1, split2, urlName):
    links_from_RSS_feed = []
    Requests_from_RSS = requests.get(
        'http://feeds.reuters.com/reuters/businessNews')
    Rss_soup = BeautifulSoup(Requests_from_RSS.text, "html5lib")

    lFile = open(file, "r")
    usedLinks = [line.strip() for line in lFile]
    lFile.close()

    for link in Rss_soup.find_all('guid'):
        links_from_RSS_feed.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))

    l_file = open(file, "w")
    for item in links_from_RSS_feed:
        l_file.write(str(item) + "\n")
    l_file.close()

    no_of_links = len(links_from_RSS_feed)
    for i in range(0, no_of_links):
        fileName = links_from_RSS_feed[i].rsplit('/', split1)[split2]
        extractedText = Extractor(extractor='ArticleExtractor', url=urlName + fileName)
        print(fileName)
        write_file = open("Data/" + str(i) + ".txt", "w")
        write_file.write(str(datetime.date.today()) + "\n")
        write_file.write(str(extractedText.getText().encode("utf-8")))
        write_file.close()
    return no_of_links
def extract_blog_posts(url_string, PAGES=48):
    blog_posts = []
    page_count = 0
    while (page_count <= PAGES):
        page_count += 1
        url = url_string.format(page_count)  # create url
        driver.get(url)

        try:
            article = driver.find_elements_by_tag_name('article')
            articles_size = len(article)
            print 'processing ', url
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise  # Not the error we are looking for
            continue

        for i in xrange(articles_size):
            headers = article[i].find_elements_by_tag_name("header")
            for header in headers:
                article_a = header.find_elements_by_xpath("//h1/a[@title]")
                print 'extracting ...'
                for e in article_a:
                    extractor = Extractor(extractor='ArticleExtractor', url=e.get_attribute('href'))
                    texts = extractor.getText()
                    blog_posts.append({'title': e.text,
                                       'content': clean_html(texts),
                                       'link': e.get_attribute('href')})
    return blog_posts
def scrap_link_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleSentencesExtractor', url=url)
        return extractor.getText()
    except:
        return False
def articles_from_feed():
    articles = []
    feed = feedparser.parse(rss_fakt)
    for item in feed["items"]:
        url = convert_url(item["link"])
        print item["published"]
        print url
        try:
            extractor = Extractor(extractor="ArticleExtractor", url=url)
            date = email.utils.parsedate_tz(item["published"])
            timestamp = email.utils.mktime_tz(date)
            iso = datetime.datetime.utcfromtimestamp(timestamp).isoformat()
            filename = url.split(",")[-1].split(".")[0]
            data = {
                "text": extractor.getText(),
                "date": iso,
                "url": url,
                "filename": filename
            }
        except Exception as e:
            print "Error downloading article from " + url
            # Skip this item so a failed download is not appended
            continue
        articles.append(data)
    return articles
def scrape(feed, used, excep, split1, split2, urlName, nameF):
    arrLinks = []
    req = requests.get('http://feeds.reuters.com/reuters/businessNews')
    soupRss = BeautifulSoup(req.text, "html5lib")

    # Check the list of already queried links
    logrFile = open(used, "r")
    usedLinks = [line.strip() for line in logrFile]
    logrFile.close()

    # Extract links from the initial feed, excluding non-news
    for link in soupRss.find_all('guid'):
        arrLinks.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))

    # Store currently extracted links so as not to repeat them
    log_file = open(used, "w")
    for item in arrLinks:
        log_file.write(str(item) + "\n")
    log_file.close()

    # Extract stripped news content with timestamp, omitting used links
    for item in arrLinks:
        fileName = str(item.rsplit('/', split1)[split2])
        if any(fileName in s for s in usedLinks):
            print fileName + " has been extracted."
        else:
            extractedText = Extractor(extractor='ArticleExtractor', url=urlName + fileName)
            print fileName + ": New"
            write_file = open("extractedFiles/" + nameF + fileName + ".txt", "w")
            write_file.write(str(datetime.date.today()) + "\n")
            write_file.write(str(extractedText.getText().encode("utf-8")))
            write_file.close()
def extract(args):
    if not os.path.isfile("articles.json"):
        print "File articles.json does not exist"
        print "Have you already crawled?"
        exit()

    with open("articles.json") as article_list:
        articles = [
            json.loads(line) for line in article_list.read().splitlines()
        ]

    for article in articles:
        if args.html:
            with open(article['path'], "rb") as html:
                extractor = Extractor(extractor='ArticleExtractor', html=html.read())
        else:
            extractor = Extractor(extractor='ArticleExtractor', url=article['url'])

        dirname = os.path.join("articles", article['domain']) + "/text"
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        filename = sha1(article['url']).hexdigest() + '.txt'
        path = os.path.join(dirname, filename)
        with open(path, "wb+") as extracted_text:
            extracted_text.write(extractor.getText().encode("utf-8"))
def extract_article(url):
    r = requests.get(url)

    # if the url exists, continue
    if r.status_code == 200:

        # extract and parse response url
        url = parse_url(r.url)

        # extract html
        html = r.content.decode('utf-8', errors='ignore')

        # run boilerpipe
        BP = Extractor(html=html)

        # run readability
        Rdb = Document(html)
        html = Rdb.summary()

        # return article data
        return {
            'extracted_title': Rdb.short_title().strip(),
            'extracted_content': strip_tags(BP.getText()),
        }

    # otherwise return an empty dict
    else:
        return {}
def extract_body_with_boilerpipe(html):
    """
    Extractor types:
        DefaultExtractor
        ArticleExtractor
        ArticleSentencesExtractor
        KeepEverythingExtractor
        KeepEverythingWithMinKWordsExtractor
        LargestContentExtractor
        NumWordsRulesExtractor
        CanolaExtractor

    Reference: https://github.com/misja/python-boilerpipe
    Note: set JAVA_HOME if import fails

    Returns
    -------
    str: extracted body text. Returns an empty string if extraction fails.
    """
    try:
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html)
        extracted_text = extractor.getText()
    except:
        print "Failed to extract text with boilerpipe"
        extracted_text = ""

    return extracted_text
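# A minimal, hypothetical usage sketch for the python-boilerpipe Extractor API
# listed in the docstring above (assumes python-boilerpipe is installed and
# JAVA_HOME is set; the sample HTML string is made up purely for illustration).
from boilerpipe.extract import Extractor

sample_html = "<html><body><p>Main article text.</p><div>nav | footer</div></body></html>"

# Any extractor name listed above can be passed to the same constructor.
for name in ('DefaultExtractor', 'ArticleExtractor', 'KeepEverythingExtractor'):
    extractor = Extractor(extractor=name, html=sample_html)
    print('{} -> {}'.format(name, extractor.getText()))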
def extract_and_save(url, path):
    try:
        handle = urllib2.urlopen(url)
        html_content = handle.read()
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        text = extractor.getText()
        if text:
            if detect_english(text):
                links = get_all_urls(html_content, url)
                for link in links:
                    try:
                        # Fetch each discovered link (not the seed url again)
                        handle = urllib2.urlopen(link)
                        html_content = handle.read()
                        #extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
                        #text_content = extractor.getText()
                        #if text_content:
                        #    if detect_english(text_content):
                        encoded_url = encode(link)
                        f = open(path + "/" + encoded_url, "w")
                        f.write(html_content)
                        f.close()
                    except:
                        print link
                        traceback.print_exc()
                        return None
    except:
        print url
        traceback.print_exc()
        return None
def run(self):
    count = 0
    docCount = self.doc_cursor.count()
    for doc in self.doc_cursor:
        url = doc['url']
        if (self.keepText(url)):
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=url)
                extracted_text = extractor.getText()
                if (len(extracted_text) > 0):
                    title = extractor.getTitle()
                    if title != None:
                        doc['title'] = title
                        doc['extracted_text'] = title + " " + extracted_text
                    else:
                        doc['extracted_text'] = extracted_text
                    self.db_collection.save(doc)
                    print 'OK -' + url
            except IOError, err:
                print "IOError with url " + url
                print str(err)
            except (LookupError):
                print "LookupError - Maybe not text or weird encoding " + url
            except (UnicodeDecodeError, UnicodeEncodeError):
                print "UnicodeDecodeError or UnicodeEncodeError- " + url
def extract_article_content(html, url):
    """
    Disclaimer
    ----------
    Copied from https://github.com/turi-code/how-to/blob/master/
    extract_article_content_from_HTML.py

    Description
    -----------
    Extract the primary textual content from an HTML news article.

    In many cases, the HTML source of news articles is littered with
    boilerplate text that you would not want to include when doing text
    analysis on the content of the page. Even if you could write some rules
    to extract the content from one page, it's unlikely that those rules
    would apply to an article from another site. The boilerpipe module
    allows us to solve this problem more generally.

    Parameters
    ----------
    html : str
        The source HTML from which to extract the content.
    url : str
        The url, needed for logging purposes only.

    Returns
    -------
    out : str
        The primary content of the page with all HTML and boilerplate text
        removed.

    Examples
    --------
    >>> extract_article_content(
            "<html><body><p>Turi is in the business of building the best "
            "machine learning platform on the planet. Our goal is to make "
            "it easy for data scientists to build intelligent, predictive "
            "applications quickly and at scale. Given the perplexing array "
            "of tools in this space, we often get asked 'Why Turi? What "
            "differentiates it from tools X, Y, and Z?' This blog post aims "
            "to provide some answers. I'll go into some technical details "
            "about the challenges of building a predictive application, and "
            "how Turi's ML platform can help.</p></body></html>")
    >>> Turi is in the business of building the best machine learning
        platform on the planet. Our goal is to make it easy for data
        scientists to build intelligent, predictive applications quickly and
        at scale. Given the perplexing array of tools in this space, we often
        get asked 'Why Turi? What differentiates it from tools X, Y, and Z?'
        This blog post aims to provide some answers. I'll go into some
        technical details about the challenges of building a predictive
        application, and how Turi's ML platform can help.

    See Also
    --------
    - `Boilerpipe project <https://code.google.com/p/boilerpipe/>`_
    - `Boilerpipe Python module <https://pypi.python.org/pypi/boilerpipe>`_
    """
    from boilerpipe.extract import Extractor
    if html and html.strip():
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return extractor.getText()
        except Exception as e:
            error = "Function extract_article_content: " + url + " - " + str(e)
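# A short, hypothetical usage sketch for extract_article_content above
# (assumes the requests library is available; the URL is only a placeholder).
import requests

response = requests.get("https://example.com/some-news-article")
content = extract_article_content(response.text, response.url)
if content:
    print(content[:200])
else:
    print("extraction failed or the page was empty")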
def process_text(self, text):
    if text == "":
        return text
    extractor = Extractor(extractor='ArticleExtractor', html=text)
    new_val = extractor.getText()
    return new_val
def scrape(feed, used, excep, split1, split2, urlName, nameF): arrLinks = [] req = requests.get('http://feeds.reuters.com/reuters/businessNews') soupRss = BeautifulSoup(req.text, "html5lib") logrFile = open(used,"r") usedLinks = [line.strip() for line in logrFile] logrFile.close() for link in soupRss.find_all('guid'): arrLinks.append(str(link.getText().replace('?feedType=RSS&feedName=businessNews', ''))) log_file = open(used,"w") for item in arrLinks: log_file.write(str(item)+"\n") log_file.close() for i in range(0, 8): fileName = arrLinks[i].rsplit('/', split1)[split2] #if any(fileName in s for s in usedLinks): # print fileName +" has been extracted." #else: extractedText = Extractor(extractor='ArticleExtractor', url=urlName+fileName) print fileName write_file = open("Data/"+str(i)+".txt","w") write_file.write(str(datetime.date.today()) + "\n") write_file.write(str(extractedText.getText().encode("utf-8"))) write_file.close()
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print "Exception"
        return None
def parse_page(self, response):
    if response.meta.has_key('crawldepth'):
        depth = response.meta['crawldepth']
    else:
        # Set search depth here
        depth = 1
    log.msg('Depth = %s' % str(depth), level=log.INFO)

    if not isinstance(response, HtmlResponse):
        log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
        return

    log.msg('Response from: %s' % response.url, level=log.INFO)
    url_bf.add(response.url)

    # TODO: Extract page title
    extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
    cleaned_text = extractor.getText()

    # Eliminate duplicates
    keywordset = set(keywordlist)

    found_list = []
    for keyword in keywordset:
        # TODO: Is there a more efficient way to do this?
        # Look at word boundaries to match entire words only
        if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
            found_list.append(keyword)

    # Parse this page
    item = BiffleItem()
    if (len(found_list) > 0):
        item['url'] = response.url
        item['body'] = cleaned_text
        item['keywords'] = ', '.join(found_list)
        item['process_date'] = datetime.today()
        log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
        self.map_keyword_count(found_list)
        yield item

    if (depth > 0):
        # Find the next requests and yield those
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a/@href').extract()
        log.msg('Links on page: %s' % len(links), level=log.INFO)
        depth -= 1
        log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
        for l in links:
            l = urlparse.urljoin(response.url, l)
            if (l in url_bf):
                pass
                #log.msg('Duplicate URL found: %s' % l, level=log.INFO)
            else:
                url_bf.add(l)
                #log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
                # Decrement depth for next layer of links
                #callback = lambda response, depth = depth: self.parse_page(response, depth)
                callback = lambda response: self.parse_page(response)
                request = Request(l, callback=callback)
                request.meta['crawldepth'] = depth
                yield request
def extract_article_text(url):
    if url in utils.BROKEN_URLS or any([True for sd in BAD_SUBDOMAINS if sd in url]):
        return ""

    while True:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            break
        except socket.timeout:
            print("got socket.timeout on url: {}. retrying...".format(url), file=utils.stddbg)
        except URLError as e:
            if e.reason == "timed out":
                print("got urllib 'timed out' on url {}. retrying...".format(url), file=utils.stddbg)
            elif hasattr(e.reason, "strerror") and e.reason.strerror == 'getaddrinfo failed':
                print("got urllib 'getaddrinfo failed' on url {}. retrying...".format(url), file=utils.stddbg)
            elif e.code == 503:
                print("got urllib 503 error on url {}. retrying...".format(url), file=utils.stddbg)
            else:
                if not hasattr(e, "url"):
                    e.url = url
                raise
        except Exception as e:
            e.url = url
            raise e

    text = str(unicodedata.normalize('NFKD', (str(extractor.getText()))).encode('ascii', 'ignore'))
    return filter_junk(text)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("raw_dir_path")
    parser.add_argument("out_file_path")
    args = parser.parse_args()

    f_names = [(int(f), f) for f in listdir(args.raw_dir_path)]
    f_names = sorted(f_names)
    fout = open(args.out_file_path, 'w')

    for int_f_name, f_name in f_names:
        trec_reader = TrecReader(join(args.raw_dir_path, f_name))
        empty_cnt = 0
        err_cnt = 0

        for docno, html_text in trec_reader:
            if not html_text:
                empty_cnt += 1
            try:
                extractor = Extractor(extractor='ArticleExtractor', html=html_text)
                text = extractor.getText()
                text = text.replace('\n', ' ').replace('\t', ' ')
                text = text.encode('ascii', 'ignore')
                text = text_clean(text)
                if text:
                    fout.write(docno + '\t' + text + '\n')
                else:
                    empty_cnt += 1
            except Exception as e:
                err_cnt += 1

    fout.close()
    print empty_cnt, err_cnt
def extractor(URL):
    extractor = Extractor(extractor='ArticleExtractor', url=URL)
    data = extractor.getText()

    file = open("data.txt", "w")
    file.write(data.encode('UTF-8'))
    file.close()

    # Split the content into sentences
    with open('data.txt', 'r') as f:
        s = f.read()
    sentences = s.split('.')

    # Empty word list
    w = []

    # Split the sentences into words
    for sentence in sentences:
        w.extend(sentence.split(' '))
    print w

    # Return the word list
    return w
def post_index(post):
    extractor = Extractor(extractor='ArticleExtractor', url=post['href'])
    post_text = extractor.getText().replace('\n', ' ')

    url = 'http://localhost:9200/bookmarks/bookmark/%s/_create' % post['hash']
    data = '{"title":"%s", "url":"%s", "text":"%s"}' % (
        post['description'], post['href'], post_text.replace('"', '\\"'))
    r = requests.put(url, data=data)
    print r.status_code
def Text_extractor(y, page, team, team_i, counter=0):
    """Extract the text of team pages using BoilerPipe."""
    try:
        upage = urllib.parse.quote_plus(page)
        url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
        extractor = Extractor(extractor='ArticleExtractor', url=url)
    except:
        counter += 1
        if counter > 10:
            print("Failed to get the text for page {}".format(page))
            return None
        # Retry, and return the retry's result so we never fall through
        # with an undefined extractor.
        return Text_extractor(y, page, team, team_i, counter=counter)

    f = open(
        'results/%s/%s/%s_-_-_CONTENT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()

    f = open(
        'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getText())
    f.close()

    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
def extract_metadata(url):
    extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
    text = extractor.getText().split("\n")

    author = None
    date = None
    keywords = []
    find_keywords = False

    for line in text:
        # author
        match = re.match("Von\s(\w+\s\w+)(,\s[\s\w]*$|$)", line)
        if match:
            author = match.group(1)
            continue
        # date
        match = re.match("([0-9]{2}\.[0-9]{2}\.[0-9]{4})$", line)
        if match:
            date = match.group(1)
            continue
        # keywords
        if find_keywords:
            match = re.match("Hat\sIhnen\sdieser\sArtikel\sgefallen.*", line)
            if match:
                find_keywords = False
                continue
            else:
                keywords.append(line)
        match = re.match("Schlagwörter zu diesem Artikel:", line)
        if match:
            find_keywords = True

    return author, date, keywords
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor', url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(), 'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,)  # boilerpipe seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://" + tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n")
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")
        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass
        return detag_html_file_bs(infile, outfile, id)
def ParserBoilerEverything(html_object):
    extractor = Extractor(extractor='DefaultExtractor', html=html_object)
    sents = extractor.getText()
    try:
        return sents
    except Exception as e:
        return
def parse_item(self, response):
    response_news = NewsItem()
    response_news['url'] = response.url
    response_news['html'] = Binary(zlib.compress(response.body, 9))
    extractor = Extractor(extractor='ArticleExtractor', html=response.body)
    response_news['content'] = extractor.getText()
    return response_news
def Process(DocIn, OutName):
    out = open(OutName, 'w')
    logging.info('reading [%s]', DocIn)
    ErrCnt = 0
    EmptyCnt = 0
    for cnt, line in enumerate(open(DocIn)):
        vCol = line.strip().split('\t')
        DocNo = vCol[0]
        RawHtml = ' '.join(vCol[1:])
        RawHtml = DiscardHTMLHeader(RawHtml)
        if "" == RawHtml:
            EmptyCnt += 1
            continue
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=RawHtml)
            text = extractor.getText()
            text = text.replace('\n', ' ').replace('\t', ' ')
            text = text.encode('ascii', 'ignore')
            text = TextClean(text)
            if "" != text:
                print >>out, DocNo + '\t' + text
            else:
                EmptyCnt += 1
            # print DocNo + '\t' + text.encode('ascii','ignore')
        except Exception as e:
            ErrCnt += 1
        if 0 == (cnt % 100):
            logging.info('parsed [%d] doc [%d] Err [%d] Empty', cnt, ErrCnt, EmptyCnt)
    out.close()
    logging.info('finished [%d] doc [%d] Err', cnt, ErrCnt)
def extract_main_text(self):
    if self.res is None:
        return None

    extractor = Extractor(
        # extractor='ArticleExtractor',
        url=self.url)
    return [extractor.getText()]
def parse(self, response):
    hxs = Selector(response)
    item = ArticleItem()
    item["title"] = hxs.xpath('//title/text()').extract()
    item["link"] = response.url
    item["source"] = hxs.xpath('//p').extract()

    extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
    source = extractor.getHTML()
    item["text"] = extractor.getText()
    item["html"] = source

    page = html.fromstring(source)
    links = page.xpath("//p//a/@href")
    linkPattern = re.compile("^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")

    for link in links:
        if linkPattern.match(link) and not link in self.crawled_links:
            self.crawled_links.append(link)
            yield Request(link, self.parse)

    yield item
def GOOGLE_get_data(company):
    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)
    content_list = list()

    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except:
            continue
        content = extractor.getText()

        now = datetime.datetime.now()
        content_list.append({"title": title,
                             "article": content,
                             "link": link,
                             "source": "GOOGLE",
                             "target": company,
                             "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                             "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})

    DBOperation.save_db(content_list)
def dehydrate(self, bundle):
    """GET Method"""
    #print bundle.data['content']
    if bundle.data['content']:
        extractor = Extractor(extractor='ArticleExtractor', html=bundle.data['content'])
        bundle.data['content'] = extractor.getText()

    try:
        article_stats = ArticleStat.objects.filter(article_id=bundle.obj.id)
        bundle.data['stat'] = {
            'reads': sum(map(lambda x: x.reads, article_stats)),
            'likes': sum(map(lambda x: x.likes, article_stats)),
            'dislikes': sum(map(lambda x: x.dislikes, article_stats)),
            'shares': sum(map(lambda x: x.shares, article_stats)),
        }
    except ObjectDoesNotExist:
        bundle.data['stat'] = {
            'reads': 0,
            'likes': 0,
            'dislikes': 0,
            'shares': 0,
        }

    # If there are no cookies or no sessionid field in the cookies, just send
    # the normal newsfeed to the anonymous user
    #if not bundle.request.COOKIES or not bundle.request.COOKIES['sessionid']:
    if not bundle.request.COOKIES or not 'sessionid' in bundle.request.COOKIES:
        return bundle

    try:
        # Even if there is a cookie, the sessionid field might not exist,
        # in which case this is also an anonymous user
        s = get_current_session(bundle.request.COOKIES['sessionid'])
        if s is None or 'user_id' not in s:
            return bundle

        # Get activity information: whether the user has already
        # read/liked/shared
        activity = Activities.objects.get(user_id=s['user_id'],
                                          article_id=bundle.obj.id)
        # Assign the information
        bundle.data['activity'] = {
            'read': activity.like or activity.share,
            'like': activity.like,
            'dislike': activity.dislike,
            'share': activity.share
        }
    except ObjectDoesNotExist:
        # Assign False if the news has never been opened
        bundle.data['activity'] = {
            'read': False,
            'like': False,
            'dislike': False,
            'share': False
        }

    return bundle
def get_text_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText()
    except BaseException as error:
        extracted_text = 'error: {}'.format(error)
        print('error: {}'.format(error))
    return extracted_text
def sentences_from_urls(url: str, extractor_name=EXTRACTORS[0], model=MODELS[0],
                        min_words=0, with_proba=False, return_raw=False):
    extractor = Extractor(extractor=extractor_name)
    model = models[model]
    extracted_text = extractor.getTextBlocks(url=url)
    if len(extracted_text) > 0:
        func = model.predict_proba if with_proba else model.predict
        return func(extracted_text, min_words=min_words, return_raw=return_raw)
def html2text_bp(html):
    text = None
    try:
        extractor = Extractor(extractor=extractor_type, html=html)
        text = extractor.getText()
    except:
        traceback.print_exc()
    return text
def extract_text(html_content):
    try:
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        #print extractor.getText()
        return extractor.getText()
    except:
        print "Exception in html extraction"
        return None
def extract_html(html_text, parser):
    try:
        extractor = Extractor(extractor=parser, html=html_text)
    except Exception as e:
        return None, None
    title = extractor.source.getTitle()
    body_text = extractor.getText()
    return title, body_text
def saveHtml(url, page):
    extractor = Extractor(extractor='ArticleExtractor', html=page)
    processed_plaintext = extractor.getText()
    # print processed_plaintext
    fileName = "./doc/" + (url + ".txt").replace("/", "()")
    f = open(fileName, "w")
    f.write(processed_plaintext)
    f.close()
def ParserBoilerDefault(html_object):
    extractor = Extractor(extractor='DefaultExtractor', html=html_object)
    sents = extractor.getText()
    try:
        sents = list(nlp(sents).sents)
        return sents
    except Exception as e:
        return
def boilerpipe_text(cls, url_in=None, html_in=None, extractor='ArticleExtractor'):
    assert (url_in != None) != (html_in != None)  # one, not both
    inp = url_in or html_in
    if url_in:
        extractor = Extractor(extractor=extractor, url=inp)
    else:
        extractor = Extractor(extractor=extractor, html=inp)
    return HtmlTextCleaner().spec_text_cleaner(extractor.getText())
def getBoilerPlate(url):
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText().replace('\n', '')
        return '', extracted_text
    except Exception, e:
        return '', ''
def ParserBoilerArticle(html_object):
    extractor = Extractor(extractor='ArticleSentencesExtractor', html=html_object)
    sents = extractor.getText()
    try:
        sents = list(nlp(sents).sents)
        return sents
    except Exception as e:
        return
def load_from_html(filename, use_boilerpipe=True, use_nltk=True, use_regex=True, binary=False):
    if binary:
        charset = UnicodeDammit(open(filename, 'rb').read())
        charset = charset.original_encoding
        try:
            content = open(filename, 'r', encoding=charset).read()
        except Exception as e:
            # if there is an error, return empty results
            logging.warn('encode error: {}, {}'.format(filename, e))
            return {'title': [], 'body': []}
    else:
        content = open(filename, 'r', encoding='utf-8').read()

    start = time.time()
    if not use_regex or not use_boilerpipe:
        bs = BeautifulSoup(content, 'html.parser')
    if use_regex:
        match = re.search(r'<title.*?>(.+?)</title>', content[:5000], re.DOTALL | re.IGNORECASE)
        title = match.group(1) if match else ''
        title = html.unescape(title).strip()
    else:
        if bs.title != None and bs.title.string != None:
            title = bs.title.string.strip()
        else:
            title = ''
    t1 = time.time() - start

    start = time.time()
    if use_boilerpipe:
        extractor = Extractor(extractor='ArticleExtractor', html=content)  # time consuming
        body = extractor.getText()
    else:
        body = bs.select('body')
        if len(body) <= 0:
            body = bs
        else:
            body = body[0]
        # remove all useless tags
        [x.extract() for x in body.findAll('script')]
        [x.extract() for x in body.findAll('style')]
        [x.extract() for x in body.findAll('meta')]
        [x.extract() for x in body.findAll('link')]
        body = body.text
    t2 = time.time() - start

    start = time.time()
    result = {
        'title': my_word_tokenize(title) if use_nltk else clean_text(title).split(' '),
        'body': my_word_tokenize(body) if use_nltk else clean_text(body).split(' '),
    }
    t3 = time.time() - start
    #print('{}\t{}\t{}'.format(t1, t2, t3))
    return result
def parse_item(self, response):
    title = response.css('title::text').extract_first()
    extractor = Extractor(extractor='ArticleExtractor', html=response.body)
    yield Article(title=title, text=extractor.getText(), url=response.url, field=self.name)
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
    except Exception:
        logger.error('Error extracting article html')
        text_string = ''
    return text_string
def get_news_by_url(url): print "Come to get_news_by_url" article = {} try: soup = BeautifulSoup(urllib2.urlopen(url)) "Get the title of News" title = "" titleElements = soup.findAll(id="disqus_title") for ele in titleElements: title = ele.getText().encode('utf-8') article["title"] = title print title "Get the posttime of News,Timezone ET" postTime = "" postTimeElements = soup.findAll(attrs={'class': "datestamp"}) for ele in postTimeElements: timeStamp = float(ele["epoch"]) postTime = datetime.fromtimestamp(timeStamp / 1000) article["post_time"] = postTime "Initiate the post date" postDay = postTime.date() article["post_date"] = postDay "Get the author information " author = "" authorElements = soup.findAll(attrs={'class': "byline"}) for ele in authorElements: author = ele.contents[0].strip().replace("By", "").replace( "-", "").replace("and", ",").strip() article["author"] = author "Get the content of article" extractor = Extractor(extractor='ArticleExtractor', url=url) content = extractor.getText().encode("utf-8") article["content"] = content "Initiate the Sources" source = "Bloomberg News" article["source"] = source "Initiate the update_time" updateTime = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S") article["update_time"] = updateTime "Initiate the embers_id" embersId = hashlib.sha1(content).hexdigest() article["embers_id"] = embersId "settup URL" article["url"] = url except: print "Error: %s" % sys.exc_info()[0] article = {} finally: return article
def get_news_by_url(url): print "Come to get_news_by_url" article = {} try: soup = BeautifulSoup(urllib2.urlopen(url)) "Get the title of News" title = "" titleElements = soup.findAll(id="disqus_title") for ele in titleElements: title = ele.getText().encode('utf-8') article["title"] = title print title "Get the posttime of News,Timezone ET" postTime = "" postTimeElements = soup.findAll(attrs={'class':"datestamp"}) for ele in postTimeElements: timeStamp = float(ele["epoch"]) postTime = datetime.fromtimestamp(timeStamp/1000) article["post_time"] = postTime "Initiate the post date" postDay = postTime.date() article["post_date"] = postDay; "Get the author information " author = "" authorElements = soup.findAll(attrs={'class':"byline"}) for ele in authorElements: author = ele.contents[0].strip().replace("By","").replace("-","").replace("and", ",").strip(); article["author"] = author "Get the content of article" extractor=Extractor(extractor='ArticleExtractor',url=url) content = extractor.getText().encode("utf-8") article["content"] = content "Initiate the Sources" source = "Bloomberg News" article["source"] = source "Initiate the update_time" updateTime = datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S") article["update_time"] = updateTime "Initiate the embers_id" embersId = hashlib.sha1(content).hexdigest() article["embers_id"] = embersId "settup URL" article["url"] = url except: print "Error: %s" %sys.exc_info()[0] article = {} finally: return article
def html_to_text(html):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html)
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s',
                         e.__class__.__name__, e)
        return ''

    text = extractor.getText()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return text
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
        text_string = unicodedata.normalize('NFKD', text_string).encode('ascii', 'ignore')
    except Exception:
        print 'Error extracting article html'
        text_string = ''
    return text_string
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()

    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''

    n.assert_greater(len(extracted_text), min_str_length)
def extract(self, article):
    try:
        extractor = Extractor(extractor='ArticleSentencesExtractor', url=article.url)
    except Exception as e:
        return ''

    article_text = ''
    try:
        article_text = extractor.getText()
    except Exception:
        pass

    return article_text.encode('utf-8')
def main():
    contents = sys.argv[1]
    for url in listdir(contents):
        print url
        with codecs.open(url, "w", encoding="utf-8") as out:
            try:
                html = urlopen(url.replace("{", "/")).read()
                extracted = Extractor(html=html)
                out.write(extracted.getText())
            except HTTPError:
                out.write("")
def fetch_articles(self):
    greq_gen = (grequests.get(u, headers=self.header) for u in self.urls)
    responses = grequests.map(greq_gen)
    for i, res in enumerate(responses):
        if res is not None:
            extractor = Extractor(html=res.text)
            self.entries[i]['text'] = extractor.getText()
            if '...' in self.entries[i]['title']:
                self.entries[i]['title'] = extractor.getTitle()
    return True
def boiler():
    from boilerpipe.extract import Extractor
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        output_filename = 'boilerpipe/' + str(i) + '.txt'

        input_file = open(input_filename, 'r')
        s = input_file.read()
        input_file.close()

        extractor = Extractor(extractor='ArticleExtractor', html=s.decode('GBK', 'ignore'))

        output_file = open(output_filename, 'wb')
        output_file.write(extractor.getText().encode('utf-8'))
        output_file.close()
def fetch_info(self):
    """
    Boilerpipe's main-text extraction rarely pulls in extraneous content,
    but it quite often returns less than the full original text.
    """
    urls = self.get_urls()
    got_infos = []
    for url in urls:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        text = extractor.getText()
        content = requests.get(url).content
        got_infos.append([url, text, content])
    return got_infos
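# Hypothetical follow-up sketch for the caveat noted in the docstring above:
# if the extracted text looks too short, fall back to a greedier extractor.
# The helper name and min_chars threshold are illustrative assumptions, not
# part of the original project.
from boilerpipe.extract import Extractor

def text_with_fallback(url, min_chars=500):
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    text = extractor.getText()
    if len(text) >= min_chars:
        return text
    # ArticleExtractor under-extracted; retry with KeepEverythingExtractor.
    fallback = Extractor(extractor='KeepEverythingExtractor', url=url)
    return fallback.getText()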