Example #1
def test_extract(foo_file):
    r = extract(foo_file)
    u = [node.tag for node in r]
    assert u == [
        'article',
        'body',
    ]
Example #2
def test_extract(test_file):
    r = extract(test_file)
    u = [node.tag for node in r]
    assert u == [
        'article',
        'body',
    ]
Example #3
def test_extract_tabular(foo_file):
    r = list(extract(foo_file))
    u = [node.tag for node in r]
    assert u == [
        'article',
        'body',
    ]
    for node in r[0]:
        assert node.tag == 'div'
Example #4
from requests import get
from libextract.api import extract


def scrape(url):
    r = get(url)
    text = list(extract(r.content))

    # Join the text content of every extracted node into one string.
    concat = ''
    for n in text:
        concat += n.text_content().encode('utf-8')

    return concat
Example #5
def test_extract_tabular(foo_file):
    r = list(extract(foo_file))
    u = [node.tag for node in r]
    assert u == [
        'article',
        'body',
    ]
    for node in r[0]:
        assert node.tag == 'div'
Example #6
    def extract_text(self):
        """Parse fulltext, do keyword extraction using the newspaper lib
        => newspaper.readthedocs.io
        """
        libextract_nodes = list(extract(self.html.encode("utf-8")))
        self.fulltext = libextract_nodes[0].text_content()

        entities = EntityExtractor(self.fulltext)
        entities.get_scored_entities()  # Averaged Perceptron Tagger
        self.keywords = entities.get_keywords()  # Above median?
        self.names = entities.get_names()  # Filter top
Example #7
import difflib

import requests
from lxml import html
from urlparse import urlparse, urljoin

from libextract.api import extract


def noise_extractor(url, base_urls):
    content = requests.get(url).content
    body = html.fromstring(content)
    links = body.xpath("//a/@href")
    # Absolute links that point at one of the known base domains.
    full_links = [link for link in links
                  if urlparse(link).netloc in base_urls
                  and urlparse(link).scheme in ['http', 'https']]
    cur_parse = urlparse(url)
    cur_base = cur_parse.scheme + '://' + cur_parse.netloc
    # Relative links on the same site, resolved against the current page's base.
    internal_links = [urljoin(cur_base, link) for link in links
                      if urlparse(link).netloc == ''
                      and not link.startswith('#')
                      and urlparse(link).scheme == '']
    link_to_explore = full_links + internal_links

    # Pull the top 5 text nodes from a few neighbouring pages plus the current
    # page; text blocks that repeat almost verbatim across pages are treated
    # as boilerplate noise.
    sample_contents = [requests.get(url).content for url in link_to_explore[:4]] + [content]
    textnodes = [t for content in sample_contents for t in list(extract(content, count=5))]
    noise = set()
    for i in xrange(len(textnodes)):
        t1 = textnodes[i].text_content()
        for j in xrange(i):
            t2 = textnodes[j].text_content()
            seq = difflib.SequenceMatcher(None, t1, t2)
            if seq.ratio() > 0.9:
                noise.add(t1)
                noise.add(t2)
    return noise
Example #8
from requests import get
from libextract.api import extract

r = get('http://en.wikipedia.org/wiki/Information_extraction')
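# extract() yields the page's main content nodes as lxml elements (most text-dense first).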
textnodes = list(extract(r.content))

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import urllib

from IO import IO
# Document is assumed here to be readability-lxml's Document class.
from readability import Document


class libextract:
    def __init__(self):
        pass

    def predict(self, url):
        print 'Predicting %s' % url
        d = {}
        d['Body'] = ''
        d['Title'] = ''
        try:
            html = urllib.urlopen(url).read()
            d['Body'] = Document(html).summary()
            d['Title'] = Document(html).short_title()
        except:
            print 'Exception %s' % url
            return d
        if d['Title'] is None:
            d['Title'] = ''
Example #9
def extract_visable_text(html_path):
    textnodes = list(extract(html_path))
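    # Python 2: encode each node's text to UTF-8 before joining into one string.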
    text_str = unicode(''.join((e.text_content().encode('utf-8')) for e in textnodes))
    print text_str
    return text_str
Example #10
	print 'Boilerpipe...'
	try:			
		article = Extractor(url=eachurl)
		title = '_'
		#title = article.getTitle()
		content = article.getHTML()
	except:
		print 'Failed URL %s' % eachurl
		content = '_'
		title = '_'
	body_score[-1].append(fscore(word_tokenize(content), data))
	title_score[-1].append(fscore(word_tokenize(title), title_true))
	######################################################################################
	print 'libextract...'
	html = urllib.urlopen(eachurl).read()
	textnodes = list(extract(html))
	try:
		content = ' '.join(each.text_content() for each in textnodes[:5])
	except:
		print 'Not combining unicode %s' % eachurl
		content = '_'
	title = '_'
	body_score[-1].append(fscore(word_tokenize(content), data))
	title_score[-1].append(fscore(word_tokenize(title), title_true))
	#####################################################################################
	print 'NewsExtractor ....'
	NW.predict(eachurl)
	title = NW.title
	content = NW.content
	if fscore(word_tokenize(title), title_true) < 0.7:
		print 'OOOPS.......'
Example #11
def extract_all_text(html):
    textnodes = list(extract(html))
    text_str = unicode(''.join((e.text_content().encode('utf-8')) for e in textnodes))
    return text_str
Example #12
 print 'Boilerpipe...'
 try:
     article = Extractor(url=eachurl)
     title = '_'
     #title = article.getTitle()
     content = article.getHTML()
 except:
     print 'Failed URL %s' % eachurl
     content = '_'
     title = '_'
 body_score[-1].append(fscore(word_tokenize(content), data))
 title_score[-1].append(fscore(word_tokenize(title), title_true))
 ######################################################################################
 print 'libextract...'
 html = urllib.urlopen(eachurl).read()
 textnodes = list(extract(html))
 try:
     content = ' '.join(each.text_content() for each in textnodes[:5])
 except:
     print 'Not combining unicode %s' % eachurl
     content = '_'
 title = '_'
 body_score[-1].append(fscore(word_tokenize(content), data))
 title_score[-1].append(fscore(word_tokenize(title), title_true))
 #####################################################################################
 print 'NewsExtractor ....'
 NW.predict(eachurl)
 title = NW.title
 content = NW.content
 if fscore(word_tokenize(title), title_true) < 0.7:
     print 'OOOPS.......'
Example #14
def print_nodes(html):
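    # Extract the page's text nodes, join their text as UTF-8, and print it (Python 2).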
    textnodes = list(extract(html))
    text_str = ''.join((e.text_content().encode('utf-8')) for e in textnodes)
    print text_str