Example #1
    def fetch(self) -> None:
        try:
            r = requests.get(self.url, headers=self.headers)
        except requests.exceptions.ConnectionError:
            # the builtin ConnectionError would not catch requests' connection failures
            self.is_invalid = True
            return

        r.encoding = "utf-8"
        page = MetadataParser(html=r.text)

        self.title = page.get_metadata("title")
        self.description = page.get_metadata("description")
        self.image = page.get_metadata("image")
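
The fetch() method above is an excerpt and omits its imports and surrounding class. Independent of that context, the MetadataParser(html=...) pattern it relies on can be exercised against an in-memory document; a minimal sketch (the sample HTML and field values are invented for illustration):

from metadata_parser import MetadataParser

html = """<html><head>
<title>Example page</title>
<meta property="og:title" content="Example OG title">
<meta property="og:image" content="http://example.com/cover.png">
<meta name="description" content="A short description">
</head><body></body></html>"""

page = MetadataParser(html=html)
# each call returns the first value found across the parser's lookup strategies
print(page.get_metadata("title"))
print(page.get_metadata("description"))
print(page.get_metadata("image"))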
Example #2
    def get_or_create_ressource(self, url):
        try:
            # reuse the existing Ressource if one is already stored for this url
            ressource = self.get(url=url)
        except Ressource.DoesNotExist:
            ressource = Ressource(url=url)

            # md_strategy is defined but never used; see the sketch below this example
            md_strategy = ['og', 'dc', 'page', 'meta']
            md = MetadataParser(url=url)

            ressource.title = md.get_metadata('title')
            ressource.excerpt = md.get_metadata('description')
            ressource.image = md.get_metadata('image')

        ressource.save()

        return ressource
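
If the md_strategy list was meant to control where get_or_create_ressource looks for metadata, metadata_parser exposes a strategy argument for that purpose; a minimal sketch, assuming the installed version of the library accepts it:

from metadata_parser import MetadataParser

md_strategy = ['og', 'dc', 'page', 'meta']
# assumed signature: get_metadata(field, strategy=...) consults the listed
# sources (OpenGraph, Dublin Core, page title, <meta> tags) in order
md = MetadataParser(url='https://example.com/article')  # placeholder URL
title = md.get_metadata('title', strategy=md_strategy)
excerpt = md.get_metadata('description', strategy=md_strategy)
image = md.get_metadata('image', strategy=md_strategy)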
Example #3
    def url_matcher(self, msg, match):
        url = match.group(0)
        r = requests.head(url)
        max_size = self.config['DOC_MAX_SIZE']
        max_len = self.config['DOC_MAX_LEN']

        # files that are too big cause trouble. Let's just ignore them.
        if 'content-length' in r.headers and \
           int(r.headers['content-length']) > max_size:
            return

        # ignore anything that is not allowed in configuration
        allowed_content_types = self.config['ALLOWED_CONTENT_TYPES']
        content_type = ''
        if 'content-type' in r.headers:
            content_type = re.sub(r'\s*\;.*$', '', r.headers['content-type'])
            content_type = content_type.strip()

        if content_type not in allowed_content_types:
            return

        html = requests.get(url).text
        readable_article = Document(html).summary()
        readable_article = self.text_cleanup(readable_article)

        if len(readable_article) > max_len:
            readable_article = readable_article[:max_len] + '...'

        readable_title = Document(html).title()

        page = MetadataParser(html=html)
        readable_description = page.get_metadata('description')

        if readable_description is None:
            readable_description = ''

        readable_description = self.text_cleanup(readable_description)

        description = ''
        if len(readable_description) > len(readable_article):
            description = readable_description
        else:
            description = readable_article

        if description:
            return "~> {}\n~> {}\n~> {}".format(url, readable_title, description)
        else:
            return "~> {}\n~> {}".format(url, readable_title)
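
Both the article text and the description above are passed through self.text_cleanup(), a helper the excerpt does not include. A hypothetical implementation, assuming its job is simply to strip leftover markup and collapse whitespace:

import re

def text_cleanup(text):
    # drop any HTML tags left over from Document().summary()
    text = re.sub(r'<[^>]+>', ' ', text)
    # collapse runs of whitespace (including newlines) into single spaces
    return re.sub(r'\s+', ' ', text).strip()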
Example #5
from metadata_parser import MetadataParser
import json
import os
import urllib.parse
import urllib.request
import uuid
import requests
# fulltext (newspaper3k), summarize and getimages are helpers imported elsewhere in the project

# use lxml >= 2.3.5 (a 3.x release is preferred);
# otherwise this site will break: http://www.nasa.gov/externalflash/discovery/index.html

if 0:
    a = MetadataParser(url='http://cnn.com')
    print(a.get_metadata('title'))

    b = MetadataParser(url='http://nyt.com')
    print(b.get_metadata('title'))

    c = MetadataParser(url='http://thedailybeast.com')
    print(c.get_metadata('title'))

    print("\n-------------------------------------------------------\n")
    print(a.metadata)
    print("\n-------------------------------------------------------\n")
    print(b.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.metadata)
    print("\n-------------------------------------------------------\n")

    print(c.get_metadata('title'))
    print(c.get_metadata('canonical'))
    print(c.get_metadata('url'))
    print(c.absolute_url(c.get_metadata('canonical')))
    print(c.absolute_url(c.get_metadata('url')))
    print(c.get_discrete_url())
def parsearticle(article, pathuuid):
    mainimage = {}
    images = []
    req = requests.get(
        "http://" + os.getenv("RENDER_HOST") + ":3000/render/" +
        urllib.parse.quote_plus(json.loads(article.decode('utf-8'))["link"]))
    print("http://" + os.getenv("RENDER_HOST") + ":3000/render/" +
          urllib.parse.quote_plus(json.loads(article.decode('utf-8'))["link"]))
    articletext = MetadataParser(html=json.loads(req.text)['html'])
    imgurl = str(articletext.get_metadata('image'))
    if not imgurl.startswith("http"):
        imgurl = 'http:' + imgurl
    imgurlnopost = imgurl.rsplit('?', 1)[0]
    imgname = imgurlnopost.rsplit('/', 1)[-1]
    imgpath = pathuuid + '/' + imgname + str(uuid.uuid4())
    publication = json.loads(article.decode('utf-8'))["publication"]
    category = json.loads(article.decode('utf-8'))["category"]
    title = json.loads(article.decode('utf-8'))["title"]
    articleurl = json.loads(article.decode('utf-8'))["link"]
    geturl = None
    os.mkdir(pathuuid)
    count = 0
    try:
        geturl = urllib.request.urlretrieve(imgurl, imgpath)
    except Exception:
        pass
    while not geturl:
        req = requests.get("http://" + os.getenv("RENDER_HOST") +
                           ":3000/render/" + urllib.parse.quote_plus(
                               json.loads(article.decode('utf-8'))["link"]))
        articletext = MetadataParser(html=json.loads(req.text)['html'])
        imgurl = str(articletext.get_metadata('image'))
        imgurlnopost = imgurl.rsplit('?', 1)[0]
        imgname = imgurlnopost.rsplit('/', 1)[-1]
        try:
            geturl = urllib.request.urlretrieve(imgurl, imgpath)
        except Exception:
            # count each failed attempt so a broken article cannot retry forever
            count += 1
            if count > 10:
                raise ValueError('Article failed too many times')
    mainimage['imgurl'] = imgurl
    mainimage['imgname'] = imgname
    mainimage['imgpath'] = imgpath
    mainimage['content_type'] = geturl[1]['Content-Type']
    images.append(mainimage)
    images1 = getimages(
        json.loads(req.text)['html'],
        json.loads(req.text)['tree']['frameTree']['resources'], images,
        pathuuid)
    try:
        articletext = fulltext(json.loads(req.text)['html'], language='en')
    except Exception:
        articletext = ""
    thing = {}
    thing['title'] = json.loads(article.decode('utf-8'))["title"]
    thing['articletext'] = articletext
    thing['summary'] = summarize(articletext)
    thing['assets'] = images1
    thing['publication'] = publication
    thing['category'] = category
    thing['articleurl'] = articleurl
    thing['html'] = json.loads(req.text)['html']

    return thing
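
parsearticle() patches protocol-relative image URLs by prepending "http:". When MetadataParser also knows the page URL, its absolute_url() helper (used in the demo block earlier in this example) can resolve relative values against it instead; a minimal sketch, assuming that passing url alongside html only supplies the base URL and does not trigger a fetch:

from metadata_parser import MetadataParser

html = '<html><head><meta property="og:image" content="/images/cover.jpg"></head></html>'
page = MetadataParser(url='https://example.com/news/story.html', html=html)  # placeholder URL
image = page.get_metadata('image')
if image:
    # join the relative og:image value with the page URL
    image = page.absolute_url(image)
print(image)  # expected: https://example.com/images/cover.jpg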
Example #7
from metadata_parser import MetadataParser

if 0:
    a = MetadataParser(url='http://cnn.com')
    print(a.get_metadata('title'))

    b = MetadataParser(url='http://nyt.com')
    print(b.get_metadata('title'))

    c = MetadataParser(url='http://thedailybeast.com')
    print(c.get_metadata('title'))

    print("\n-------------------------------------------------------\n")
    print(a.metadata)
    print("\n-------------------------------------------------------\n")
    print(b.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.metadata)
    print("\n-------------------------------------------------------\n")

    print(c.get_metadata('title'))
    print(c.get_metadata('canonical'))
    print(c.get_metadata('url'))
    print(c.absolute_url(c.get_metadata('canonical')))
    print(c.absolute_url(c.get_metadata('url')))
    print(c.get_discrete_url())

if 0:
    a = MetadataParser(url='http://liqr.co/rsvpnewyork')
    print("title:")
    print(a.get_metadata('title'))
Example #8
	  "group by url order by count(*) desc;"

cur.execute(sql)
urls = cur.fetchall()

i = 0
for url in urls:
    i = i + 1
    url = remove_characters(url[0])
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
            'From': '*****@*****.**'  # This is another valid field
        }
        page = MetadataParser(url=url, requests_timeout=5, url_headers=headers)
        title = remove_characters(page.get_metadata('title'))
        url_resolved = remove_characters(page.get_metadata('url'))
        image = remove_characters(page.get_metadata('image'))
        description = remove_characters(page.get_metadata('description'))

        # note: building SQL by string concatenation relies entirely on remove_characters();
        # a parameterized query would be safer
        sql = "insert into url_meta (title, description, url, url_md5, image) " \
              "values ('" + title + "', '" + description + "', '" + url_resolved + "', md5('" + url + "'), '" + image + "');"
    except Exception as e:
        e = remove_characters(str(e))
        sql = "insert into url_meta (title, description, url, url_md5, image) " \
              "values ('error', '" + e + "', '" + url + "', md5('" + url + "'), '');"
    finally:
        cur.execute(sql)
        cur.execute("commit;")
        if i % 100 == 0:
            print(i)
Example #9
from metadata_parser import MetadataParser
from opengraph import OpenGraph
import webpreview

url = 'https://health.usnews.com/wellness/health-buzz/articles/2018-01-05/smelling-your-partners-shirt-could-decrease-your-stress-levels-study-says'
page = MetadataParser(url=url)
print(page.metadata)
print(page.get_metadata('title'))

og = OpenGraph(url=url)
print(og)

wb = webpreview.OpenGraph(url, ['og:title', 'og:description'])
print(wb.title)
print(wb.description)
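
The same MetadataParser instance can also report the page's preferred URL. get_discrete_url(), which the earlier demo blocks call, returns the canonical / og:url value resolved to an absolute URL; a minimal sketch reusing the page object above:

# the preferred URL for the page, derived from its canonical / og:url metadata
print(page.get_discrete_url())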