# Incremental RSS crawl for one feed. Third-party imports for this snippet;
# the project-local helpers (loadLastStamp, extractPubTime, extractGuid,
# extractLink, Article, crawlContent, saveNewArticles, saveLastStamp) are
# defined elsewhere in the repo.
from urllib2 import urlopen
from datetime import datetime
import pytz
from bs4 import BeautifulSoup


def crawlFeed(source, feedName, feedUrl):
    # Given a feed name from a Source and the URL of its RSS feed, fetch the
    # new articles with their basic fields: title, URL, publish time.
    startStamp = loadLastStamp(feedName)
    html = urlopen(feedUrl).read()
    epoch = datetime(1970, 1, 1).replace(tzinfo=pytz.utc)
    soup = BeautifulSoup(html, "html.parser")
    latestStamp = startStamp
    newArticles = []
    for it in soup.find_all("item"):
        dt = extractPubTime(it)
        guid = extractGuid(it, source)
        # Convert the aware datetime to a Unix timestamp; Python 2 has no
        # datetime.timestamp(), hence the manual epoch subtraction.
        timestamp = (dt - epoch).total_seconds()
        if timestamp > startStamp:  # new article
            latestStamp = max(timestamp, latestStamp)
            url = extractLink(it)
            newArticles.append(Article(guid, it.title.text, url, timestamp, source, feedName, "", ""))
        else:
            break  # done; this assumes items are ordered by descending pubDate
    newArticles = crawlContent(newArticles)  # crawls for content, image and possible keywords
    saveNewArticles(newArticles)  # save to database
    print feedName, " => +" + str(len(newArticles))
    saveLastStamp(feedName, latestStamp)  # persist so already-seen articles are not reloaded
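# Neither the stamp helpers nor extractPubTime appear in this excerpt. Below is
# a minimal sketch of what they might look like: the `lastStamps` collection
# name and the <pubDate> tag access are assumptions, not the repo's actual schema.
from dbco import *  # provides the shared `db` handle, as in the other scripts
from email.utils import parsedate_tz, mktime_tz


def loadLastStamp(feedName):
    # Newest pubDate timestamp seen for this feed; 0 on the first crawl.
    doc = db.lastStamps.find_one({'feed': feedName})  # assumed collection name
    return doc['stamp'] if doc else 0


def saveLastStamp(feedName, stamp):
    # Upsert so the next crawlFeed(...) call skips already-seen items.
    db.lastStamps.update({'feed': feedName}, {'$set': {'stamp': stamp}}, upsert=True)


def extractPubTime(item):
    # Parse the RFC 822 <pubDate> of an RSS <item> into an aware UTC datetime
    # (html.parser lowercases tag names, hence item.pubdate).
    ts = mktime_tz(parsedate_tz(item.pubdate.text))
    return datetime.utcfromtimestamp(ts).replace(tzinfo=pytz.utc)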
# Debug script: re-crawl one article and write its fresh content next to the
# stored content so the two can be compared.
from dbco import *
from article import *
from crawlContent import *
import sys

# One-off backfill, kept here for reference:
# articles = db.qdoc.find({'content': ''})
# for a in articles:
#     art = [Article('blabla', a['title'], a['url'], 1431243710, a['source'], "cnn_world")]
#     art = crawlContent(art)
#     new_cont = art[0].content
#     db.qdoc.update({'guid': a['guid']}, {'$set': {'content': new_cont}}, multi=True)
# print "Done"

source = sys.argv[1]
ski = int(sys.argv[2])
# Note: the second find() immediately overwrites the first, so the `source`
# argument is effectively unused while the keyword query below is active.
a = db.qdoc.find({'$query': {'source': source}, '$orderby': {'timestamp': -1}}).limit(1).skip(ski)
a = db.qdoc.find({'$query': {'keywords': 'print'}, '$orderby': {'timestamp': -1}}).limit(1).skip(ski)
a = a[0]
art = [Article(a['_id'], a['title'], a['url'], a['timestamp'], a['source'], a['feed'])]
art = crawlContent(art)
with open("new_content.txt", "w") as f:
    f.write(art[0].content)
with open("old_content.txt", "w") as f:
    f.write(a['content'].encode('utf-8'))
print "Done"
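# Usage sketch (the script file name is hypothetical; argv is <source> <skip>):
#   python compare_content.py cnn 0
#   diff old_content.txt new_content.txt    # inspect what the re-crawl changed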
# Backfill script: re-crawl every article whose stored content is empty and
# write the freshly extracted content back to the database.
from dbco import *
from crawlContent import *
from article import *

print db.qdoc.find({'content': ''}).count()  # how many articles still lack content
art = db.qdoc.find({'content': ''})
for ar in art:
    # Only the URL matters to crawlContent; the other Article fields are dummies.
    a = [Article('blabla', "blabla2", ar['url'], 1431243710, 'cnn', "cnnyolo")]
    a = crawlContent(a)
    db.qdoc.update({'url': ar['url']}, {'$set': {'content': a[0].content}})
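# Once the loop finishes, re-running the count query above should report (close
# to) zero; any documents still empty are pages crawlContent failed to extract.
print db.qdoc.find({'content': ''}).count()  # verify the backfill worked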