import random
import re
import sys
import traceback
import urllib2
import urlparse
from datetime import datetime

# project-local modules (defined elsewhere in this codebase)
import ArticleDB
import Byline
import DB
import Journo
import Misc
import Publication
import ukmedia


def scrape_articles(found, extract, opts):
    """Scrape a list of articles, returning error counts.

    found -- list of article contexts to scrape
    extract -- extract function
    opts -- options object; attributes used here include:
        max_errors -- number of errors tolerated before bailing out
        test -- dry run: roll back DB changes rather than committing
        force_rescrape -- rescrape articles we already have
        etc...
    """
    extralogging = False
    max_errors = getattr(opts, 'max_errors', 0)
    expected_journo = getattr(opts, 'expected_journo', None)

    # remove dupes (eg often articles appear in more than one RSS feed)
    found = unique_articles(found)

    # randomise the order of articles, so that if the scraper does abort
    # due to too many errors, successive runs should be able to pick up
    # all the scrapable articles.
    random.shuffle(found)

    #assert(len(found) > 0)

    ukmedia.DBUG2("%d articles to scrape\n" % (len(found)))

    if opts.test:
        ukmedia.DBUG("DRY RUN\n")

    store = ArticleDB.ArticleDB()

    failcount = 0
    abortcount = 0
    newcount = 0
    had_count = 0
    rescrape_count = 0

    for context in found:
        try:
            known_urls = set((context['srcurl'], context['permalink']))
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue  # skip it - we've already got it
                else:
                    assert(len(got) == 1)

            #ukmedia.DBUG2(u"fetching %s\n" % (context['srcurl']))
            resp = urllib2.urlopen(context['srcurl'])

            # is the server sending a charset encoding?
            kwargs = {}
            content_type = resp.info().getheader('Content-Type', '')
            m = re.compile(r';\s*charset\s*=\s*([^;]*)', re.I).search(content_type)
            if m:
                kwargs['encoding'] = m.group(1)

            # grab the content
            html = resp.read()

            # add any URLs we were redirected via...
            # (NB: a vanilla urllib2 response has no .redirects attribute -
            # this relies on a redirect-recording opener set up elsewhere)
            for code, url in resp.redirects:
                known_urls.add(url)
                if code == 301:  # permanent redirect
                    context['permalink'] = url

            # check html for a rel="canonical" link:
            canonical_url = extract_canonical_url(html, context['permalink'])
            if canonical_url is not None:
                known_urls.add(canonical_url)
                context['permalink'] = canonical_url

            # strip off "?rss=yes" etc from permalink
            tidied_url = tidy_url(context['permalink'])
            if tidied_url != context['permalink']:
                context['permalink'] = tidied_url
                known_urls.add(tidied_url)

            context['urls'] = known_urls

            # check that all urls are OK (eg express.co.uk have a habit of
            # publishing borked ones for blogs)
            for url in known_urls:
                url.encode('utf-8')  # will raise an exception if dud

            # repeat the url-based existence check with the urls we now have
            # TODO: if so, add any new urls... maybe rescrape and update article?
            article_id = None
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue  # skip it - we've already got it
                else:
                    assert(len(got) == 1)
                    article_id = got[0]

            # some extra, last-minute context :-)
            context['lastscraped'] = datetime.now()

            art = extract(html, context, **kwargs)
            if art:
                # set the srcorg id for the article
                if 'srcorgname' in art and art['srcorgname'] is not None:
                    srcorg = Misc.GetOrgID(art['srcorgname'])
                else:
                    # no publication specified - look up using domain name
                    o = urlparse.urlparse(art['permalink'])
                    domain = o[1].lower()
                    srcorg = Publication.find_or_create(domain)
                art['srcorg'] = srcorg

                # resolve bylined authors to journo ids
                authors = Byline.CrackByline(art['byline'])
                attributed = []
                for author in authors:
                    attributed.append(Journo.find_or_create(author, art, expected_journo))
                art['journos'] = attributed

                if opts.test:
                    ukmedia.PrettyDump(art)

                if article_id:
                    # rescraping an existing article
                    art['id'] = article_id
                    article_id = store.upsert(art)
                    rescrape_count += 1
                else:
                    # a new article
                    article_id = store.upsert(art)
                    newcount += 1

                if opts.test:
                    DB.conn().rollback()
                else:
                    DB.conn().commit()

        except Exception, err:
            DB.conn().rollback()

            # always just bail out upon ctrl-c
            if isinstance(err, KeyboardInterrupt):
                raise

            failcount += 1
            # TODO: phase out NonFatal! just get the scraper to print out
            # a warning message instead
            if isinstance(err, ukmedia.NonFatal):
                continue

            report = traceback.format_exc()
            if 'title' in context:
                msg = u"FAILED (%s): '%s' (%s)" % (err, context['title'], context['srcurl'])
            else:
                msg = u"FAILED (%s): (%s)" % (err, context['srcurl'])
            ukmedia.DBUG(msg + "\n")
            ukmedia.DBUG2(report + "\n")
            ukmedia.DBUG2('-' * 60 + "\n")
            abortcount += 1
            if abortcount > max_errors:
                print >>sys.stderr, "Too many errors - ABORTING"
                raise
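

# ---------------------------------------------------------------------------
# The helpers below (extract_canonical_url, tidy_url, unique_articles) are
# used by scrape_articles() but defined elsewhere in the codebase. The
# sketches that follow are illustrative reconstructions only - minimal
# versions of what such helpers might look like, not the real
# implementations, which may well differ.

def extract_canonical_url(html, page_url):
    """Return the rel="canonical" URL declared in html, or None. (sketch)

    Assumes the canonical link appears as a <link rel="canonical"> tag;
    the href is resolved against page_url in case it is relative.
    """
    for pat in (
            r'<link[^>]+rel\s*=\s*["\']canonical["\'][^>]*href\s*=\s*["\']([^"\']+)["\']',
            r'<link[^>]+href\s*=\s*["\']([^"\']+)["\'][^>]*rel\s*=\s*["\']canonical["\']'):
        m = re.search(pat, html, re.I)
        if m:
            return urlparse.urljoin(page_url, m.group(1))
    return None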
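
def tidy_url(url):
    """Strip noise query params (eg "?rss=yes") from url. (sketch)

    The NOISE_PARAMS list here is an assumption for illustration; the
    real helper may filter a different set of parameters.
    """
    NOISE_PARAMS = ('rss', 'utm_source', 'utm_medium', 'utm_campaign')
    parts = urlparse.urlsplit(url)
    query = '&'.join(p for p in parts.query.split('&')
                     if p and p.split('=')[0] not in NOISE_PARAMS)
    # also drop any #fragment - it never identifies a distinct article
    return urlparse.urlunsplit((parts.scheme, parts.netloc, parts.path, query, ''))


def unique_articles(found):
    """Remove duplicate article contexts, keyed on permalink. (sketch)"""
    seen = set()
    uniq = []
    for context in found:
        if context['permalink'] not in seen:
            seen.add(context['permalink'])
            uniq.append(context)
    return uniq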
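
# Example driver, showing how a per-site scraper script might invoke
# scrape_articles(). FindArticles and Extract are hypothetical stand-ins
# for the site-specific discovery and extraction functions; the option
# names match the attributes scrape_articles() actually reads from opts.
if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-t', '--test', action='store_true', default=False,
                      help="dry run: scrape, but roll back all DB changes")
    parser.add_option('-f', '--force_rescrape', action='store_true', default=False,
                      help="rescrape articles already in the DB")
    parser.add_option('-m', '--max_errors', type='int', default=0,
                      help="number of errors tolerated before aborting")
    (opts, args) = parser.parse_args()

    found = FindArticles()  # hypothetical: build the list of article contexts
    scrape_articles(found, Extract, opts)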