Code example #1
import os
import subprocess
from datetime import datetime

# `models` (providing Article, Version, GIT_DIR) and GIT_PROGRAM come from
# the surrounding newsdiffs project.

def migrate_versions():
    # XXX: this hasn't been updated for having multiple git dirs
    git_output = subprocess.check_output([GIT_PROGRAM, 'log'],
                                         cwd=models.GIT_DIR).decode('utf-8')
    # `git log` separates commits with a blank line; split the output into
    # one block per commit, then strip the 'commit ' prefix the first
    # block keeps.
    commits = git_output.split('\n\ncommit ')
    commits[0] = commits[0][len('commit '):]
    print('beginning loop')
    versions = [x.v for x in models.Version.objects.all()]
    for i, commit in enumerate(commits):
        # each block: hash, author line, date line, blank line, change message
        (v, author, datestr, blank, changem) = commit.splitlines()
        if v in versions:
            continue
        words = changem.split()
        changekind, fname = words[0], words[-1]
        if changekind == 'Reformat':
            continue
        date = datetime.strptime(' '.join(datestr.split()[1:-1]),
                                 '%a %b %d %H:%M:%S %Y')

        if not os.path.exists(os.path.join(models.GIT_DIR, fname)):
            # file introduced accidentally
            continue

        # Try the URL as recorded, then with a trailing slash, then create
        # a new Article for it.
        url = 'http://%s' % fname
        try:
            article = models.Article.objects.get(url=url)
        except models.Article.DoesNotExist:
            url += '/'
            try:
                article = models.Article.objects.get(url=url)
            except models.Article.DoesNotExist:
                url = url[:-1]
                article = models.Article(url=url,
                                         last_update=date,
                                         last_check=date)
                if not article.publication():  # blogs aren't actually reasonable
                    continue

                article.save()

        text = subprocess.check_output([GIT_PROGRAM, 'show', v + ':' + fname],
                                       cwd=models.GIT_DIR)
        text = text.decode('utf-8')
        # stored article files begin with date, title, and byline lines
        (date2, title, byline) = text.splitlines()[:3]

        boring = False

        print('%d/%d' % (i, len(commits)), url, v, date, title, byline, boring)
        v = models.Version(article=article,
                           v=v,
                           date=date,
                           title=title,
                           byline=byline,
                           boring=boring)
        try:
            v.save()
        except models.IntegrityError:
            pass
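The parsing at the top of this function leans on the default `git log` layout: commit blocks separated by a blank line, each with a hash line, an author line, a date line, a blank line, and an indented one-line message. A minimal sketch of that assumption, with invented hashes, author, and messages:

from datetime import datetime

sample_log = (
    'commit 1111111111111111111111111111111111111111\n'
    'Author: A. Reporter <reporter@example.com>\n'
    'Date:   Mon Jan 2 03:04:05 2012 +0000\n'
    '\n'
    '    Update example.com/news/story\n'
    '\n'
    'commit 2222222222222222222222222222222222222222\n'
    'Author: A. Reporter <reporter@example.com>\n'
    'Date:   Tue Jan 3 04:05:06 2012 +0000\n'
    '\n'
    '    Reformat example.com/news/story'
)

commits = sample_log.split('\n\ncommit ')
commits[0] = commits[0][len('commit '):]
# Five lines per block, matching the unpack in migrate_versions.
v, author, datestr, blank, changem = commits[0].splitlines()
# Drop the 'Date:' prefix and the timezone offset, as the function does.
date = datetime.strptime(' '.join(datestr.split()[1:-1]),
                         '%a %b %d %H:%M:%S %Y')
assert changem.split()[0] == 'Update'
assert changem.split()[-1] == 'example.com/news/story'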
Code example #2
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in the DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url, git_dir=todays_git_dir).save()
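The len(url) > 255 guard only makes sense if the url column is capped at 255 characters. The project's actual model isn't shown here, but the usual Django shape would be something like the sketch below; on some backends (MySQL in non-strict mode, for one) an over-long value can be silently truncated rather than rejected:

from django.db import models

class Article(models.Model):
    # A 255-character cap is the presumed reason for the
    # len(url) > 255 guard in update_articles above.
    url = models.CharField(max_length=255)
    git_dir = models.CharField(max_length=255)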
Code example #3
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i + 1, len(all_urls), url))
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in the DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding!')
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
Code example #4
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i + 1, len(all_urls), url))
        # Looks like it skips URLs longer than 255?
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in the DB.
            continue
        # Is there an index on this column?
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding Article {0}'.format(url))
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
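On the two annotations above: filter(url=url).count() asks the database to count every matching row when only existence matters, and the loop issues one such query per candidate URL, so an index on the url column (db_index=True or unique=True on the field) is what would keep this tolerable. A sketch of the cheaper existence check, assuming the same models module as above:

# .exists() lets the database stop at the first matching row,
# where .count() forces it to tally all of them.
if not models.Article.objects.filter(url=url).exists():
    models.Article(url=url, git_dir=todays_git_dir).save()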
Code example #5
def update_articles():
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()
Code example #6
File: scraper.py Project: toyg/newsdiffs
def update_articles():
    for url in get_all_article_urls():
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()
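None of the variants above batch their existence checks, so every scraped URL costs a database round trip. A rewrite along these lines, not taken from the project, pulls the known URLs once and tests membership in memory; it stays reasonable as long as the url column fits comfortably in a set:

def update_articles():
    # One query up front instead of one per URL.
    existing = set(models.Article.objects.values_list('url', flat=True))
    for url in get_all_article_urls():
        if url not in existing:
            models.Article(url=url).save()
            existing.add(url)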