def migrate_versions():
    # XXX this hasn't been updated for having multiple git dirs
    git_output = subprocess.check_output([GIT_PROGRAM, 'log'],
                                         cwd=models.GIT_DIR)
    # Split the log into one block per commit; the first block keeps its
    # leading 'commit ' prefix, so strip it off.
    commits = git_output.split('\n\ncommit ')
    commits[0] = commits[0][len('commit '):]
    print 'beginning loop'
    d = {}
    versions = [x.v for x in models.Version.objects.all()]
    for i, commit in enumerate(commits):
        (v, author, datestr, blank, changem) = commit.splitlines()
        if v in versions:
            continue
        fname = changem.split()[-1]
        changekind = changem.split()[0]
        if changekind == 'Reformat':
            continue
        # Drop the leading 'Date:' token and the trailing timezone offset.
        date = datetime.strptime(' '.join(datestr.split()[1:-1]),
                                 '%a %b %d %H:%M:%S %Y')

        if not os.path.exists(os.path.join(models.GIT_DIR, fname)):
            # file introduced accidentally
            continue

        url = 'http://%s' % fname
        try:
            article = models.Article.objects.get(url=url)
        except models.Article.DoesNotExist:
            url += '/'
            try:
                article = models.Article.objects.get(url=url)
            except models.Article.DoesNotExist:
                url = url[:-1]
                article = models.Article(url=url, last_update=date,
                                         last_check=date)
                if not article.publication():  # blogs aren't actually reasonable
                    continue
                article.save()

        text = subprocess.check_output([GIT_PROGRAM, 'show', v + ':' + fname],
                                       cwd=models.GIT_DIR)
        text = text.decode('utf-8')
        (date2, title, byline) = text.splitlines()[:3]
        boring = False

        print '%d/%d' % (i, len(commits)), url, v, date, title, byline, boring

        v = models.Version(article=article, v=v, date=date, title=title,
                           byline=byline, boring=boring)
        try:
            v.save()
        except models.IntegrityError:
            pass
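# A minimal, self-contained sketch (my assumption, not part of the original
# module) of the commit-block shape migrate_versions() depends on: after
# splitting the `git log` output on '\n\ncommit ', each block must be exactly
# five lines (hash, Author line, Date line, a blank line, and a one-line
# message whose first token is the change kind and whose last token is the
# stored file path). The sample block and helper below are invented for
# illustration only.
from datetime import datetime

_SAMPLE_BLOCK = (
    '0123456789abcdef0123456789abcdef01234567\n'
    'Author: Example Scraper <scraper@example.com>\n'
    'Date:   Mon Sep 3 12:00:00 2012 -0400\n'
    '\n'
    '    Update www.example.com/2012/09/03/some-article/index.html'
)

def _parse_commit_block(commit):
    # Same unpacking and date parsing as migrate_versions() above.
    (v, author, datestr, blank, changem) = commit.splitlines()
    date = datetime.strptime(' '.join(datestr.split()[1:-1]),
                             '%a %b %d %H:%M:%S %Y')
    return (v, date, changem.split()[0], changem.split()[-1])

# _parse_commit_block(_SAMPLE_BLOCK) would yield:
# ('0123456789abcdef0123456789abcdef01234567',
#  datetime(2012, 9, 3, 12, 0),
#  'Update',
#  'www.example.com/2012/09/03/some-article/index.html')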
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url, git_dir=todays_git_dir).save()
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i + 1, len(all_urls), url))
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding!')
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i + 1, len(all_urls), url))
        # Looks like it skips URLs longer than 255?
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in DB.
            continue
        # Is there an index on this column?
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding Article {0}'.format(url))
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
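# A hedged note on the two questions in the comments above (my reading, not
# the original authors'): filter(url=url).count() issues a full COUNT query,
# whereas Django's QuerySet.exists() can stop at the first matching row, so
# the membership test could be written as below. Whether the url column is
# actually indexed depends on the Article model definition, which is not
# shown here; this helper is a hypothetical alternative, not project code.
def _article_exists(url):
    return models.Article.objects.filter(url=url).exists()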
def update_articles():
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()
def update_articles():
    for url in get_all_article_urls():
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()
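# A sketch of an equivalent bulk formulation of the loop above (an invented
# alternative, not the project's code): fetch the already-stored URLs in a
# single query, then hit the database only for genuinely new articles. This
# trades one large query for N existence checks.
def _update_articles_bulk():
    known = set(models.Article.objects.values_list('url', flat=True))
    for url in get_all_article_urls():
        if url not in known:
            models.Article(url=url).save()
            known.add(url)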