Example #1
def strip_wrapping(html):
    """
    Removes the wrapping that might have resulted when using get_html_tree().
    """
    if html.startswith('<div>') and html.endswith('</div>'):
        html = html[5:-6]
    return html.strip()
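A minimal usage sketch, for illustration only (strip_wrapping only unwraps a single outer <div> pair and otherwise just strips whitespace):

>>> strip_wrapping('<div> <p>Hello</p> </div>')
'<p>Hello</p>'
>>> strip_wrapping('<p>Hello</p>')
'<p>Hello</p>'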
Example #2
    def test_home_page_returns_correct_html(self):
        # We need Karyn in the DB in order to log her in.
        # load_model_objects returns a `dot_notation` dict from which we can
        # use all of the model objects, as seen in the print statement below.
        self.client.login(username='******', password='******')
        response = self.client.get('/')
        html = response.content.decode('utf8').rstrip()
        self.assertTrue(html.startswith('<!DOCTYPE html>'))
        self.assertIn('<title>factotum</title>', html)
        self.assertTrue(html.endswith('</html>'))
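The comment above refers to a load_model_objects() fixture helper that is not part of this snippet, and the credentials are redacted. A minimal sketch of the setup the test implies, assuming Django's built-in User model and purely placeholder credentials, might look like:

    def setUp(self):
        # Hypothetical sketch only: create the user ("Karyn") that the test logs in as;
        # the real project builds its fixtures with load_model_objects() (not shown).
        from django.contrib.auth.models import User
        self.user = User.objects.create_user(username='Karyn', password='placeholder')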
Example #3
def sanitize_fragment(html):
    '''
    #html5lib reorders arguments, so not usable
    import html5lib
    return html5lib.parseFragment(html).toxml().decode('utf-8')
    '''
    if not html:
        return u''
    import lxml.html
    body = lxml.html.document_fromstring(html).find('body')
    # slice off the surrounding <body> and </body> tags that document_fromstring adds
    html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
    # unwrap a single enclosing paragraph, if present
    if html.startswith('<p>') and html.endswith('</p>'):
        html = html[3:-4]
    return html
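For illustration, a quick doctest-style check of the unwrapping behaviour (assuming lxml is installed); empty input simply returns an empty string:

>>> sanitize_fragment('<p>Hello <b>world</b></p>')
'Hello <b>world</b>'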
Example #4
# module-level imports required by convertAllData
import glob
import os
import shutil

import lxml.html
import psycopg2
import psycopg2.extras


def convertAllData(outputCsv, outputMedia, rootDir, origMediaPrefix, dbName='we_import'):

	# connect to db
	#connection = psycopg2.connect("dbname=we_production")
	connection = psycopg2.connect('dbname='+dbName)
	cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)

	origDocPrefix = origMediaPrefix + '/docs/0000/'
	origImagePrefix = origMediaPrefix + '/images/0000/'

	# grab table data
	crossRef = {}
	allEntries = grabData(cursor, 'entries', crossRef)
	allPages = grabData(cursor, 'pages', crossRef)
	allImages = grabData(cursor, 'images', crossRef)
	allDocuments = grabData(cursor, 'documents', crossRef)
	allSpecies = grabData(cursor, 'species', crossRef)
	allSections = grabData(cursor, 'sections', crossRef)
	allEvents = grabData(cursor, 'events', crossRef)
	allFaqs = grabData(cursor,'faqs', crossRef)
	allVersions = grabData(cursor,'versions', crossRef)
	allAlerts = grabData(cursor,'alerts', crossRef)
	allLocations = grabData(cursor,'locations', crossRef)
	rawEntriesSections = grabData(cursor, 'entries_sections', crossRef)
	rawAlertsSections = grabData(cursor,'alerts_sections', crossRef)

	# clean up database connection
	cursor.close()
	connection.close()

	# create entry-section lookup table
	entryToSection = {}
	for raw in rawEntriesSections:
		if raw['entry_id'] not in entryToSection:
			entryToSection[raw['entry_id']] = [raw['section_id']]
		else:
			entryToSection[raw['entry_id']].append(raw['section_id'])

	# create alert-section lookup table
	alertToSection = {}
	for raw in rawAlertsSections:
		if raw['alert_id'] not in alertToSection:
			alertToSection[raw['alert_id']] = [raw['section_id']]
		else:
			alertToSection[raw['alert_id']].append(raw['section_id'])

	# create media dirs (remove any previous output first)
	if os.path.isdir(outputMedia):
		shutil.rmtree(outputMedia)
	os.makedirs(outputMedia)
	os.makedirs(outputMedia + '/images')
	os.makedirs(outputMedia + '/documents')

	# find and copy latest version of each media file
	versionLookup = {}
	docFileMap = {}
	for version in crossRef['versions'].values():
		docId = version.document_id
		versions = [(v.id,v.updated_at) for v in crossRef['versions'].values() if v.document_id==docId]
		ids,dates = zip(*versions)
		latestId = ids[dates.index(max(dates))]
		fileNameOrig = crossRef['versions'][latestId].filename
		fileName = convertFileName(fileNameOrig)

		fileNameBase = fileName
		postNum = 1
		while fileName in versionLookup:
			postNum += 1
			name,ext = os.path.splitext(fileNameBase)
			fileName = name + '-' + str(postNum) + ext
		version.out_filename = fileName
		fileNameOrig = '%s%04d/%s' % (origDocPrefix, latestId, fileNameOrig)
		fileNameNew = outputMedia + '/documents/' + fileName
		docFileMap[fileName] = fileNameOrig
		# TODO shutil.copy(fileNameOrig, fileNameNew)
		#print 'copied',fileNameOrig,'to',fileNameNew

	# index image files
	imageFileMap = {}
	for f in glob.glob(origImagePrefix + '/*/*'):
		fileName = convertFileName(os.path.basename(f))
		destFile = outputMedia + '/images/' + fileName
		imageFileMap[fileName] = f

	# convert titles and set tags in all tables
	titleLookup = {}
	for name,table in crossRef.iteritems():
		for curId,item in table.iteritems():
			# convert title
			if hasattr(item,'title') and item.title is not None:
				title = item.title
			elif hasattr(item,'name') and item.name is not None:
				title = item.name
			elif hasattr(item,'filename') and item.filename is not None:
				title = item.filename
			elif hasattr(item,'common_name') and item.common_name is not None:
				title = item.common_name
			else:
				# fall back to an empty title so convertTitle() always gets a value
				title = ''
			titleNew = convertTitle(title)
			titleNewBase = titleNew
			postNum = 1
			while titleNew in titleLookup:
				postNum += 1
				titleNew = titleNewBase + '-' + str(postNum)
			item.out_title = title
			item.out_title_link = titleNew
			titleLookup[titleNew] = True
			
			# convert date
			if hasattr(item,'updated_at'):
				item.out_date = item.updated_at

			# set tags
			if name=='entries':
				if curId in entryToSection:
					item.sections = entryToSection[curId]
				else:
					item.sections = []
			elif name=='alerts':
				if curId in alertToSection:
					item.sections = alertToSection[curId]
				else:
					item.sections = []
			elif name=='sections':
				item.sections = [curId]
		
	# translate links in html
	mediaFiles = []
	contentTypes = ['entries','pages','sections','locations','species','events','faqs','alerts']
	for curType in contentTypes:
		for curId,entry in crossRef[curType].iteritems():
			
			# get correct html field
			if hasattr(entry,'body_html'):
				htmlOrig = entry.body_html
			elif hasattr(entry,'description_html'):
				htmlOrig = entry.description_html
			else:
				htmlOrig = None
			if not htmlOrig:
				entry.out_content = ''
				continue

			# iterate over and translate each link
			tree = lxml.html.fromstring(htmlOrig.decode('utf-8'))
			links = tree.iterlinks()
			for link in links:
				linkBefore = link[0].get(link[1])
				replaceLink(link, crossRef, rootDir, mediaFiles)
				linkAfter = link[0].get(link[1])
				print 'TRANSLATED',linkBefore,'TO',linkAfter
				
			# form new html string
			html = lxml.html.tostring(tree)
			if html.endswith('</div>'):
				html = html[0:-6]
			if html.startswith('<div>'):
				html = html[5:]
			entry.out_content = html
			# debug dump for entries containing a right single quote (U+2019)
			if u'\u2019' in htmlOrig.decode('utf-8') and 'path on the seawall' in htmlOrig:
				print '**********'
				print htmlOrig
				print '++++++++++'
				print html
				#sys.exit(-1)

	# find and copy images
	for media in mediaFiles:
		if media.isImage:
			if media.fileName in imageFileMap:
				destFile = outputMedia + '/images/' + media.fileName
				shutil.copy(imageFileMap[media.fileName], destFile)
				print 'copied image', imageFileMap[media.fileName], media.fileName
			else:
				print 'IMGFILE BAD', media.fileName
		else:
			if media.fileName in docFileMap:
				destFile = outputMedia + '/documents/' + media.fileName
				shutil.copy(docFileMap[media.fileName], destFile)
				print 'copied doc', docFileMap[media.fileName], media.fileName
			else:
				print 'DOCFILE BAD', media.fileName

			
	# collect all items
	allItems = []
	for ref in [crossRef[contentType] for contentType in contentTypes]:
		allItems += ref.values()

	# add remaining fields
	curId = 1
	for item in allItems:
		item.out_id = 10000 + curId
		curId += 1
		item.out_tags = []
		if hasattr(item,'sections'):
			item.out_tags = [crossRef['sections'][tag].title for tag in item.sections]
			print 'TAGS',item.out_tags
		item.out_thumb = '' # TODO: thumb

	# output csv
	f = open(outputCsv,'w')
	f.write('post_id,post_name,post_type,post_date,post_title,post_content,post_status,post_category,post_tags,post_thumbnail,news_summary\n')
	for item in allItems:
		f.write(createCsvRow(item))
	f.close()
	print 'ALL DONE, wrote', len(allItems), 'records'