def strip_wrapping(html):
    """
    Remove the <div>...</div> wrapper that get_html_tree() may have added,
    then trim surrounding whitespace.
    """
    prefix, suffix = '<div>', '</div>'
    if html.startswith(prefix) and html.endswith(suffix):
        html = html[len(prefix):-len(suffix)]
    return html.strip()
def test_home_page_returns_correct_html(self):
    """Smoke-test that the home page renders the expected HTML document."""
    # We need Karyn in the DB in order to log her in.
    # load_model_objects returns a `dot_notation` dict which we can
    # use all of the model objects from, seen in the print stmnt below.
    self.client.login(username='******', password='******')
    page = self.client.get('/').content.decode('utf8').rstrip()
    self.assertTrue(page.startswith('<!DOCTYPE html>'))
    self.assertIn('<title>factotum</title>', page)
    self.assertTrue(page.endswith('</html>'))
def sanitize_fragment(html):
    """
    Normalize an HTML fragment via lxml and strip the wrapping tags.

    lxml's document_fromstring() wraps the fragment in a full document;
    this returns just the <body> contents, additionally unwrapping a
    single enclosing <p> if present.

    Returns an empty (unicode) string for falsy input.
    """
    # NOTE: html5lib.parseFragment was tried here but it reorders
    # arguments/attributes, so lxml is used instead.
    if not html:
        return u''
    import lxml.html
    body = lxml.html.document_fromstring(html).find('body')
    # tostring() yields b'<body>...</body>'; slice off the 6-byte open
    # and 7-byte close tag before decoding.
    html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
    if html.startswith('<p>') and html.endswith('</p>'):
        html = html[3:-4]
    return html
def convertAllData(outputCsv, outputMedia, rootDir, origMediaPrefix, dbName='we_import'): # connect to db #connection = psycopg2.connect("dbname=we_production") connection = psycopg2.connect('dbname='+dbName) cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor) origDocPrefix = origMediaPrefix + '/docs/0000/' origImagePrefix = origMediaPrefix + '/images/0000/' # grab table data crossRef = {} allEntries = grabData(cursor, 'entries', crossRef) allPages = grabData(cursor, 'pages', crossRef) allImages = grabData(cursor, 'images', crossRef) allDocuments = grabData(cursor, 'documents', crossRef) allSpecies = grabData(cursor, 'species', crossRef) allSections = grabData(cursor, 'sections', crossRef) allEvents = grabData(cursor, 'events', crossRef) allFaqs = grabData(cursor,'faqs', crossRef) allVersions = grabData(cursor,'versions', crossRef) allAlerts = grabData(cursor,'alerts', crossRef) allLocations = grabData(cursor,'locations', crossRef) rawEntriesSections = grabData(cursor, 'entries_sections', crossRef) rawAlertsSections = grabData(cursor,'alerts_sections', crossRef) # clean up database connection cursor.close() connection.close() # create entry-section lookup table entryToSection = {} for raw in rawEntriesSections: if raw['entry_id'] not in entryToSection: entryToSection[raw['entry_id']] = [raw['section_id']] else: entryToSection[raw['entry_id']].append(raw['section_id']) # create alert-section lookup table alertToSection = {} for raw in rawAlertsSections: if raw['alert_id'] not in alertToSection: alertToSection[raw['alert_id']] = [raw['section_id']] else: alertToSection[raw['alert']].append(raw['section_id']) # create media dirs try: shutil.rmtree(outputMedia) os.makedirs(outputMedia) os.makedirs(outputMedia + '/images') os.makedirs(outputMedia + '/documents') except: pass # find and copy latest version of each media file versionLookup = {} docFileMap = {} for version in crossRef['versions'].values(): docId = version.document_id versions = 
[(v.id,v.updated_at) for v in crossRef['versions'].values() if v.document_id==docId] ids,dates = zip(*versions) latestId = ids[dates.index(max(dates))] fileNameOrig = crossRef['versions'][latestId].filename fileName = convertFileName(fileNameOrig) fileNameBase = fileName postNum = 1 while fileName in versionLookup: postNum += 1 name,ext = os.path.splitext(fileNameBase) fileName = name + '-' + str(postNum) + ext version.out_filename = fileName fileNameOrig = '%s%04d/%s' % (origDocPrefix, latestId, fileNameOrig) fileNameNew = outputMedia + '/documents/' + fileName docFileMap[fileName] = fileNameOrig # TODO shutil.copy(fileNameOrig, fileNameNew) #print 'copied',fileNameOrig,'to',fileNameNew # index image files imageFileMap = {} for f in glob.glob(origImagePrefix + '/*/*'): fileName = convertFileName(os.path.basename(f)) destFile = outputMedia + '/images/' + fileName imageFileMap[fileName] = f # convert titles and set tags in all tables titleLookup = {} for name,table in crossRef.iteritems(): for curId,item in table.iteritems(): # convert title if hasattr(item,'title') and item.title is not None: title = item.title elif hasattr(item,'name') and item.name is not None: title = item.name elif hasattr(item,'filename') and item.filename is not None: title = item.filename elif hasattr(item,'common_name') and item.common_name is not None: title = item.common_name titleNew = convertTitle(title) titleNewBase = titleNew postNum = 1 while titleNew in titleLookup: postNum += 1 titleNew = titleNewBase + '-' + str(postNum) item.out_title = title item.out_title_link = titleNew titleLookup[titleNew] = True # convert date if hasattr(item,'updated_at'): item.out_date = item.updated_at # set tags if name=='entries': if curId in entryToSection: item.sections = entryToSection[curId] else: item.sections = [] elif name=='actions': if curId in actionToSection: item.sections = actionToSection[curId] else: item.sections = [] elif name=='sections': item.sections = [curId] # translate links in 
html mediaFiles = [] contentTypes = ['entries','pages','sections','locations','species','events','faqs','alerts'] for curType in contentTypes: for curId,entry in crossRef[curType].iteritems(): # get correct html field if hasattr(entry,'body_html'): htmlOrig = entry.body_html elif hasattr(entry,'description_html'): htmlOrig = entry.description_html if not htmlOrig: entry.out_content = '' continue # iterate over and translate each link tree = lxml.html.fromstring(htmlOrig.decode('utf-8')) links = tree.iterlinks() for link in links: linkBefore = link[0].get(link[1]) replaceLink(link, crossRef, rootDir, mediaFiles) linkAfter = link[0].get(link[1]) print 'TRANSLATED',linkBefore,'TO',linkAfter # form new html string html = lxml.html.tostring(tree) if html.endswith('</div>'): html = html[0:-6] if html.startswith('<div>'): html = html[5:] entry.out_content = html if '\x2019' in htmlOrig and 'path on the seawall' in htmlOrig: print '**********' print htmlOrig print '++++++++++' print html #sys.exit(-1) # find and copy images for media in mediaFiles: if media.isImage: if media.fileName in imageFileMap: destFile = outputMedia + '/images/' + media.fileName shutil.copy(imageFileMap[media.fileName], destFile) print 'copied image', imageFileMap[media.fileName], media.fileName else: print 'IMGFILE BAD', media.fileName else: if media.fileName in docFileMap: destFile = outputMedia + '/documents/' + media.fileName shutil.copy(docFileMap[media.fileName], destFile) print 'copied doc', docFileMap[media.fileName], media.fileName else: print 'DOCFILE BAD', media.fileName # collect all items allItems = [] for ref in [crossRef[contentType] for contentType in contentTypes]: allItems += ref.values() # add remaining fields curId = 1 for item in allItems: item.out_id = 10000 + curId curId += 1 item.out_tags = [] if hasattr(item,'sections'): item.out_tags = [crossRef['sections'][tag].title for tag in item.sections] print 'TAGS',item.out_tags item.out_thumb = '' # TODO: thumb # output csv f = 
open(outputCsv,'w') f.write('post_id,post_name,post_type,post_date,post_title,post_content,post_status,post_category,post_tags,post_thumbnail,news_summary\n') for item in allItems: f.write(createCsvRow(item)) f.close() print 'ALL DONE, wrote', len(allItems), 'records'