def parse_author(self, entry):
    """Lookup the author for the given entry."""
    def _remember_author(author):
        if author.email is not None and \
           author.email not in self._authors_by_email:
            self._authors_by_email[author.email] = author
        if author.username is not None and \
           author.username not in self._authors_by_username:
            self._authors_by_username[author.username] = author

    author = entry.find(atom.author)
    email = author.findtext(atom.email)
    username = author.findtext(atom.name)
    for extension in self.extensions:
        rv = extension.lookup_author(author, entry, username, email)
        if rv is not None:
            _remember_author(rv)
            return rv
    if email is not None and email in self._authors_by_email:
        return self._authors_by_email[email]
    if username in self._authors_by_username:
        return self._authors_by_username[username]
    author = Author(username, email)
    _remember_author(author)
    self.authors.append(author)
    return author
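#: parse_author above expects each object in self.extensions to provide a
#: lookup_author(author_element, entry, username, email) hook that returns
#: an Author to short-circuit the default lookup, or None to decline. A
#: minimal sketch of such an extension (the class and its mapping are
#: illustrative, not part of this module):
#
#   class StaticAuthorMap(object):
#       def __init__(self, mapping):
#           self.mapping = mapping  # username -> Author
#
#       def lookup_author(self, author_element, entry, username, email):
#           # Returning None lets the next extension, or the default
#           # by-email/by-username lookup, handle the entry.
#           return self.mapping.get(username)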
def _get_author(self, dependency):
    author = self._authors.get(dependency)
    if author is None:
        element = self._lookup_user(self._dependencies,
                                    id=str(dependency))[0]
        author = Author(element.findtext(zine.username),
                        element.findtext(zine.email),
                        element.findtext(zine.real_name),
                        element.findtext(zine.description),
                        element.findtext(zine.www),
                        element.findtext(zine.pw_hash),
                        _to_bool(element.findtext(zine.is_author)),
                        _pickle(element.findtext(zine.extra)))
        for privilege in element.findall(zine.privilege):
            p = self.app.privileges.get(privilege.text)
            if p is not None:
                author.privileges.add(p)
        self._authors[dependency] = author
        self.parser.authors.append(author)
    return author
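#: _get_author above assumes a module-level _to_bool helper that turns the
#: serialized <is_author> text back into a boolean. A minimal sketch -- the
#: exact tokens the exporter writes ('yes', 'true', '1') are an assumption,
#: not confirmed by this module:
#
#   def _to_bool(value):
#       # findtext returns None when the element is missing entirely.
#       return value is not None and \
#              value.strip().lower() in ('yes', 'true', '1')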
def import_livejournal(self, username, password, import_what=IMPORT_JOURNAL,
                       community='', security_custom=SECURITY_PROTECTED,
                       categories=None, getcomments=True):
    """Import from LiveJournal using specified parameters."""
    #: Use None instead of a mutable default argument, which would be
    #: shared across calls.
    if categories is None:
        categories = []

    def _decode(value):
        #: LiveJournal returns either xmlrpclib.Binary or str; normalize
        #: everything to unicode.
        if isinstance(value, xmlrpclib.Binary):
            return unicode(value.data, 'utf-8')
        return unicode(str(value), 'utf-8')

    yield _(u'<p>Beginning LiveJournal import. Attempting to login...</p>')
    if import_what != IMPORT_JOURNAL:
        usejournal = community
    else:
        usejournal = None
    lj = LiveJournalConnect(username, password, usejournal)
    result = lj.login(getmoods=0)
    authors = {username: Author(username=username, email='',
                                real_name=unicode(result['fullname'],
                                                  'utf-8'))}
    yield _(u'<p>Your name: <strong>%s</strong></p>') % \
        authors[username].real_name
    moodlist = dict([(int(m['id']), unicode(str(m['name']), 'utf-8'))
                     for m in result['moods']])

    result = lj.getusertags()
    tags = dict([(tag, Tag(gen_slug(tag), tag)) for tag in
                 [unicode(t['name'], 'utf-8') for t in result['tags']]])
    yield _(u'<p><strong>Tags:</strong> %s</p>') % _(u', ').join(tags.keys())

    ##result = lj.getdaycounts()
    ##daycounts = [(date(*strptime(item['date'], '%Y-%m-%d')[0:3]),
    ##              item['count']) for item in result['daycounts']]
    ##totalposts = sum([x[1] for x in daycounts])
    ##yield _(u'<p>Found <strong>%d</strong> posts on <strong>%d days'
    ##        u'</strong> between %s and %s.</p>') % (
    ##    totalposts, len(daycounts),
    ##    daycounts[0][0].strftime('%Y-%m-%d'),
    ##    daycounts[-1][0].strftime('%Y-%m-%d'))

    posts = {}

    # Process implemented as per
    # http://www.livejournal.com/doc/server/ljp.csp.entry_downloading.html
    yield _(u'<ul>')
    yield _(u'<li>Getting metadata...</li>')
    result = lj.syncitems()
    sync_items = []
    sync_total = int(result['total'])
    yield _(u'<li>%d items...</li>') % sync_total
    sync_items.extend(result['syncitems'])
    while len(sync_items) < sync_total:
        lastsync = max([parse_lj_date(item['time']) for item in
                        sync_items]).strftime('%Y-%m-%d %H:%M:%S')
        yield _(u'<li>Got %d items up to %s...</li>') % (len(sync_items),
                                                         lastsync)
        result = lj.syncitems(lastsync=lastsync)
        sync_items.extend(result['syncitems'])
    yield _(u'<li>Got all %d items.</li>') % len(sync_items)
    yield _(u'</ul>')

    #: Discard non-journal items.
    sync_items = [i for i in sync_items if i['item'].startswith('L-')]
    yield _(u'<p>Downloading <strong>%d</strong> entries...</p>') % \
        len(sync_items)

    # Track which items still need to be downloaded.
    sync_data = {}
    for item in sync_items:
        sync_data[int(item['item'][2:])] = {
            'downloaded': False,
            'time': parse_lj_date(item['time'])
        }

    # Start downloading bodies.
    sync_left = [sync_data[x] for x in sync_data
                 if sync_data[x]['downloaded'] is False]
    if sync_left:
        lastsync = (min([x['time'] for x in sync_left]) -
                    timedelta(seconds=1)).strftime('%Y-%m-%d %H:%M:%S')
    while len(sync_left) > 0:
        yield _(u'<p>Getting a batch...</p>')
        try:
            result = lj.getevents(selecttype='syncitems', lastsync=lastsync)
        except xmlrpclib.Fault, fault:
            if fault.faultCode == 406:
                # LJ doesn't like us. Go back one second and try again.
                yield _(u'<p>LiveJournal says we are retrying the same '
                        u'date and time too often. Trying again with the '
                        u'time set behind by one second.</p>')
                lastsync = (parse_lj_date(lastsync) -
                            timedelta(seconds=1)).strftime(
                                '%Y-%m-%d %H:%M:%S')
                continue
            else:
                yield _(u'<p>Process failed. LiveJournal says: '
                        u'(%d) %s</p>') % (fault.faultCode,
                                           fault.faultString)
                break

        yield _(u'<ol start="%d">') % (len(posts) + 1)
        for item in result['events']:
            if sync_data[item['itemid']]['downloaded'] is True:
                # Dupe, thanks to our lastsync time manipulation. Skip.
                continue
            sync_data[item['itemid']]['downloaded'] = True
            sync_data[item['itemid']]['item'] = item

            #: LiveJournal subjects may contain HTML tags. Strip them and
            #: convert HTML entities to Unicode equivalents.
            subject = _decode(item.get('subject', ''))
            subject = unescape(tag_re.sub('', ljuser_re.sub('\\2',
                                                            subject)))
            poster = item.get('poster', username)
            if poster != username and import_what != IMPORT_COMMUNITY_ALL:
                # Discard, since we don't want this.
                yield _(u'<li><strong>Discarded:</strong> %s '
                        u'<em>(by %s)</em></li>') % (subject, poster)
                continue
            if poster not in authors:
                authors[poster] = Author(poster, '', '')

            # Map LiveJournal security codes to Zine status flags.
            security = item.get('security', 'public')
            if security == 'usemask' and item['allowmask'] == 1:
                security = 'friends'
            if security == 'usemask':
                status = {
                    SECURITY_DISCARD: None,
                    SECURITY_PUBLIC: STATUS_PUBLISHED,
                    SECURITY_PROTECTED: STATUS_PROTECTED,
                    SECURITY_PRIVATE: STATUS_PRIVATE
                }[security_custom]
                if status is None:
                    yield _(u'<li><strong>Discarded (masked):</strong> '
                            u'%s</li>') % subject
                    continue
            else:
                status = {
                    'public': STATUS_PUBLISHED,
                    'friends': STATUS_PROTECTED,
                    'private': STATUS_PRIVATE,
                }[security]

            #: Read time as local timezone and then convert to UTC. Zine
            #: doesn't seem to like non-UTC timestamps in imports.
            pub_date = get_timezone().localize(
                parse_lj_date(item['eventtime'])).astimezone(UTC)

            itemtags = [t.strip() for t in
                        unicode(item['props'].get('taglist', ''),
                                'utf-8').split(',')]
            itemtags = [tags[t] for t in itemtags if t]

            extras = {}
            for prop in ('current_music', 'current_mood', 'current_coords',
                         'current_location', 'picture_keyword'):
                if prop in item['props']:
                    extras[prop] = _decode(item['props'][prop])
            if 'current_mood' not in extras and \
               'current_moodid' in item['props']:
                extras['current_mood'] = moodlist[int(
                    item['props']['current_moodid'])]
            extras['lj_post_id'] = item['itemid']
            extras['original_url'] = item['url']

            posts[item['itemid']] = Post(
                #: Generate slug. If there's no subject, use '-'+itemid.
                #: Why the prefix? Because if the user wants %year%/%month%/
                #: for the post url format and we end up creating a slug
                #: like 2003/12/1059, it will conflict with the archive
                #: access path format of %Y/%m/%d and the post will become
                #: inaccessible, since archive paths take higher priority
                #: than slugs in zine's urls.py.
                slug=gen_timestamped_slug(
                    gen_slug(subject) or ('-' + str(item['itemid'])),
                    'entry', pub_date),
                title=subject,
                link=item['url'],
                pub_date=pub_date,
                author=authors[poster],
                intro='',
                body=isinstance(item['event'], xmlrpclib.Binary)
                    and unicode(item['event'].data, 'utf-8')
                    or url_unquote_plus(str(item['event'])),
                tags=itemtags,
                categories=[Category(x) for x in categories],
                comments=[],  # Will be updated later.
                comments_enabled=not item['props'].get('opt_nocomments',
                                                       False),
                pings_enabled=False,  # LiveJournal did not support pings.
                uid='livejournal;%s;%d' % (usejournal or username,
                                           item['itemid']),
                parser=item['props'].get('opt_preformatted', False)
                    and 'html' or 'livejournal',
                status=status,
                extra=extras)
            yield _(u'<li>%s <em>(by %s on %s)</em></li>') % (
                subject, poster, pub_date.strftime('%Y-%m-%d %H:%M'))

        # Done processing batch.
        yield _(u'</ol>')
        sync_left = [sync_data[x] for x in sync_data
                     if sync_data[x]['downloaded'] is False]
        if sync_left:
            lastsync = (min([x['time'] for x in sync_left]) -
                        timedelta(seconds=1)).strftime('%Y-%m-%d %H:%M:%S')
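#: import_livejournal leans on a parse_lj_date helper for every timestamp
#: LiveJournal hands back. The server format is '%Y-%m-%d %H:%M:%S' -- the
#: same format the strftime calls above produce. A minimal sketch of the
#: helper under that assumption (the real module may define it elsewhere):
#
#   from datetime import datetime
#   from time import strptime
#
#   def parse_lj_date(value):
#       # Accept str or unicode; LiveJournal timestamps are pure ASCII.
#       return datetime(*strptime(str(value), '%Y-%m-%d %H:%M:%S')[:6])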
def import_quills(self, blogurl, username, password):
    """Import from Quills using Zope's XML-RPC interface."""
    yield _(u'<p>Beginning Quills import. Attempting to get data...</p>')

    urlparts = urlparse.urlsplit(blogurl)
    urlnetloc = urlparts.netloc
    urlpath = urlparts.path
    if not urlpath.endswith('/'):
        urlpath += '/'  # Trailing slash required for XML-RPC
    if username:
        #: We're using simple HTTP auth, which isn't the smartest thing to
        #: do, but Plone's default cookie-auth system is just a base64
        #: encoding of username:password, which isn't any better. Quills
        #: runs on Plone 2.1 and 2.5, neither of which shipped with a more
        #: secure auth mechanism, so we'll just go with what works. HTTP
        #: auth fallback has been supported by every Zope 2.x release.
        urlnetloc = '%s:%s@%s' % (username, password, urlnetloc)
    useblogurl = urlparse.urlunsplit((urlparts.scheme, urlnetloc, urlpath,
                                      '', ''))
    conn = xmlrpclib.ServerProxy(useblogurl)
    title = conn.Title()
    data = conn.zine_export()
    yield _(u'<p>Got data. Parsing for weblog entries and replies.</p>')

    tags = {}
    posts = {}
    authors = {}

    yield _(u'<ol>')
    for entry in data:
        itemtags = []
        for tag in entry['tags']:
            if tag in tags:
                itemtags.append(tags[tag])
            else:
                newtag = Tag(gen_slug(tag), tag)
                tags[tag] = newtag
                itemtags.append(newtag)
        if entry['author'] in authors:
            author = authors[entry['author']]
        else:
            author = Author(entry['author'], '', '')
            authors[entry['author']] = author
        status = PLONE_STATUS.get(entry['status'], STATUS_PUBLISHED)
        body = reunicode(entry['body'])
        description = reunicode(entry['description'])
        subject = reunicode(entry['title'])
        parser = PLONE_PARSERS.get(entry['format'], 'zeml')
        pub_date = parse_plone_date(entry['date'])
        if description:
            #: Assume description is text/plain. Anything else is unlikely.
            if parser in ['zeml', 'html']:
                body = u'<intro><p>%s</p></intro>%s' % (description, body)
            else:
                # We don't know how this parser works, so just insert
                # description before body, with a blank line in between.
                body = u'%s\n\n%s' % (description, body)
        comments = {}
        for comment in entry['replies']:
            c_body = reunicode(comment['body'])
            c_author = comment['author']
            if c_author in authors:
                c_author = authors[c_author]
            #: Fix for Jace's anon comments hack
            elif c_author.startswith('!'):
                c_author = c_author[1:]
            c_subject = reunicode(comment['title'])
            if c_subject:
                c_body = u'%s\n\n%s' % (c_subject, c_body)
            comments[comment['id']] = Comment(
                author=c_author,
                body=c_body,
                pub_date=parse_plone_date(
                    comment['date']).astimezone(UTC),
                author_email=None,
                author_url=None,
                remote_addr=None,
                parent=comment['parent'],
                parser='text',
                status=COMMENT_MODERATED)
        # Re-thread comments.
        for comment in comments.values():
            comment.parent = comments.get(comment.parent, None)
        posts[entry['id']] = Post(
            slug=gen_timestamped_slug(entry['id'], 'entry', pub_date),
            title=subject,
            link=entry['url'],
            pub_date=pub_date.astimezone(UTC),
            author=authors[entry['author']],
            intro=u'',
            body=body,
            tags=itemtags,
            categories=[],
            comments=comments.values(),
            comments_enabled=entry['allow_comments'],
            pings_enabled=True,
            uid=entry['id'],
            parser=parser,
            content_type='entry',
            status=status)
        yield _(u'<li><strong>%s</strong> (by %s; %d comments)</li>') % (
            subject, author.username, len(comments))
    yield _(u'</ol>')
    self.enqueue_dump(Blog(title, blogurl, '', 'en', tags.values(), [],
                           posts.values(), authors.values()))
    flash(_(u'Added imported items to queue.'))
    yield _(u'<p><strong>All done.</strong></p>')
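#: import_quills passes every string field from the Zope XML-RPC response
#: through a reunicode helper, since Zope marshals text as a mix of str and
#: unicode. A minimal sketch, assuming UTF-8 for byte strings (the encoding
#: and the 'replace' error policy are assumptions, not confirmed by this
#: module):
#
#   def reunicode(value):
#       if isinstance(value, unicode):
#           return value
#       return unicode(str(value), 'utf-8', 'replace')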
def get_author(name):
    """Return the cached Author for `name`, creating it on first use."""
    if name:
        author = authors.get(name)
        if author is None:
            author = authors[name] = Author(name, None)
        return author
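# Illustrative use of get_author: it memoizes Author objects in the
# enclosing `authors` dict, so repeated lookups return the same instance,
# while a falsy name silently yields None (sample names are hypothetical):
#
#   authors = {}
#   first = get_author(u'jace')
#   again = get_author(u'jace')
#   assert first is again
#   assert get_author(u'') is None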