def hentry_to_entry(hentry, feed, backfill, now):
    permalink = url = hentry.get('url')
    uid = hentry.get('uid') or url
    if not uid:
        return

    # hentry = mf2util.interpret(mf2py.Parser(url=url).to_dict(), url)
    # permalink = hentry.get('url') or url
    # uid = hentry.get('uid') or uid

    title = hentry.get('name')
    content = hentry.get('content')
    if not content:
        content = title
        title = None

    published = hentry.get('published')
    updated = hentry.get('updated')

    # retrieved time is now unless we're backfilling old posts
    retrieved = now
    if backfill and published:
        retrieved = published

    entry = Entry(
        uid=uid,
        retrieved=retrieved,
        permalink=permalink,
        published=published,
        updated=updated,
        title=title,
        content=content,
        content_cleaned=util.clean(content),
        author_name=hentry.get('author', {}).get('name'),
        author_photo=hentry.get('author', {}).get('photo')
        or (feed and fallback_photo(feed.origin)),
        author_url=hentry.get('author', {}).get('url'))

    for prop in 'in-reply-to', 'like-of', 'repost-of', 'syndication':
        value = hentry.get(prop)
        if value:
            entry.set_property(prop, value)

    return entry
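
# fallback_photo() is called above (and in the functions below) but is not
# defined in this excerpt. A minimal sketch of what it might look like,
# assuming it derives a stand-in author photo from the feed origin's favicon;
# the project's real implementation may differ:
import urllib.parse

def fallback_photo(origin):
    # use the origin domain's favicon as a stand-in author photo
    domain = urllib.parse.urlparse(origin).netloc
    return 'https://www.google.com/s2/favicons?domain=' + domain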
def process_xml_feed_for_new_entries(feed, content, backfill, now):
    current_app.logger.debug('fetching xml feed: %s', feed)

    parsed = feedparser.parse(content, response_headers={
        'content-location': feed.feed,
    })
    feed_props = parsed.get('feed', {})
    default_author_url = feed_props.get('author_detail', {}).get('href')
    default_author_name = feed_props.get('author_detail', {}).get('name')
    default_author_photo = feed_props.get('logo')

    current_app.logger.debug('found {} entries'.format(len(parsed.entries)))

    # work from the bottom up (oldest first, usually)
    for p_entry in reversed(parsed.entries):
        current_app.logger.debug('processing entry {}'.format(
            str(p_entry)[:256]))
        permalink = p_entry.get('link')
        uid = p_entry.get('id') or permalink
        if not uid:
            continue

        if 'updated_parsed' in p_entry:
            updated = datetime.datetime.fromtimestamp(
                time.mktime(p_entry.updated_parsed))
        else:
            updated = None

        if 'published_parsed' in p_entry:
            published = datetime.datetime.fromtimestamp(
                time.mktime(p_entry.published_parsed))
        else:
            published = updated

        retrieved = now
        if backfill and published:
            retrieved = published

        title = p_entry.get('title')

        content = None
        content_list = p_entry.get('content')
        if content_list:
            content = content_list[0].value
        else:
            content = p_entry.get('summary')

        if title and content:
            title_trimmed = title.rstrip('...').rstrip('…')
            if content.startswith(title_trimmed):
                title = None

        for link in p_entry.get('links', []):
            if link.type == 'audio/mpeg' or link.type == 'audio/mp3':
                audio = AUDIO_ENCLOSURE_TMPL.format(href=link.get('href'))
                content = (content or '') + audio
            if (link.type == 'video/x-m4v' or link.type == 'video/x-mp4'
                    or link.type == 'video/mp4'):
                video = VIDEO_ENCLOSURE_TMPL.format(href=link.get('href'))
                content = (content or '') + video

        entry = Entry(
            published=published,
            updated=updated,
            uid=uid,
            permalink=permalink,
            retrieved=retrieved,
            title=p_entry.get('title'),
            content=content,
            content_cleaned=util.clean(content),
            author_name=p_entry.get('author_detail', {}).get('name')
            or default_author_name,
            author_url=p_entry.get('author_detail', {}).get('href')
            or default_author_url,
            author_photo=default_author_photo or fallback_photo(feed.origin))
        yield entry
def process_xml_feed_for_new_entries(feed, content, backfill, now):
    current_app.logger.debug('fetching xml feed: %s', str(feed)[:32])

    parsed = feedparser.parse(content, response_headers={
        'content-location': feed.feed,
    })
    feed_props = parsed.get('feed', {})
    default_author_url = feed_props.get('author_detail', {}).get('href')
    default_author_name = feed_props.get('author_detail', {}).get('name')
    default_author_photo = feed_props.get('logo')

    current_app.logger.debug('found %d entries', len(parsed.entries))

    # work from the bottom up (oldest first, usually)
    for p_entry in reversed(parsed.entries):
        current_app.logger.debug('processing entry %s', str(p_entry)[:32])
        permalink = p_entry.get('link')
        uid = p_entry.get('id') or permalink
        if not uid:
            continue

        if 'updated_parsed' in p_entry and p_entry.updated_parsed:
            updated = datetime.datetime.fromtimestamp(
                time.mktime(p_entry.updated_parsed))
        else:
            updated = None

        if 'published_parsed' in p_entry and p_entry.published_parsed:
            published = datetime.datetime.fromtimestamp(
                time.mktime(p_entry.published_parsed))
        else:
            published = updated

        retrieved = now
        if backfill and published:
            retrieved = published

        title = p_entry.get('title')

        content = None
        content_list = p_entry.get('content')
        if content_list:
            content = content_list[0].value
        else:
            content = p_entry.get('summary')

        if title and content:
            title_trimmed = title.rstrip('...').rstrip('…')
            if content.startswith(title_trimmed):
                title = None

        for link in p_entry.get('links', []):
            link_type = link.get('type')
            if link_type in ['audio/mpeg', 'audio/mp3']:
                audio = AUDIO_ENCLOSURE_TMPL.format(href=link.get('href'))
                content = (content or '') + audio
            if link_type in ['video/x-m4v', 'video/x-mp4', 'video/mp4']:
                video = VIDEO_ENCLOSURE_TMPL.format(href=link.get('href'))
                content = (content or '') + video

        yield Entry(
            published=published,
            updated=updated,
            uid=uid,
            permalink=permalink,
            retrieved=retrieved,
            title=p_entry.get('title'),
            content=content,
            content_cleaned=util.clean(content),
            author_name=p_entry.get('author_detail', {}).get('name')
            or default_author_name,
            author_url=p_entry.get('author_detail', {}).get('href')
            or default_author_url,
            author_photo=default_author_photo or fallback_photo(feed.origin))
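
# AUDIO_ENCLOSURE_TMPL and VIDEO_ENCLOSURE_TMPL are used above but not
# defined in this excerpt. Plausible minimal definitions (assumptions, not
# necessarily the project's actual markup) that render enclosures as HTML5
# players with a plain link as fallback; both must accept an {href} field:
AUDIO_ENCLOSURE_TMPL = ('<p><audio controls src="{href}">'
                        '<a href="{href}">audio</a></audio></p>')
VIDEO_ENCLOSURE_TMPL = ('<p><video controls src="{href}">'
                        '<a href="{href}">video</a></video></p>')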
def hentry_to_entry(hentry, feed, backfill, now):
    def normalize_datetime(dt):
        if (dt and hasattr(dt, 'year') and hasattr(dt, 'month')
                and hasattr(dt, 'day')):
            # make sure published is in UTC and strip the timezone
            if hasattr(dt, 'tzinfo') and dt.tzinfo:
                dt = dt.astimezone(datetime.timezone.utc).replace(
                    tzinfo=None)
            # convert datetime.date to datetime.datetime
            elif not hasattr(dt, 'hour'):
                dt = datetime.datetime(year=dt.year, month=dt.month,
                                       day=dt.day)
            # naive datetimes pass through unchanged
            return dt

    permalink = url = hentry.get('url')
    uid = hentry.get('uid') or url
    if not uid:
        return

    # hentry = mf2util.interpret(mf2py.Parser(url=url).to_dict(), url)
    # permalink = hentry.get('url') or url
    # uid = hentry.get('uid') or uid

    # TODO repost = next(iter(hentry.get('repost-of', [])), None)

    title = hentry.get('name')
    content = hentry.get('content')
    summary = hentry.get('summary')
    if not content and summary:
        content = '{}<br/><br/><a href="{}">Read more</a>'.format(
            summary, permalink)
    if not content and hentry.get('type') == 'entry':
        content = title
        title = None

    published = normalize_datetime(hentry.get('published'))
    updated = normalize_datetime(hentry.get('updated'))
    deleted = normalize_datetime(hentry.get('deleted'))

    # retrieved time is now unless we're backfilling old posts
    retrieved = now
    if backfill and published and published < retrieved:
        retrieved = published

    author = hentry.get('author', {})
    author_name = author.get('name')
    author_photo = author.get('photo')
    author_url = author.get('url')

    entry = Entry(
        uid=uid,
        retrieved=retrieved,
        permalink=permalink,
        published=published,
        updated=updated,
        deleted=deleted,
        title=title,
        content=content,
        content_cleaned=util.clean(content),
        author_name=author_name,
        author_photo=author_photo or (feed and fallback_photo(feed.origin)),
        author_url=author_url)

    # complex properties, convert from list of complex objects to a
    # list of URLs
    for prop in ('in-reply-to', 'like-of', 'repost-of'):
        values = hentry.get(prop)
        if values:
            entry.set_property(prop, [value['url'] for value in values
                                      if 'url' in value])

    # simple properties, just transfer them over wholesale
    for prop in ('syndication', 'location', 'photo'):
        value = hentry.get(prop)
        if value:
            entry.set_property(prop, value)

    if 'start-str' in hentry:
        entry.set_property('start', hentry.get('start-str'))
    if 'end-str' in hentry:
        entry.set_property('end', hentry.get('end-str'))

    # set a flag for events so we can show RSVP buttons
    if hentry.get('type') == 'event':
        entry.set_property('event', True)

    # does it look like a jam? (JAM_RE is a module-level regex defined
    # elsewhere in the module)
    plain = hentry.get('content-plain')
    if plain and JAM_RE.match(plain):
        entry.set_property('jam', True)

    current_app.logger.debug('entry properties %s', entry.properties)
    return entry
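
# A sketch of how hentry_to_entry might be driven for a single URL, following
# the commented-out mf2py/mf2util lines above. This is illustrative, not
# woodwind's actual call site; `feed` may be None, in which case no fallback
# author photo is applied.
import datetime
import mf2py
import mf2util

def fetch_hentry_as_entry(url, feed=None, backfill=False):
    # parse the page's microformats2 markup and interpret it as an h-entry
    parsed = mf2py.Parser(url=url).to_dict()
    hentry = mf2util.interpret(parsed, url)
    if hentry:
        return hentry_to_entry(hentry, feed, backfill,
                               datetime.datetime.utcnow())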
from config import Config
import sqlalchemy
import sqlalchemy.orm
from woodwind.models import Entry
from woodwind import util

engine = sqlalchemy.create_engine(Config.SQLALCHEMY_DATABASE_URI)
Session = sqlalchemy.orm.sessionmaker(bind=engine)

# add the new column; ignore the error if it already exists
try:
    engine.execute('alter table entry add column content_cleaned text')
except sqlalchemy.exc.DatabaseError:
    pass

# backfill content_cleaned for every existing entry
session = Session()
try:
    for entry in session.query(Entry).all():
        print('processing', entry.id)
        entry.content_cleaned = util.clean(entry.content)
    session.commit()
except Exception:
    session.rollback()
    raise
finally:
    session.close()
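
# For a very large entry table, the single commit above holds one
# long-running transaction. A batched variant (a sketch under the same
# assumptions as the script above, not part of the original migration):
def backfill_content_cleaned(batch_size=500):
    session = Session()
    try:
        entries = session.query(Entry).all()
        for i, entry in enumerate(entries, 1):
            entry.content_cleaned = util.clean(entry.content)
            if i % batch_size == 0:
                session.commit()  # keep each transaction short
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()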