from datetime import datetime
# core, bulk and the tag-parsing helpers (get_hashtag_dict, get_topicIDs,
# get_citations_types_and_topics, get_paper) are module-level dependencies
# defined elsewhere in this package.


def find_or_insert_posts(posts, get_post_comments, find_or_insert_person,
                         get_content, get_user, get_replycount, get_id,
                         get_timestamp, is_reshare, source,
                         process_post=None, process_reply=None,
                         recentEvents=None, maxDays=None):
    'generate each post that has a paper hashtag, adding to DB if needed'
    now = datetime.utcnow()
    saveEvents = []
    for d in posts:
        post = None
        timeStamp = get_timestamp(d)
        if maxDays is not None and (now - timeStamp).days > maxDays:
            break
        if is_reshare(d):  # just a duplicate (reshared) post, so skip
            continue
        content = get_content(d)
        isRec = content.find('#recommend') >= 0 or \
                content.find('#mustread') >= 0
        if not isRec:
            try:
                post = core.Post(d['id'])
                if getattr(post, 'etag', None) == d.get('etag', ''):
                    yield post
                    continue  # matches DB record, so nothing to do
            except KeyError:
                pass
        hashtagDict = get_hashtag_dict(content)  # extract tags and IDs
        if post is None:  # extract data for saving post to DB
            try:
                paper = hashtagDict['paper'][0]  # link to first paper
            except KeyError:
                continue  # no link to a paper, so nothing to save.
            userID = get_user(d)
            author = find_or_insert_person(userID)
            d['author'] = author._id
            if isRec:  # see if rec already in DB
                try:
                    post = core.Recommendation((paper._id, author._id))
                    if getattr(post, 'etag', None) == d.get('etag', ''):
                        yield post
                        continue  # matches DB record, so nothing to do
                except KeyError:  # need to save new record to DB
                    klass = core.Recommendation
            else:
                klass = core.Post
        d['text'] = content
        if process_post:
            process_post(d)
        d['sigs'] = get_topicIDs(hashtagDict, get_id(d), timeStamp, source)
        if post is None:  # save to DB
            post = klass(docData=d, parent=paper)
            if isRec:
                try:
                    topicsDict
                except NameError:
                    topicsDict, subsDict = bulk.get_people_subs()
                bulk.deliver_rec(paper._id, d, topicsDict, subsDict)
            if recentEvents is not None:  # add to monitor deque
                saveEvents.append(post)
        else:  # update DB with new data and etag
            post.update(d)
        yield post
        if get_replycount(d) > 0:
            for c in get_post_comments(d['id']):
                if process_reply:
                    process_reply(c)
                try:
                    r = core.Reply(c['id'])
                    if getattr(r, 'etag', None) != c.get('etag', ''):
                        # update DB record with latest data
                        r.update(dict(etag=c.get('etag', ''),
                                      text=get_content(c),
                                      updated=c.get('updated', '')))
                    continue  # already stored in DB, no need to save
                except KeyError:
                    pass
                userID = get_user(c)
                author = find_or_insert_person(userID)
                c['author'] = author._id
                c['text'] = get_content(c)
                c['replyTo'] = d['id']
                r = core.Reply(docData=c, parent=post._parent_link)
                if recentEvents is not None:  # add to monitor deque
                    saveEvents.append(r)
    if saveEvents and recentEvents is not None:
        saveEvents.sort(lambda x, y: cmp(x.published, y.published))
        for r in saveEvents:
            recentEvents.appendleft(r)  # add to monitor deque
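
# --- Wiring sketch (hypothetical; not part of the module) ---
# The callback parameters above define an adapter contract, so the same
# generator can scan any feed. This sketch wires them for an invented
# dict-based schema ('object', 'actor', 'published', ...); a real adapter
# would map its own platform's field names and supply a real
# find_or_insert_person and comment fetcher. Running it still requires the
# surrounding module's core/bulk DB layer.
def demo_scan(feedPosts, find_or_insert_person):
    'drive find_or_insert_posts() over an in-memory list of post dicts'
    def get_content(d):
        return d['object']['content']
    def get_user(d):
        return d['actor']['id']
    def get_id(d):
        return d['id']
    def get_replycount(d):
        return d['object']['replies']['totalItems']
    def is_reshare(d):
        return d.get('verb') == 'share'
    def get_timestamp(d):  # assumes an ISO 8601 'published' field
        return datetime.strptime(d['published'][:19], '%Y-%m-%dT%H:%M:%S')
    def get_post_comments(postID):  # a real adapter would query its API
        return iter(())
    return list(find_or_insert_posts(feedPosts, get_post_comments,
                                     find_or_insert_person, get_content,
                                     get_user, get_replycount, get_id,
                                     get_timestamp, is_reshare,
                                     source='demoSource', maxDays=30))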
def find_or_insert_posts(posts, get_post_comments, find_or_insert_person,
                         get_content, get_user, get_replycount, get_id,
                         get_timestamp, is_reshare, source,
                         process_post=None, process_reply=None,
                         recentEvents=None, maxDays=None,
                         citationType='discuss', citationType2='discuss',
                         get_title=lambda x: x['title'], spnetworkOnly=True):
    'generate each post that has a paper hashtag, adding to DB if needed'
    now = datetime.utcnow()
    saveEvents = []
    for d in posts:
        post = None
        timeStamp = get_timestamp(d)
        if maxDays is not None and (now - timeStamp).days > maxDays:
            break
        if is_reshare(d):  # just a duplicate (reshared) post, so skip
            continue
        content = get_content(d)
        if spnetworkOnly and content.find('#spnetwork') < 0:
            continue  # ignore posts lacking our spnetwork hashtag
        isRec = content.find('#recommend') >= 0 or \
                content.find('#mustread') >= 0
        try:
            post = core.Post(get_id(d))
            if getattr(post, 'etag', None) == d.get('etag', ''):
                yield post
                continue  # matches DB record, so nothing to do
        except KeyError:
            pass
        hashtagDict = get_hashtag_dict(content)  # extract tags and IDs
        if post is None:  # extract data for saving post to DB
            try:
                papers = hashtagDict['paper']
                paper = papers[0]  # link to first paper
            except KeyError:
                continue  # no link to a paper, so nothing to save.
            userID = get_user(d)
            author = find_or_insert_person(userID)
            d['author'] = author._id
        d['text'] = content
        if process_post:
            process_post(d)
        d['sigs'] = get_topicIDs(hashtagDict, get_id(d), timeStamp, source)
        if isRec:  # record rec type
            try:
                d['citationType'] = hashtagDict['rec'][0]
            except KeyError:  # handle bad rec hashtag
                d['citationType'] = 'recommend'
        else:  # use default citation type
            d['citationType'] = citationType
        if post is None:  # save to DB
            post = core.Post(docData=d, parent=paper)
            if len(papers) > 1:  # save 2ary citations
                post.add_citations(papers[1:], citationType2)
            try:
                topicsDict
            except NameError:
                topicsDict, subsDict = bulk.get_people_subs()
            bulk.deliver_rec(paper._id, d, topicsDict, subsDict)
            if recentEvents is not None:  # add to monitor deque
                saveEvents.append(post)
        else:  # update DB with new data and etag
            post.update(d)
        yield post
        if get_replycount(d) > 0:
            for c in get_post_comments(get_id(d)):
                if process_reply:
                    process_reply(c)
                try:
                    r = core.Reply(get_id(c))
                    if getattr(r, 'etag', None) != c.get('etag', ''):
                        # update DB record with latest data
                        r.update(dict(etag=c.get('etag', ''),
                                      text=get_content(c),
                                      updated=c.get('updated', '')))
                    continue  # already stored in DB, no need to save
                except KeyError:
                    pass
                userID = get_user(c)
                author = find_or_insert_person(userID)
                c['author'] = author._id
                c['text'] = get_content(c)
                c['replyTo'] = get_id(d)
                if isRec:  # record the type of post
                    c['sourcetype'] = 'rec'
                else:
                    c['sourcetype'] = 'post'
                r = core.Reply(docData=c, parent=post._parent_link)
                if recentEvents is not None:  # add to monitor deque
                    saveEvents.append(r)
    if saveEvents and recentEvents is not None:
        saveEvents.sort(lambda x, y: cmp(x.published, y.published))
        for r in saveEvents:
            recentEvents.appendleft(r)  # add to monitor deque
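
# --- Sketch: one plausible shape for get_hashtag_dict() (hypothetical) ---
# The versions above rely on a module-level get_hashtag_dict() that maps a
# post's text to categorized tags, keyed 'paper' and 'rec' as used above.
# This standalone sketch only illustrates the categorization; the real
# helper also resolves each paper reference to a DB object (with _id etc.),
# which is omitted here, and the 'topic' key is an assumption.
import re

def sketch_hashtag_dict(content):
    'categorize paper IDs and hashtags found in a post text'
    d = {}
    for paperID in re.findall(r'arXiv:[\w.-]+|doi:\S+', content):
        d.setdefault('paper', []).append(paperID)  # real code: DB objects
    for tag in re.findall(r'#(\w+)', content):
        if tag in ('recommend', 'mustread'):
            d.setdefault('rec', []).append(tag)
        elif tag != 'spnetwork':
            d.setdefault('topic', []).append(tag)
    return d

# sketch_hashtag_dict('#spnetwork arXiv:1234.5678 #recommend #cosmology')
# -> {'paper': ['arXiv:1234.5678'], 'rec': ['recommend'],
#     'topic': ['cosmology']}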
def find_or_insert_posts(posts, get_post_comments, find_or_insert_person,
                         get_content, get_user, get_replycount, get_id,
                         get_timestamp, is_reshare, source,
                         process_post=None, process_reply=None,
                         recentEvents=None, maxDays=None,
                         citationType='discuss', citationType2='discuss',
                         get_title=lambda x: x['title'], spnetworkOnly=True):
    'generate each post that has a paper hashtag, adding to DB if needed'
    now = datetime.utcnow()
    saveEvents = []
    for d in posts:
        post = None
        timeStamp = get_timestamp(d)
        if maxDays is not None and (now - timeStamp).days > maxDays:
            break
        if is_reshare(d):  # just a duplicate (reshared) post, so skip
            continue
        content = get_content(d)
        try:
            post = core.Post(get_id(d))
            if getattr(post, 'etag', None) == d.get('etag', ''):
                yield post
                continue  # matches DB record, so nothing to do
        except KeyError:
            pass
        if spnetworkOnly and content.find('#spnetwork') < 0:
            if post:
                post.delete()  # remove old Post: no longer tagged!
            continue  # ignore posts lacking our spnetwork hashtag
        # extract tags and IDs:
        citations, topics, primary = get_citations_types_and_topics(content)
        try:
            primary_paper_ID = citations[primary]
            paper = get_paper(primary, primary_paper_ID[1])
        except KeyError:
            continue  # no link to a paper, so nothing to save.
        if post and post.parent != paper:  # changed primary binding!
            post.delete()  # delete old binding
            post = None  # must resave to new binding
        d['text'] = content
        if process_post:
            process_post(d)
        d['sigs'] = get_topicIDs(topics, get_id(d), timeStamp, source)
        d['citationType'] = citations[primary][0]
        oldCitations = {}
        if post is None:  # save to DB
            userID = get_user(d)
            author = find_or_insert_person(userID)
            d['author'] = author._id
            post = core.Post(docData=d, parent=paper)
            try:
                topicsDict
            except NameError:
                topicsDict, subsDict = bulk.get_people_subs()
            bulk.deliver_rec(paper._id, d, topicsDict, subsDict)
            if recentEvents is not None:  # add to monitor deque
                saveEvents.append(post)
        else:  # update DB with new data and etag
            post.update(d)
            for c in getattr(post, 'citations', ()):  # index old citations
                oldCitations[c.parent] = c
        for ref, meta in citations.iteritems():  # add / update new citations
            if ref != primary:
                paper2 = get_paper(ref, meta[1])
                try:  # if already present, just update citationType if changed
                    c = oldCitations[paper2]
                    if c.citationType != meta[0]:
                        c.update(dict(citationType=meta[0]))
                    del oldCitations[paper2]  # don't treat as old citation
                except KeyError:
                    post.add_citations([paper2], meta[0])
        for c in oldCitations.values():
            c.delete()  # delete citations no longer present in updated post
        yield post
        if get_replycount(d) > 0:
            for c in get_post_comments(get_id(d)):
                if process_reply:
                    process_reply(c)
                try:
                    r = core.Reply(get_id(c))
                    if getattr(r, 'etag', None) != c.get('etag', ''):
                        # update DB record with latest data
                        r.update(dict(etag=c.get('etag', ''),
                                      text=get_content(c),
                                      updated=c.get('updated', '')))
                    continue  # already stored in DB, no need to save
                except KeyError:
                    pass
                userID = get_user(c)
                author = find_or_insert_person(userID)
                c['author'] = author._id
                c['text'] = get_content(c)
                c['replyTo'] = get_id(d)
                r = core.Reply(docData=c, parent=post._parent_link)
                if recentEvents is not None:  # add to monitor deque
                    saveEvents.append(r)
    if saveEvents and recentEvents is not None:
        saveEvents.sort(lambda x, y: cmp(x.published, y.published))
        for r in saveEvents:
            recentEvents.appendleft(r)  # add to monitor deque
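
# --- Sketch: the citation reconciliation above, on plain dicts ---
# get_citations_types_and_topics() is expected to return (citations, topics,
# primary), where citations maps each paper reference to a tuple whose first
# element is the citationType and whose second is fed to get_paper() as a
# lookup key. This hypothetical, standalone replay of the old-vs-new diff
# keys everything by reference string for simplicity (the code above keys
# old citations by resolved paper object), so the semantics can be checked
# without the DB layer.
def diff_citations(oldCitations, newCitations, primary):
    'return (toAdd, toUpdate, toDelete) mirroring the reconciliation loop'
    toAdd, toUpdate = [], []
    old = dict(oldCitations)  # ref -> citationType, copied so we can consume it
    for ref, meta in newCitations.items():
        if ref == primary:
            continue  # primary binding is carried by the Post itself
        newType = meta[0]
        if ref in old:
            if old[ref] != newType:
                toUpdate.append((ref, newType))
            del old[ref]  # don't treat as stale
        else:
            toAdd.append((ref, newType))
    return toAdd, toUpdate, list(old)  # leftovers are stale, to delete

# Example: a re-edited post dropped doi:10.1/x and retagged arXiv:222
# diff_citations({'doi:10.1/x': 'discuss', 'arXiv:222': 'discuss'},
#                {'arXiv:111': ('discuss', 'id111'),
#                 'arXiv:222': ('mustread', 'id222')}, primary='arXiv:111')
# -> ([], [('arXiv:222', 'mustread')], ['doi:10.1/x'])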