Exemple #1
0
def find_or_insert_posts(posts, get_post_comments, find_or_insert_person,
                         get_content, get_user, get_replycount,
                         get_id, get_timestamp, is_reshare, source,
                         process_post=None, process_reply=None,
                         recentEvents=None, maxDays=None):
    """Generate each post that has a paper hashtag, adding to DB if needed.

    This is a generator.  The replies of a post are only fetched and stored
    when the caller requests the item *after* that post, and recentEvents
    is only filled once the loop over posts has finished, so the caller
    must exhaust the iterator for all side effects to happen.

    posts: iterable of dict-like post records.  Each record is modified
        in place ('author', 'text' and 'sigs' keys are set).
    get_post_comments: called with d['id'], returns an iterable of
        dict-like comment records.
    find_or_insert_person: called with a user ID, returns an object
        with an _id attribute.
    get_content: called with a post or comment record, returns its text.
    get_user: called with a post or comment record, returns the user ID
        handed to find_or_insert_person.
    get_replycount: called with a post record, returns a number; comments
        are only fetched when it is > 0.
    get_id: called with a post record; the result is passed to
        get_topicIDs.  NOTE(review): DB lookups and comment fetching use
        d['id'] / c['id'] directly rather than get_id -- confirm this is
        intended.
    get_timestamp: called with a post record, returns a value that can be
        subtracted from datetime.utcnow().
    is_reshare: called with a post record; a true result skips the post.
    source: passed through unchanged to get_topicIDs.
    process_post, process_reply: optional callbacks, called with the post
        (resp. comment) record before it is saved or updated.
    recentEvents: optional container with an appendleft() method; newly
        saved posts and replies are pushed onto it in ascending order of
        their published attribute.
    maxDays: if not None, iteration stops at the first post whose age in
        days exceeds this value.
    """
    now = datetime.utcnow()
    saveEvents = []
    topicsDict = subsDict = None
    haveSubs = False # subscriptions are loaded lazily, at most once
    for d in posts:
        post = None
        timeStamp = get_timestamp(d)
        if maxDays is not None and (now - timeStamp).days > maxDays:
            break # stops the whole scan, not just this post
        if is_reshare(d): # just a duplicate (reshared) post, so skip
            continue
        content = get_content(d)
        isRec = content.find('#recommend') >= 0 or \
                content.find('#mustread') >= 0
        if not isRec:
            try:
                post = core.Post(d['id'])
                if getattr(post, 'etag', None) == d.get('etag', ''):
                    yield post
                    continue # matches DB record, so nothing to do
            except KeyError:
                pass
        hashtagDict = get_hashtag_dict(content) # extract tags and IDs
        if post is None: # extract data for saving post to DB
            try:
                paper = hashtagDict['paper'][0] # link to first paper
            except KeyError:
                continue # no link to a paper, so nothing to save.
            userID = get_user(d)
            author = find_or_insert_person(userID)
            d['author'] = author._id
            if isRec: # see if rec already in DB
                try:
                    post = core.Recommendation((paper._id, author._id))
                    if getattr(post, 'etag', None) == d.get('etag', ''):
                        yield post
                        continue # matches DB record, so nothing to do
                except KeyError: # need to save new record to DB
                    klass = core.Recommendation
            else:
                klass = core.Post
        d['text'] = content
        if process_post:
            process_post(d)
        d['sigs'] = get_topicIDs(hashtagDict, get_id(d),
                                 timeStamp, source)
        if post is None: # save to DB
            post = klass(docData=d, parent=paper)
            if isRec:
                if not haveSubs:
                    topicsDict, subsDict = bulk.get_people_subs()
                    haveSubs = True
                bulk.deliver_rec(paper._id, d, topicsDict, subsDict)
            if recentEvents is not None: # add to monitor deque
                saveEvents.append(post)
        else: # update DB with new data and etag
            post.update(d)
        yield post
        # everything below runs only when the caller asks for the next item
        if get_replycount(d) > 0:
            for c in get_post_comments(d['id']):
                if process_reply:
                    process_reply(c)
                try:
                    r = core.Reply(c['id'])
                    if getattr(r, 'etag', None) != c.get('etag', ''):
                        # update DB record with latest data
                        r.update(dict(etag=c.get('etag', ''),
                                      text=get_content(c),
                                      updated=c.get('updated', '')))
                    continue # already stored in DB, no need to save
                except KeyError:
                    pass
                userID = get_user(c)
                author = find_or_insert_person(userID)
                c['author'] = author._id
                c['text'] = get_content(c)
                c['replyTo'] = d['id']
                r = core.Reply(docData=c, parent=post._parent_link)
                if recentEvents is not None: # add to monitor deque
                    saveEvents.append(r)

    if saveEvents and recentEvents is not None:
        # key= gives the same stable ordering as the old cmp-style sort
        # and also works on Python 3, where sort() takes no comparator.
        saveEvents.sort(key=lambda x: x.published)
        for r in saveEvents:
            recentEvents.appendleft(r) # add to monitor deque
Exemple #2
0
def find_or_insert_posts(posts, get_post_comments, find_or_insert_person,
                         get_content, get_user, get_replycount,
                         get_id, get_timestamp, is_reshare, source,
                         process_post=None, process_reply=None,
                         recentEvents=None, maxDays=None,
                         citationType='discuss', citationType2='discuss',
                         get_title=lambda x:x['title'],
                         spnetworkOnly=True):
    """Generate each post that has a paper hashtag, adding to DB if needed.

    This is a generator.  The replies of a post are only fetched and stored
    when the caller requests the item *after* that post, and recentEvents
    is only filled once the loop over posts has finished, so the caller
    must exhaust the iterator for all side effects to happen.

    posts: iterable of dict-like post records.  Each record is modified
        in place ('author', 'text', 'sigs', 'citationType' keys are set).
    get_post_comments: called with get_id(d), returns an iterable of
        dict-like comment records.
    find_or_insert_person: called with a user ID, returns an object
        with an _id attribute.
    get_content: called with a post or comment record, returns its text.
    get_user: called with a post or comment record, returns the user ID
        handed to find_or_insert_person.
    get_replycount: called with a post record, returns a number; comments
        are only fetched when it is > 0.
    get_id: called with a post or comment record, returns the ID used for
        DB lookup.
    get_timestamp: called with a post record, returns a value that can be
        subtracted from datetime.utcnow().
    is_reshare: called with a post record; a true result skips the post.
    source: passed through unchanged to get_topicIDs.
    process_post, process_reply: optional callbacks, called with the post
        (resp. comment) record before it is saved or updated.
    recentEvents: optional container with an appendleft() method; newly
        saved posts and replies are pushed onto it in ascending order of
        their published attribute.
    maxDays: if not None, iteration stops at the first post whose age in
        days exceeds this value.
    citationType: citation type recorded for posts that carry neither
        '#recommend' nor '#mustread'.
    citationType2: citation type passed to add_citations() for every
        paper after the first one in a newly saved post.
    get_title: accepted but not used by this function.
    spnetworkOnly: if true, posts whose text lacks '#spnetwork' are
        skipped.
    """
    now = datetime.utcnow()
    saveEvents = []
    topicsDict = subsDict = None
    haveSubs = False # subscriptions are loaded lazily, at most once
    for d in posts:
        post = None
        timeStamp = get_timestamp(d)
        if maxDays is not None and (now - timeStamp).days > maxDays:
            break # stops the whole scan, not just this post
        if is_reshare(d): # just a duplicate (reshared) post, so skip
            continue
        content = get_content(d)
        if spnetworkOnly and content.find('#spnetwork') < 0:
            continue # ignore posts lacking our spnetwork hashtag
        isRec = content.find('#recommend') >= 0 or \
                content.find('#mustread') >= 0
        try:
            post = core.Post(get_id(d))
            if getattr(post, 'etag', None) == d.get('etag', ''):
                yield post
                continue # matches DB record, so nothing to do
        except KeyError:
            pass
        hashtagDict = get_hashtag_dict(content) # extract tags and IDs
        if post is None: # extract data for saving post to DB
            try:
                papers = hashtagDict['paper']
                paper = papers[0] # link to first paper
            except KeyError:
                continue # no link to a paper, so nothing to save.
            userID = get_user(d)
            author = find_or_insert_person(userID)
            d['author'] = author._id
        d['text'] = content
        if process_post:
            process_post(d)
        d['sigs'] = get_topicIDs(hashtagDict, get_id(d),
                                 timeStamp, source)
        if isRec: # record rec type
            try:
                d['citationType'] = hashtagDict['rec'][0]
            except KeyError: # handle bad rec hashtag
                d['citationType'] = 'recommend'
        else: # use default citation type
            d['citationType'] = citationType
        if post is None: # save to DB
            post = core.Post(docData=d, parent=paper)
            if len(papers) > 1: # save 2ary citations
                post.add_citations(papers[1:], citationType2)
            if not haveSubs:
                topicsDict, subsDict = bulk.get_people_subs()
                haveSubs = True
            bulk.deliver_rec(paper._id, d, topicsDict, subsDict)
            if recentEvents is not None: # add to monitor deque
                saveEvents.append(post)
        else: # update DB with new data and etag
            post.update(d)
        yield post
        # everything below runs only when the caller asks for the next item
        if get_replycount(d) > 0:
            for c in get_post_comments(get_id(d)):
                if process_reply:
                    process_reply(c)
                try:
                    r = core.Reply(get_id(c))
                    if getattr(r, 'etag', None) != c.get('etag', ''):
                        # update DB record with latest data
                        r.update(dict(etag=c.get('etag', ''),
                                      text=get_content(c),
                                      updated=c.get('updated', '')))
                    continue # already stored in DB, no need to save
                except KeyError:
                    pass
                userID = get_user(c)
                author = find_or_insert_person(userID)
                c['author'] = author._id
                c['text'] = get_content(c)
                c['replyTo'] = get_id(d)
                if isRec: # record the type of post
                    c['sourcetype'] = 'rec'
                else:
                    c['sourcetype'] = 'post'
                r = core.Reply(docData=c, parent=post._parent_link)
                if recentEvents is not None: # add to monitor deque
                    saveEvents.append(r)

    if saveEvents and recentEvents is not None:
        # key= gives the same stable ordering as the old cmp-style sort
        # and also works on Python 3, where sort() takes no comparator.
        saveEvents.sort(key=lambda x: x.published)
        for r in saveEvents:
            recentEvents.appendleft(r) # add to monitor deque
Exemple #3
0
def find_or_insert_posts(posts,
                         get_post_comments,
                         find_or_insert_person,
                         get_content,
                         get_user,
                         get_replycount,
                         get_id,
                         get_timestamp,
                         is_reshare,
                         source,
                         process_post=None,
                         process_reply=None,
                         recentEvents=None,
                         maxDays=None):
    """Generate each post that has a paper hashtag, adding to DB if needed.

    This is a generator.  The replies of a post are only fetched and stored
    when the caller requests the item *after* that post, and recentEvents
    is only filled once the loop over posts has finished, so the caller
    must exhaust the iterator for all side effects to happen.

    posts: iterable of dict-like post records.  Each record is modified
        in place ('author', 'text' and 'sigs' keys are set).
    get_post_comments: called with d['id'], returns an iterable of
        dict-like comment records.
    find_or_insert_person: called with a user ID, returns an object
        with an _id attribute.
    get_content: called with a post or comment record, returns its text.
    get_user: called with a post or comment record, returns the user ID
        handed to find_or_insert_person.
    get_replycount: called with a post record, returns a number; comments
        are only fetched when it is > 0.
    get_id: called with a post record; the result is passed to
        get_topicIDs.  NOTE(review): DB lookups and comment fetching use
        d['id'] / c['id'] directly rather than get_id -- confirm this is
        intended.
    get_timestamp: called with a post record, returns a value that can be
        subtracted from datetime.utcnow().
    is_reshare: called with a post record; a true result skips the post.
    source: passed through unchanged to get_topicIDs.
    process_post, process_reply: optional callbacks, called with the post
        (resp. comment) record before it is saved or updated.
    recentEvents: optional container with an appendleft() method; newly
        saved posts and replies are pushed onto it in ascending order of
        their published attribute.
    maxDays: if not None, iteration stops at the first post whose age in
        days exceeds this value.
    """
    now = datetime.utcnow()
    saveEvents = []
    topicsDict = subsDict = None
    haveSubs = False  # subscriptions are loaded lazily, at most once
    for d in posts:
        post = None
        timeStamp = get_timestamp(d)
        if maxDays is not None and (now - timeStamp).days > maxDays:
            break  # stops the whole scan, not just this post
        if is_reshare(d):  # just a duplicate (reshared) post, so skip
            continue
        content = get_content(d)
        isRec = content.find('#recommend') >= 0 or \
                content.find('#mustread') >= 0
        if not isRec:
            try:
                post = core.Post(d['id'])
                if getattr(post, 'etag', None) == d.get('etag', ''):
                    yield post
                    continue  # matches DB record, so nothing to do
            except KeyError:
                pass
        hashtagDict = get_hashtag_dict(content)  # extract tags and IDs
        if post is None:  # extract data for saving post to DB
            try:
                paper = hashtagDict['paper'][0]  # link to first paper
            except KeyError:
                continue  # no link to a paper, so nothing to save.
            userID = get_user(d)
            author = find_or_insert_person(userID)
            d['author'] = author._id
            if isRec:  # see if rec already in DB
                try:
                    post = core.Recommendation((paper._id, author._id))
                    if getattr(post, 'etag', None) == d.get('etag', ''):
                        yield post
                        continue  # matches DB record, so nothing to do
                except KeyError:  # need to save new record to DB
                    klass = core.Recommendation
            else:
                klass = core.Post
        d['text'] = content
        if process_post:
            process_post(d)
        d['sigs'] = get_topicIDs(hashtagDict, get_id(d), timeStamp, source)
        if post is None:  # save to DB
            post = klass(docData=d, parent=paper)
            if isRec:
                if not haveSubs:
                    topicsDict, subsDict = bulk.get_people_subs()
                    haveSubs = True
                bulk.deliver_rec(paper._id, d, topicsDict, subsDict)
            if recentEvents is not None:  # add to monitor deque
                saveEvents.append(post)
        else:  # update DB with new data and etag
            post.update(d)
        yield post
        # everything below runs only when the caller asks for the next item
        if get_replycount(d) > 0:
            for c in get_post_comments(d['id']):
                if process_reply:
                    process_reply(c)
                try:
                    r = core.Reply(c['id'])
                    if getattr(r, 'etag', None) != c.get('etag', ''):
                        # update DB record with latest data
                        r.update(
                            dict(etag=c.get('etag', ''),
                                 text=get_content(c),
                                 updated=c.get('updated', '')))
                    continue  # already stored in DB, no need to save
                except KeyError:
                    pass
                userID = get_user(c)
                author = find_or_insert_person(userID)
                c['author'] = author._id
                c['text'] = get_content(c)
                c['replyTo'] = d['id']
                r = core.Reply(docData=c, parent=post._parent_link)
                if recentEvents is not None:  # add to monitor deque
                    saveEvents.append(r)

    if saveEvents and recentEvents is not None:
        # key= gives the same stable ordering as the old cmp-style sort
        # and also works on Python 3, where sort() takes no comparator.
        saveEvents.sort(key=lambda x: x.published)
        for r in saveEvents:
            recentEvents.appendleft(r)  # add to monitor deque
Exemple #4
0
def find_or_insert_posts(posts, get_post_comments, find_or_insert_person,
                         get_content, get_user, get_replycount,
                         get_id, get_timestamp, is_reshare, source,
                         process_post=None, process_reply=None,
                         recentEvents=None, maxDays=None,
                         citationType='discuss', citationType2='discuss',
                         get_title=lambda x:x['title'],
                         spnetworkOnly=True):
    """Generate each post that has a paper hashtag, adding to DB if needed.

    This is a generator.  The replies of a post are only fetched and stored
    when the caller requests the item *after* that post, and recentEvents
    is only filled once the loop over posts has finished, so the caller
    must exhaust the iterator for all side effects to happen.

    Besides inserting new posts, an already stored post whose etag changed
    is brought in line with its current text: it is deleted if it lost the
    '#spnetwork' tag (when spnetworkOnly is true), re-saved if its primary
    paper changed, and its secondary citations are added, updated or
    deleted to match.

    posts: iterable of dict-like post records.  Each record is modified
        in place ('author', 'text', 'sigs', 'citationType' keys are set).
    get_post_comments: called with get_id(d), returns an iterable of
        dict-like comment records.
    find_or_insert_person: called with a user ID, returns an object
        with an _id attribute.
    get_content: called with a post or comment record, returns its text.
    get_user: called with a post or comment record, returns the user ID
        handed to find_or_insert_person.
    get_replycount: called with a post record, returns a number; comments
        are only fetched when it is > 0.
    get_id: called with a post or comment record, returns the ID used for
        DB lookup.
    get_timestamp: called with a post record, returns a value that can be
        subtracted from datetime.utcnow().
    is_reshare: called with a post record; a true result skips the post.
    source: passed through unchanged to get_topicIDs.
    process_post, process_reply: optional callbacks, called with the post
        (resp. comment) record before it is saved or updated.
    recentEvents: optional container with an appendleft() method; newly
        saved posts and replies are pushed onto it in ascending order of
        their published attribute.
    maxDays: if not None, iteration stops at the first post whose age in
        days exceeds this value.
    citationType, citationType2, get_title: accepted but not used by this
        function; citation types are taken from the parsed post text.
    spnetworkOnly: if true, posts whose text lacks '#spnetwork' are
        skipped.
    """
    now = datetime.utcnow()
    saveEvents = []
    topicsDict = subsDict = None
    haveSubs = False # subscriptions are loaded lazily, at most once
    for d in posts:
        post = None
        timeStamp = get_timestamp(d)
        if maxDays is not None and (now - timeStamp).days > maxDays:
            break # stops the whole scan, not just this post
        if is_reshare(d): # just a duplicate (reshared) post, so skip
            continue
        content = get_content(d)
        try:
            post = core.Post(get_id(d))
            if getattr(post, 'etag', None) == d.get('etag', ''):
                yield post
                continue # matches DB record, so nothing to do
        except KeyError:
            pass
        if spnetworkOnly and content.find('#spnetwork') < 0:
            if post:
                post.delete() # remove old Post: no longer tagged!
            continue # ignore posts lacking our spnetwork hashtag
        # extract tags and IDs:
        citations, topics, primary = get_citations_types_and_topics(content)
        try:
            # a KeyError raised inside get_paper() is treated the same way
            # as a missing primary citation: the post is skipped
            primary_paper_ID = citations[primary]
            paper = get_paper(primary, primary_paper_ID[1])
        except KeyError:
            continue # no link to a paper, so nothing to save.
        if post and post.parent != paper: # changed primary binding!
            post.delete() # delete old binding
            post = None # must resave to new binding
        d['text'] = content
        if process_post:
            process_post(d)
        d['sigs'] = get_topicIDs(topics, get_id(d), timeStamp, source)
        d['citationType'] = citations[primary][0]
        oldCitations = {}
        if post is None: # save to DB
            userID = get_user(d)
            author = find_or_insert_person(userID)
            d['author'] = author._id
            post = core.Post(docData=d, parent=paper)
            if not haveSubs:
                topicsDict, subsDict = bulk.get_people_subs()
                haveSubs = True
            bulk.deliver_rec(paper._id, d, topicsDict, subsDict)
            if recentEvents is not None: # add to monitor deque
                saveEvents.append(post)
        else: # update DB with new data and etag
            post.update(d)
            for cit in getattr(post, 'citations', ()): # index old citations
                oldCitations[cit.parent] = cit
        # items() works on both Python 2 and 3 (iteritems() is 2-only)
        for ref, meta in citations.items(): # add / update new citations
            if ref != primary:
                paper2 = get_paper(ref, meta[1])
                try: # if already present, just update citationType if changed
                    cit = oldCitations[paper2]
                    if cit.citationType != meta[0]:
                        cit.update(dict(citationType=meta[0]))
                    del oldCitations[paper2] # don't treat as old citation
                except KeyError:
                    post.add_citations([paper2], meta[0])
        for cit in oldCitations.values():
            cit.delete() # delete citations no longer present in updated post
        yield post
        # everything below runs only when the caller asks for the next item
        if get_replycount(d) > 0:
            for c in get_post_comments(get_id(d)):
                if process_reply:
                    process_reply(c)
                try:
                    r = core.Reply(get_id(c))
                    if getattr(r, 'etag', None) != c.get('etag', ''):
                        # update DB record with latest data
                        r.update(dict(etag=c.get('etag', ''),
                                      text=get_content(c),
                                      updated=c.get('updated', '')))
                    continue # already stored in DB, no need to save
                except KeyError:
                    pass
                userID = get_user(c)
                author = find_or_insert_person(userID)
                c['author'] = author._id
                c['text'] = get_content(c)
                c['replyTo'] = get_id(d)
                r = core.Reply(docData=c, parent=post._parent_link)
                if recentEvents is not None: # add to monitor deque
                    saveEvents.append(r)

    if saveEvents and recentEvents is not None:
        # key= gives the same stable ordering as the old cmp-style sort
        # and also works on Python 3, where sort() takes no comparator.
        saveEvents.sort(key=lambda x: x.published)
        for r in saveEvents:
            recentEvents.appendleft(r) # add to monitor deque