import json
import logging
import time

# keyword_dao, StanfordNERApi, and StanfordTFIDFApi are project-local and
# assumed importable from the surrounding package (paths not shown here).

logger = logging.getLogger(__name__)


def update_df():
    '''Load newly crawled events, then update the document frequency table.'''
    tf_idf_api = StanfordTFIDFApi()
    
    # load newly crawled events
    unanalyzed_events = keyword_dao.load_unanalyzed_event()
    
    # compute incremental document frequency from the newly loaded events
    computed_df = tf_idf_api.compute_df([event['detail'] for event in unanalyzed_events])
    
    # update df table
    keyword_dao.update_df_table(computed_df)

    # mark the loaded events as analyzed
    keyword_dao.mark_analyzed_event([event['id'] for event in unanalyzed_events])


def keyword_extract():
    '''Extract keywords from newly crawled events whose document frequencies
    have already been folded into the df table by update_df().'''
    # load pre-computed document frequency data
    df_cache = keyword_dao.load_df_table()
    
    # initialize API clients for NER and TF-IDF computation
    ner_api = StanfordNERApi()
    tf_idf_api = StanfordTFIDFApi()
    
    # load events that haven't had keywords extracted yet
    unparsed_events = keyword_dao.load_unextracted_events()
    
    extracted_events = []
    for event in unparsed_events:
        try:
            logger.info('Start to analyze event %s', event['source_url'])
            
            item = dict()
            item['id'] = event['id']
            item['source_url'] = event['source_url']
            # Python 2: encode unicode text to UTF-8 bytes before sending to the APIs
            detail = event['detail'].encode('utf-8') \
                if isinstance(event['detail'], unicode) else event['detail']
            
            # extract keywords based on named entity recognition
            item['ner'] = ner_api.ner_groupby_ner(detail)
            
            # extract keywords based on TF-IDF scores grouped by part-of-speech tag
            item['tf'] = tf_idf_api.tf_idf_groupby_pos(detail, df_cache)
            
            extracted_events.append(item)
            
            logger.debug(json.dumps(item))
            logger.info('Analyzed event %s successfully', event['source_url'])
        except Exception as e:
            logger.error(str(e), exc_info=True)
            
        time.sleep(1)  # throttle requests; avoid hitting the NLP server too frequently

        # store extracted events in batches of 10
        if len(extracted_events) >= 10:
            keyword_dao.save_extracted_event(extracted_events)
            extracted_events = []

    # flush the remaining partial batch so trailing events are not dropped
    if extracted_events:
        keyword_dao.save_extracted_event(extracted_events)