def update_df():
    '''Load newly crawled events, then update the document frequency table'''
    tf_idf_api = StanfordTFIDFApi()
    # load newly crawled events
    unanalyzed_events = keyword_dao.load_unanalyzed_event()
    # compute incremental document frequency from the newly loaded events
    computed_df = tf_idf_api.compute_df([event['detail'] for event in unanalyzed_events])
    # update the df table
    keyword_dao.update_df_table(computed_df)
    # mark the loaded events as analyzed
    keyword_dao.mark_analyzed_event([event['id'] for event in unanalyzed_events])
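# Illustrative sketch only: one way the incremental document-frequency merge
# behind update_df_table() could look. The function name `merge_df` and the
# argument `existing_df` are hypothetical; compute_df() above is assumed to
# return a dict mapping each term to the number of new documents containing it.
def merge_df(existing_df, computed_df):
    '''Add the per-term counts of the new batch onto the stored df counts'''
    merged = dict(existing_df)
    for term, doc_count in computed_df.items():
        merged[term] = merged.get(term, 0) + doc_count
    return merged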
def keyword_extract():
    '''Extract keywords for newly crawled events, which have already been used to update df'''
    # load the pre-computed document frequency data
    df_cache = keyword_dao.load_df_table()
    # initialize the two clients used for NER and tf-idf
    ner_api = StanfordNERApi()
    tf_idf_api = StanfordTFIDFApi()
    # load events that have not been analyzed for keywords yet
    unparsed_events = keyword_dao.load_unextracted_events()
    extracted_events = []
    for event in unparsed_events:
        try:
            logger.info('Start to analyze event %s', event['source_url'])
            item = dict()
            item['id'] = event['id']
            item['source_url'] = event['source_url']
            detail = event['detail'].encode('UTF8') \
                if type(event['detail']) == unicode else event['detail']
            # extract keywords based on named entity recognition
            item['ner'] = ner_api.ner_groupby_ner(detail)
            # extract keywords based on tf-idf and part-of-speech
            item['tf'] = tf_idf_api.tf_idf_groupby_pos(detail, df_cache)
            extracted_events.append(item)
            logger.debug(json.dumps(item))
            logger.info('Analyze event %s successfully', event['source_url'])
        except Exception as e:
            logger.error(str(e), exc_info=True)
        time.sleep(1)  # avoid requesting the server too frequently
        # store extracted events in batches of 10
        if len(extracted_events) % 10 == 0 and len(extracted_events) != 0:
            keyword_dao.save_extracted_event(extracted_events)
            extracted_events = []
    # flush any remaining events that did not fill a full batch
    if extracted_events:
        keyword_dao.save_extracted_event(extracted_events)
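# A minimal usage sketch, assuming the two functions above are run as a
# periodic batch job: update_df() must run first so the df table already
# reflects the newly crawled events before keyword_extract() loads it as
# df_cache.
if __name__ == '__main__':
    update_df()        # refresh document frequencies with newly crawled events
    keyword_extract()  # then extract keywords using the refreshed df table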