def keyword_extract():
    '''Extract keywords for newly crawled events and save them in batches.

    Processes the events returned by ``keyword_dao.load_unextracted_events()``
    (events that have not been analyzed for keywords yet; they have already
    been used to update the document-frequency table).  For every event two
    keyword groups are computed from its ``detail`` text:

    * ``ner`` -- result of ``StanfordNERApi.ner_groupby_ner``
    * ``tf``  -- result of ``StanfordTFIDFApi.tf_idf_groupby_pos``, computed
      against the cached document-frequency table

    Results are written through ``keyword_dao.save_extracted_event`` in
    batches of 10; whatever is left after the loop is saved as a final,
    smaller batch.  A failure while analyzing one event is logged with its
    traceback and that event is skipped, so one bad event does not abort the
    whole run.
    '''
    batch_size = 10

    # load pre-computed document frequency data
    df_cache = keyword_dao.load_df_table()

    # initialize two instances for computing ner and tf-idf
    ner_api = StanfordNERApi()
    tf_idf_api = StanfordTFIDFApi()

    # load events that haven't been analyzed for keywords yet
    unparsed_events = keyword_dao.load_unextracted_events()

    extracted_events = []
    for event in unparsed_events:
        try:
            logger.info('Start to analyze event %s', event['source_url'])

            # Python 2: unicode text is encoded to a UTF-8 byte string before
            # it is handed to the extraction APIs; byte strings pass through.
            detail = event['detail']
            if isinstance(detail, unicode):
                detail = detail.encode('UTF8')

            item = {
                'id': event['id'],
                'source_url': event['source_url'],
                # keywords based on named entity recognition
                'ner': ner_api.ner_groupby_ner(detail),
                # keywords based on tf-idf and pos
                'tf': tf_idf_api.tf_idf_groupby_pos(detail, df_cache),
            }
            extracted_events.append(item)

            logger.debug(json.dumps(item))
            logger.info('Analyze event %s successfully', event['source_url'])
        except Exception as err:
            # Best effort: log the failure (with traceback) and move on to the
            # next event.  %r is used because ``err.message`` is deprecated
            # and is empty for exceptions carrying zero or several arguments.
            logger.error('Failed to analyze event: %r', err, exc_info=True)

        # NOTE(review): placed at loop level (one pause per event) based on
        # the comment below -- confirm against the original indentation.
        time.sleep(1)  # cannot request the server too frequently

        # store extracted events as a batch of 10; the list is reset after
        # every save, so it never grows past batch_size
        if len(extracted_events) >= batch_size:
            keyword_dao.save_extracted_event(extracted_events)
            extracted_events = []

    if extracted_events:
        # save residue to db
        keyword_dao.save_extracted_event(extracted_events)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG,
                        format="%(levelname)s - %(asctime)s - %(name)s - %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",)
    update_df()
    keyword_extract()
    # text = '''
    # Sunday’s attack in Lahore was the deadliest bombing targeting Pakistan’s Christians since more than 100 parishioners
    # were killed at Peshawar’s All Saints Church in August 2013. The militant Islamist group Jamaat-ul-Ahrar,
    # a vicious offshoot of the Pakistani Taliban, claimed responsibility for the attack. The same group was responsible for the
    # Youhanabad attacks in Lahore, a year ago. Ehsanullah Ehsan, a spokesperson for the group, which sees all non-Muslims as potential
    # targets, said the attack was calculated to show that they still retained the ability to strike deep into Pakistan’s heartlands —