コード例 #1
0
def keyword_extract():
    '''Extract keywords for newly crawled events and save them in batches.

    The events handled here are expected to have already been used to update
    the document frequency (df) table. For every event that has not been
    analyzed yet, two keyword sets are computed from its ``detail`` text:

    * ``ner`` -- result of ``ner_api.ner_groupby_ner(detail)``
    * ``tf``  -- result of ``tf_idf_api.tf_idf_groupby_pos(detail, df_cache)``

    Each result is stored as a dict that also carries the event's ``id`` and
    ``source_url``, and the dicts are written through
    ``keyword_dao.save_extracted_event`` in batches.

    A failure while analyzing a single event is logged and that event is
    skipped. Errors raised while loading the input data or while saving a
    batch are not caught here and propagate to the caller.
    '''
    batch_size = 10        # number of extracted events written per save call
    request_interval = 1   # seconds to wait after each event

    # load pre-computed document frequency data
    df_cache = keyword_dao.load_df_table()

    # initialize two instances for computing ner and tf-idf
    ner_api = StanfordNERApi()
    tf_idf_api = StanfordTFIDFApi()

    # load the events that have not been analyzed for keywords yet
    unparsed_events = keyword_dao.load_unextracted_events()

    extracted_events = []
    for event in unparsed_events:
        try:
            logger.info('Start to analyze event %s', event['source_url'])

            item = dict()
            item['id'] = event['id']
            item['source_url'] = event['source_url']

            # the analyzers are fed a UTF-8 byte string, so encode unicode text
            detail = event['detail']
            if isinstance(detail, unicode):
                detail = detail.encode('UTF8')

            # extract keywords based on named entity recognition
            item['ner'] = ner_api.ner_groupby_ner(detail)

            # extract keywords based on tf-idf, grouped by part of speech
            item['tf'] = tf_idf_api.tf_idf_groupby_pos(detail, df_cache)

            extracted_events.append(item)

            logger.debug(json.dumps(item))
            logger.info('Analyze event %s successfully', event['source_url'])
        except Exception as e:
            # Best effort: one bad event must not abort the whole run.
            # logger.exception logs at ERROR level with the traceback attached.
            logger.exception(e)

        time.sleep(request_interval)  # the server must not be requested too frequently

        # store extracted events once a full batch has been collected
        if len(extracted_events) >= batch_size:
            keyword_dao.save_extracted_event(extracted_events)
            extracted_events = []

    # save the residue (a final, partially filled batch) so no event is lost
    if extracted_events:
        keyword_dao.save_extracted_event(extracted_events)
コード例 #2
0
            extracted_events.append(item)
            
            logger.debug(json.dumps(item))
            logger.info('Analyze event %s successfully', event['source_url'])
        except Exception, e:
            logger.error(e.message, exc_info=True)
            
        time.sleep(1) # cannot request server too frequent
        
        # store extracted events as a batch of 10         
        if len(extracted_events) % 10 == 0 and len(extracted_events) != 0:
            keyword_dao.save_extracted_event(extracted_events)
            extracted_events = []
             
    if len(extracted_events) > 0: #save residue to db
        keyword_dao.save_extracted_event(extracted_events)

if __name__ == '__main__':
    # Script entry point: configure logging at DEBUG level, then run the
    # two pipeline steps in order.
    log_options = {
        'level': logging.DEBUG,
        'format': "%(levelname)s - %(asctime)s - %(name)s - %(message)s",
        'datefmt': "%Y-%m-%d %H:%M:%S",
    }
    logging.basicConfig(**log_options)

    # update_df() runs first: keyword_extract() works on events that have
    # already been used to update df.
    update_df()
    keyword_extract()
       
# text = '''
#         Sunday’s attack in Lahore was the deadliest bombing targeting Pakistan’s Christians since more than 100 parishioners 
#         were killed at Peshawar’s All Saints Church in August 2013. The militant Islamist group Jamaat-ul-Ahrar, 
#         a vicious offshoot of the Pakistani Taliban, claimed responsibility for the attack. The same group was responsible for the 
#         Youhanabad attacks in Lahore, a year ago. Ehsanullah Ehsan, a spokesperson for the group, which sees all non-Muslims as potential 
#         targets, said the attack was calculated to show that they still retained the ability to strike deep into Pakistan’s heartlands —