def process(self, duplicate_removal=False):
    """Poll the Events table forever and hand upcoming Beijing events on.

    Each pass calls ``mysleep`` and then walks up to 10000 rows whose
    ``status`` is '0' and whose ``city`` is 'beijing', ordered by date and
    time. Every visited row gets a new status and is committed on its own:

    * '2' - ``is_num(fee)`` or ``validate_time(time)`` rejected the row
    * '1' - the event start is not later than the current Asia/Shanghai
      time, or the row was passed on for storage / similarity checking
    * '3' - ``getDetailedInfo(place)`` answered with ``status == 1``
    * '4' - ``addToLeleRepository`` raised while duplicate removal is off

    The loop has no exit condition, so the method only ends when an
    exception propagates out of it.

    Args:
        duplicate_removal: when truthy, Solr is searched for events with
            the same area code, date and time; a row with no hit goes to
            ``addToLeleRepository`` and any other row goes to
            ``createWeixiaoSimTask``. When falsy (the default) every row
            goes straight to ``addToLeleRepository``.
    """
    # Local import: only the failure path below needs it.
    import traceback

    # Every print below takes one parenthesised argument, so it produces the
    # same output whether ``print`` is a statement or a function.
    print('processing...')
    session = self.Session()
    sleepcount = 0
    while True:
        # The value given to mysleep grows by one per pass and drops back to
        # zero as soon as a pass finds at least one row to work on.
        mysleep(sleepcount)
        sleepcount += 1

        pending = session.query(Events).filter(
            Events.status == '0').filter(
            Events.city == 'beijing').order_by(
            Events.date, Events.time).limit(10000)
        for instance in pending:
            sleepcount = 0

            source = instance.source
            title = instance.title
            desc = instance.desc
            date_text = instance.date
            category = instance.category
            time_text = instance.time
            place = instance.place
            fee = instance.fee
            feelist = instance.feelist
            imageurl = instance.image
            originurl = instance.link

            # %-formatting instead of '+' so that a NULL or non-string
            # column is reported and skipped rather than raising TypeError.
            if not is_num(fee):
                instance.status = '2'
                print('Note: fee (%s) is strange. So we will skip it...'
                      % (fee,))
                session.commit()
                continue

            # determine if the event is obsolete one
            year, month, day = get_date_detail(date_text)
            if not validate_time(time_text):
                instance.status = '2'
                print('Note: time (%s) is strange. So we will skip it...'
                      % (time_text,))
                session.commit()
                continue
            hour, minute = get_time_detail(time_text)

            starts_at = build_datetime(year, month, day, hour, minute)
            now_in_china = datetime.now(pytz.timezone('Asia/Shanghai'))
            # NOTE(review): the two moments are compared as ISO-8601 text,
            # not as datetime objects - confirm that build_datetime yields
            # values for which this ordering is the intended one.
            if starts_at.isoformat(' ') <= now_in_china.isoformat(' '):
                print('[WeixiaoSim : %sprocessing: %s with date %s %s'
                      ' is obsolete. So we will skip it...'
                      % (get_current_time_str(), title, date_text, time_text))
                instance.status = '1'
                session.commit()
                continue

            loc_details = getDetailedInfo(place)
            if loc_details['status'] == 1:
                # FIXME - put this strange address into TBD_address table
                print('[WeixiaoSim : %sprocessing: %s with place (%s)'
                      ' is strange. So we will skip it...'
                      % (get_current_time_str(), title, place))
                instance.status = '3'
                session.commit()
                continue

            # wrap info into potentialItem
            potentialItem = {
                'source': to_unicode_or_bust(source),
                'title': to_unicode_or_bust(title),
                'desc': to_unicode_or_bust(desc),
                'category': to_unicode_or_bust(category),
                'date': to_unicode_or_bust(date_text),
                'time': to_unicode_or_bust(time_text),
                'place': to_unicode_or_bust(place),
                'fee': to_unicode_or_bust(fee),
                'feelist': to_unicode_or_bust(feelist),
                'imageurl': to_unicode_or_bust(imageurl),
                'originurl': to_unicode_or_bust(originurl),
            }
            for key in ('formatted_address', 'province', 'city', 'areaname',
                        'areacode', 'longitude', 'latitude'):
                potentialItem[key] = to_unicode_or_bust(loc_details[key])

            # label this item as analyzed in the table of db - lelespider
            instance.status = '1'
            if duplicate_removal:
                # The search client and query are only built when the
                # feature is on. Candidates (Q1) share area code, date and
                # time with this event.
                # FIXME: matching on title/description keywords is not
                # provided yet.
                searchengine = Solr()
                q_areacode = 'areacode:' + loc_details['areacode']
                q_eventdate = 'eventdate:"' + date_text + '"'
                q_eventtime = 'eventtime:"' + time_text + '"'
                query = {
                    'q': (q_areacode.encode('utf-8') + ' AND ' +
                          q_eventdate.encode('utf-8') + ' AND ' +
                          q_eventtime.encode('utf-8')),
                }
                Q1 = searchengine.process(query)
                if len(Q1) == 0:
                    # no candidate: regard this item as new
                    self.addToLeleRepository(potentialItem)
                else:
                    # otherwise create a WeixiaoSim task to decide whether
                    # it is a new item or not
                    self.createWeixiaoSimTask(potentialItem, Q1)
            else:
                print('no duplicate removal feature ...')
                try:
                    self.addToLeleRepository(potentialItem)
                except Exception:
                    # Exception, not a bare except: Ctrl-C and SystemExit
                    # must still be able to stop this endless loop.
                    print('exception happening')
                    traceback.print_exc()
                    instance.status = '4'

            session.commit()
def process(self, duplicate_removal=False):
    """Poll the Events table forever and hand upcoming Beijing events on.

    Each pass calls ``mysleep`` and then walks up to 10000 rows whose
    ``status`` is '0' and whose ``city`` is 'beijing', ordered by date and
    time. Every visited row gets a new status and is committed on its own:

    * '2' - ``is_num(fee)`` or ``validate_time(time)`` rejected the row
    * '1' - the event start is not later than the current Asia/Shanghai
      time, or the row was passed on for storage / similarity checking
    * '3' - ``getDetailedInfo(place)`` answered with ``status == 1``
    * '4' - ``addToLeleRepository`` raised while duplicate removal is off

    The loop has no exit condition, so the method only ends when an
    exception propagates out of it.

    Args:
        duplicate_removal: when truthy, Solr is searched for events with
            the same area code, date and time; a row with no hit goes to
            ``addToLeleRepository`` and any other row goes to
            ``createWeixiaoSimTask``. When falsy (the default) every row
            goes straight to ``addToLeleRepository``.
    """
    # Local import: only the failure path below needs it.
    import traceback

    # Every print below takes one parenthesised argument, so it produces the
    # same output whether ``print`` is a statement or a function.
    print('processing...')
    session = self.Session()
    sleepcount = 0
    while True:
        # The value given to mysleep grows by one per pass and drops back to
        # zero as soon as a pass finds at least one row to work on.
        mysleep(sleepcount)
        sleepcount += 1

        pending = session.query(Events).filter(
            Events.status == '0').filter(
            Events.city == 'beijing').order_by(
            Events.date, Events.time).limit(10000)
        for instance in pending:
            sleepcount = 0

            source = instance.source
            title = instance.title
            desc = instance.desc
            date_text = instance.date
            category = instance.category
            time_text = instance.time
            place = instance.place
            fee = instance.fee
            feelist = instance.feelist
            imageurl = instance.image
            originurl = instance.link

            # %-formatting instead of '+' so that a NULL or non-string
            # column is reported and skipped rather than raising TypeError.
            if not is_num(fee):
                instance.status = '2'
                print('Note: fee (%s) is strange. So we will skip it...'
                      % (fee,))
                session.commit()
                continue

            # determine if the event is obsolete one
            year, month, day = get_date_detail(date_text)
            if not validate_time(time_text):
                instance.status = '2'
                print('Note: time (%s) is strange. So we will skip it...'
                      % (time_text,))
                session.commit()
                continue
            hour, minute = get_time_detail(time_text)

            starts_at = build_datetime(year, month, day, hour, minute)
            now_in_china = datetime.now(pytz.timezone('Asia/Shanghai'))
            # NOTE(review): the two moments are compared as ISO-8601 text,
            # not as datetime objects - confirm that build_datetime yields
            # values for which this ordering is the intended one.
            if starts_at.isoformat(' ') <= now_in_china.isoformat(' '):
                print('[WeixiaoSim : %sprocessing: %s with date %s %s'
                      ' is obsolete. So we will skip it...'
                      % (get_current_time_str(), title, date_text, time_text))
                instance.status = '1'
                session.commit()
                continue

            loc_details = getDetailedInfo(place)
            if loc_details['status'] == 1:
                # FIXME - put this strange address into TBD_address table
                print('[WeixiaoSim : %sprocessing: %s with place (%s)'
                      ' is strange. So we will skip it...'
                      % (get_current_time_str(), title, place))
                instance.status = '3'
                session.commit()
                continue

            # wrap info into potentialItem
            potentialItem = {
                'source': to_unicode_or_bust(source),
                'title': to_unicode_or_bust(title),
                'desc': to_unicode_or_bust(desc),
                'category': to_unicode_or_bust(category),
                'date': to_unicode_or_bust(date_text),
                'time': to_unicode_or_bust(time_text),
                'place': to_unicode_or_bust(place),
                'fee': to_unicode_or_bust(fee),
                'feelist': to_unicode_or_bust(feelist),
                'imageurl': to_unicode_or_bust(imageurl),
                'originurl': to_unicode_or_bust(originurl),
            }
            for key in ('formatted_address', 'province', 'city', 'areaname',
                        'areacode', 'longitude', 'latitude'):
                potentialItem[key] = to_unicode_or_bust(loc_details[key])

            # label this item as analyzed in the table of db - lelespider
            instance.status = '1'
            if duplicate_removal:
                # The search client and query are only built when the
                # feature is on. Candidates (Q1) share area code, date and
                # time with this event.
                # FIXME: matching on title/description keywords is not
                # provided yet.
                searchengine = Solr()
                q_areacode = 'areacode:' + loc_details['areacode']
                q_eventdate = 'eventdate:"' + date_text + '"'
                q_eventtime = 'eventtime:"' + time_text + '"'
                query = {
                    'q': (q_areacode.encode('utf-8') + ' AND ' +
                          q_eventdate.encode('utf-8') + ' AND ' +
                          q_eventtime.encode('utf-8')),
                }
                Q1 = searchengine.process(query)
                if len(Q1) == 0:
                    # no candidate: regard this item as new
                    self.addToLeleRepository(potentialItem)
                else:
                    # otherwise create a WeixiaoSim task to decide whether
                    # it is a new item or not
                    self.createWeixiaoSimTask(potentialItem, Q1)
            else:
                print('no duplicate removal feature ...')
                try:
                    self.addToLeleRepository(potentialItem)
                except Exception:
                    # Exception, not a bare except: Ctrl-C and SystemExit
                    # must still be able to stop this endless loop.
                    print('exception happening')
                    traceback.print_exc()
                    instance.status = '4'

            session.commit()