Ejemplo n.º 1
0
    def process(self, duplicate_removal=False):
        """Poll the Events table forever and forward valid Beijing events.

        Each pass queries up to 10000 ``Events`` rows with ``status == '0'``
        and ``city == 'beijing'`` (ordered by date, then time), checks every
        row and commits a new status for it:

        * ``'1'`` - the event time is not in the future, or the item was
          handed to ``addToLeleRepository`` / ``createWeixiaoSimTask``.
        * ``'2'`` - ``is_num(fee)`` or ``validate_time(time)`` returned False.
        * ``'3'`` - ``getDetailedInfo(place)`` returned ``status == 1``.
        * ``'4'`` - ``addToLeleRepository`` raised while duplicate removal
          was switched off.

        Before every pass ``mysleep`` is called with a counter that is 0
        right after a pass that saw at least one row and k after k
        consecutive passes that saw none.

        Args:
            duplicate_removal: When equal to ``True``, Solr is queried for
                items with the same areacode, event date and event time; the
                item is stored directly only if nothing matches, otherwise a
                WeixiaoSim task is created. Otherwise every item is stored
                directly.

        Note:
            The loop has no exit condition; the method only ends when an
            exception escapes it.
        """
        # Local import: only needed to report failures in the except branch.
        import traceback

        # print(...) with a single argument also works as a Python 2 print
        # statement, so the output is unchanged.
        print('processing...')
        session = self.Session()
        sleepcount = 0
        while True:
            # Back off a little longer after every pass that found nothing.
            mysleep(sleepcount)
            sleepcount += 1

            pending = session.query(Events).filter(
                Events.status == '0').filter(
                    Events.city == 'beijing').order_by(
                        Events.date, Events.time).limit(10000)
            for instance in pending:
                # Work was found, so the next pass starts without a delay.
                sleepcount = 0

                source = instance.source
                title = instance.title
                desc = instance.desc
                date_str = instance.date
                category = instance.category
                time_str = instance.time
                place = instance.place
                fee = instance.fee
                feelist = instance.feelist
                imageurl = instance.image
                originurl = instance.link

                # NOTE(review): '== False' is kept on purpose; 'not ...' would
                # also reject None/0 results and the helper is not visible.
                if is_num(fee) == False:
                    instance.status = '2'
                    # %s formatting: the value just failed validation and may
                    # not be a string; concatenation would raise before the
                    # commit and leave the row stuck at status '0'.
                    print('Note: fee (%s) is strange. So we will skip it...'
                          % (fee,))
                    session.commit()
                    continue

                # determine if the event is obsolete one
                year, month, day = get_date_detail(date_str)
                if validate_time(time_str) == False:
                    instance.status = '2'
                    print('Note: time (%s) is strange. So we will skip it...'
                          % (time_str,))
                    session.commit()
                    continue
                hour, minute = get_time_detail(time_str)

                event_time = build_datetime(year, month, day, hour, minute)
                current_china_datetime = datetime.now(
                    pytz.timezone('Asia/Shanghai'))
                # NOTE(review): compared as ISO strings rather than datetimes,
                # presumably because event_time is naive - confirm.
                if (event_time.isoformat(' ') <=
                        current_china_datetime.isoformat(' ')):
                    print('[WeixiaoSim : ' + get_current_time_str() +
                          'processing: ' + title + ' with date ' + date_str +
                          ' ' + time_str +
                          ' is obsolete. So we will skip it...')
                    instance.status = '1'
                    session.commit()
                    continue

                loc_details = getDetailedInfo(place)
                if loc_details['status'] == 1:
                    # FIXME - put this strange address into TBD_address table
                    print('[WeixiaoSim : ' + get_current_time_str() +
                          'processing: ' + title + ' with place (' + place +
                          ') is strange. So we will skip it...')
                    instance.status = '3'
                    session.commit()
                    continue

                # wrap info into potentialItem
                potentialItem = {
                    'source': to_unicode_or_bust(source),
                    'title': to_unicode_or_bust(title),
                    'desc': to_unicode_or_bust(desc),
                    'category': to_unicode_or_bust(category),
                    'date': to_unicode_or_bust(date_str),
                    'time': to_unicode_or_bust(time_str),
                    'place': to_unicode_or_bust(place),
                    'fee': to_unicode_or_bust(fee),
                    'feelist': to_unicode_or_bust(feelist),
                    'imageurl': to_unicode_or_bust(imageurl),
                    'originurl': to_unicode_or_bust(originurl),
                }
                for key in ('formatted_address', 'province', 'city',
                            'areaname', 'areacode', 'longitude', 'latitude'):
                    potentialItem[key] = to_unicode_or_bust(loc_details[key])

                # get all similar items (Q1) from search engine with criteria
                # (query inputs): same areacode, date and time.
                # NOTE(review): built even when duplicate_removal is off; it
                # could move into that branch if Solr() has no side effects.
                searchengine = Solr()
                q_areacode = 'areacode:' + loc_details['areacode']
                q_eventdate = 'eventdate:"' + date_str + '"'
                q_eventtime = 'eventtime:"' + time_str + '"'
                query = {
                    'q': ' AND '.join([
                        q_areacode.encode('utf-8'),
                        q_eventdate.encode('utf-8'),
                        q_eventtime.encode('utf-8'),
                    ]),
                }
                # almost - keywords from title and description
                # FIXME, now we do not provide this feature

                instance.status = '1'
                # Explicit comparison kept: truthy values that are not equal
                # to True must not switch the feature on.
                if duplicate_removal == True:
                    Q1 = searchengine.process(query)
                    if len(Q1) == 0:
                        # nothing similar found: regard this item as new
                        self.addToLeleRepository(potentialItem)
                    else:
                        # let a WeixiaoSim task decide if it is a new item
                        self.createWeixiaoSimTask(potentialItem, Q1)
                else:
                    print('no duplicate removal feature ...')
                    try:
                        self.addToLeleRepository(potentialItem)
                    except Exception:
                        # Not a bare except: KeyboardInterrupt/SystemExit must
                        # still be able to stop this endless loop.
                        print('exception happening')
                        traceback.print_exc()
                        instance.status = '4'

                # label this item as analyzed in the table of db - lelespider
                session.commit()
Ejemplo n.º 2
0
    def process(self, duplicate_removal=False):
        """Poll the Events table forever and forward valid Beijing events.

        Each pass queries up to 10000 ``Events`` rows with ``status == '0'``
        and ``city == 'beijing'`` (ordered by date, then time), checks every
        row and commits a new status for it:

        * ``'1'`` - the event time is not in the future, or the item was
          handed to ``addToLeleRepository`` / ``createWeixiaoSimTask``.
        * ``'2'`` - ``is_num(fee)`` or ``validate_time(time)`` returned False.
        * ``'3'`` - ``getDetailedInfo(place)`` returned ``status == 1``.
        * ``'4'`` - ``addToLeleRepository`` raised while duplicate removal
          was switched off.

        Before every pass ``mysleep`` is called with a counter that is 0
        right after a pass that saw at least one row and k after k
        consecutive passes that saw none.

        Args:
            duplicate_removal: When equal to ``True``, Solr is queried for
                items with the same areacode, event date and event time; the
                item is stored directly only if nothing matches, otherwise a
                WeixiaoSim task is created. Otherwise every item is stored
                directly.

        Note:
            The loop has no exit condition; the method only ends when an
            exception escapes it.
        """
        # Local import: only needed to report failures in the except branch.
        import traceback

        # print(...) with a single argument also works as a Python 2 print
        # statement, so the output is unchanged.
        print('processing...')
        session = self.Session()
        sleepcount = 0
        while True:
            # Back off a little longer after every pass that found nothing.
            mysleep(sleepcount)
            sleepcount += 1

            pending = session.query(Events).filter(
                Events.status == '0').filter(
                    Events.city == 'beijing').order_by(
                        Events.date, Events.time).limit(10000)
            for instance in pending:
                # Work was found, so the next pass starts without a delay.
                sleepcount = 0

                source = instance.source
                title = instance.title
                desc = instance.desc
                date_str = instance.date
                category = instance.category
                time_str = instance.time
                place = instance.place
                fee = instance.fee
                feelist = instance.feelist
                imageurl = instance.image
                originurl = instance.link

                # NOTE(review): '== False' is kept on purpose; 'not ...' would
                # also reject None/0 results and the helper is not visible.
                if is_num(fee) == False:
                    instance.status = '2'
                    # %s formatting: the value just failed validation and may
                    # not be a string; concatenation would raise before the
                    # commit and leave the row stuck at status '0'.
                    print('Note: fee (%s) is strange. So we will skip it...'
                          % (fee,))
                    session.commit()
                    continue

                # determine if the event is obsolete one
                year, month, day = get_date_detail(date_str)
                if validate_time(time_str) == False:
                    instance.status = '2'
                    print('Note: time (%s) is strange. So we will skip it...'
                          % (time_str,))
                    session.commit()
                    continue
                hour, minute = get_time_detail(time_str)

                event_time = build_datetime(year, month, day, hour, minute)
                current_china_datetime = datetime.now(
                    pytz.timezone('Asia/Shanghai'))
                # NOTE(review): compared as ISO strings rather than datetimes,
                # presumably because event_time is naive - confirm.
                if (event_time.isoformat(' ') <=
                        current_china_datetime.isoformat(' ')):
                    print('[WeixiaoSim : ' + get_current_time_str() +
                          'processing: ' + title + ' with date ' + date_str +
                          ' ' + time_str +
                          ' is obsolete. So we will skip it...')
                    instance.status = '1'
                    session.commit()
                    continue

                loc_details = getDetailedInfo(place)
                if loc_details['status'] == 1:
                    # FIXME - put this strange address into TBD_address table
                    print('[WeixiaoSim : ' + get_current_time_str() +
                          'processing: ' + title + ' with place (' + place +
                          ') is strange. So we will skip it...')
                    instance.status = '3'
                    session.commit()
                    continue

                # wrap info into potentialItem
                potentialItem = {
                    'source': to_unicode_or_bust(source),
                    'title': to_unicode_or_bust(title),
                    'desc': to_unicode_or_bust(desc),
                    'category': to_unicode_or_bust(category),
                    'date': to_unicode_or_bust(date_str),
                    'time': to_unicode_or_bust(time_str),
                    'place': to_unicode_or_bust(place),
                    'fee': to_unicode_or_bust(fee),
                    'feelist': to_unicode_or_bust(feelist),
                    'imageurl': to_unicode_or_bust(imageurl),
                    'originurl': to_unicode_or_bust(originurl),
                }
                for key in ('formatted_address', 'province', 'city',
                            'areaname', 'areacode', 'longitude', 'latitude'):
                    potentialItem[key] = to_unicode_or_bust(loc_details[key])

                # get all similar items (Q1) from search engine with criteria
                # (query inputs): same areacode, date and time.
                # NOTE(review): built even when duplicate_removal is off; it
                # could move into that branch if Solr() has no side effects.
                searchengine = Solr()
                q_areacode = 'areacode:' + loc_details['areacode']
                q_eventdate = 'eventdate:"' + date_str + '"'
                q_eventtime = 'eventtime:"' + time_str + '"'
                query = {
                    'q': ' AND '.join([
                        q_areacode.encode('utf-8'),
                        q_eventdate.encode('utf-8'),
                        q_eventtime.encode('utf-8'),
                    ]),
                }
                # almost - keywords from title and description
                # FIXME, now we do not provide this feature

                instance.status = '1'
                # Explicit comparison kept: truthy values that are not equal
                # to True must not switch the feature on.
                if duplicate_removal == True:
                    Q1 = searchengine.process(query)
                    if len(Q1) == 0:
                        # nothing similar found: regard this item as new
                        self.addToLeleRepository(potentialItem)
                    else:
                        # let a WeixiaoSim task decide if it is a new item
                        self.createWeixiaoSimTask(potentialItem, Q1)
                else:
                    print('no duplicate removal feature ...')
                    try:
                        self.addToLeleRepository(potentialItem)
                    except Exception:
                        # Not a bare except: KeyboardInterrupt/SystemExit must
                        # still be able to stop this endless loop.
                        print('exception happening')
                        traceback.print_exc()
                        instance.status = '4'

                # label this item as analyzed in the table of db - lelespider
                session.commit()