Ejemplo n.º 1
0
def _scrape_sales_rows(url):
    """Download *url* and return its sales table as lists of cell strings."""
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # The response wraps a socket; release it even if read() fails.
        response.close()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={'border': '1'})
    if table is None:
        table = soup.find('table', attrs={'border': '3'})
    if table is None:
        raise ValueError("no sales table (border=1 or border=3) found at %s" % url)

    # Markup without an explicit <tbody> would otherwise fail on None here;
    # the <tr> elements are then searched for directly under the table.
    table_body = table.find('tbody')
    if table_body is None:
        table_body = table

    return [
        [cell.text.strip().lower().replace('\n', ' ')
         for cell in row.find_all('td')]
        for row in table_body.find_all('tr')
    ]


def _remember_month(month):
    """Add *month* to the persisted month list (kept newest first) if new."""
    months = []
    month_list_entity = ds.get_saved_months('month_list')
    if month_list_entity is not None:
        months = month_list_entity.month_list
    if month in months:
        return

    # Insert in front of the first older month; append when none is older
    # (this also covers the empty list).
    for idx, known in enumerate(months):
        if int(month) > int(known):
            months.insert(idx, month)
            break
    else:
        months.append(month)
    ds.save_updated_months('month_list', months)


def _split_brand_model(name_cell, printable):
    """Return ``(brand, model)`` parsed from one vehicle-name cell."""
    tokens = name_cell.replace('*', '').strip().split()
    brand = tokens[0].strip()
    if 'prius' in tokens:
        brand = 'toyota'
        if tokens[0] == 'prius':
            model = 'prius x'
        else:
            model = ' '.join(tokens[1:])
    elif 'ram' in tokens:
        brand = 'ram'
        model = 'p/u'
    else:
        # Drop a trailing token that contains non-printable characters.
        if not set(tokens[-1]).issubset(printable):
            del tokens[-1]
        model = ' '.join(tokens[1:])
    if brand == 'bmw':
        model = "3&4 series"
    return brand, model


def _car_type_for(model):
    """Return the car type of *model*, defaulting to 'car' when unknown."""
    if model in TOP_30_TRUCKS:
        return 'truck'
    if model in TOP_30_CARS:
        return 'car'
    if model in TOP_30_SUVS:
        return 'suv'
    if model in TOP_VANS:
        return 'van'
    print("##########model type not found for %s" % model)
    return 'car'


def _save_monthly_totals(month):
    """Recompute and save the per-brand and per-type totals for *month*."""
    data_entities = ds.get_all_entities_bymonth(month)

    for b in TREND_TOP_V_BRAND:
        total = 0
        for entity in data_entities:
            if b == entity.brand:
                total += entity.selling_no
        # A fresh dict per save, so no state is shared between calls into ds.
        ds.save_monthly_total_perbrand(b + month, {
            'month': month,
            'brand': b,
            'monthly_total_no': total,
        })

    for t in TREND_TOP_V_TYPE:
        total = 0
        for entity in data_entities:
            # The pseudo-type "all" sums every entity of the month.
            if t == "all" or t == entity.car_type:
                total += entity.selling_no
        ds.save_monthly_total_pertype(t + month, {
            'month': month,
            'car_type': t,
            'monthly_total_no': total,
        })


def get_save_selling_num(url):
    """Scrape the monthly sales table at *url* and persist it through ``ds``.

    The month is read from the third cell of the first table row and turned
    into a key of the form ``<second word><month_dict code>``. Every following
    row is saved as one entity keyed by brand + model + month, after which the
    per-brand and per-type monthly totals are recomputed and saved.

    Args:
        url: Address of the HTML page holding the sales table.

    Returns:
        None. All results are written through ``ds``.

    Raises:
        ValueError: If the page has no table with ``border`` 1 or 3.
    """
    all_rows = _scrape_sales_rows(url)

    month_str = all_rows[0][2].split()
    # Strip the abbreviation dot once, so the month key and the January test
    # below use the same month_dict lookup.
    month_code = month_dict[month_str[0].replace('.', '')]
    month = month_str[1] + month_code
    _remember_month(month)

    printset = set(string.printable)
    # Running Prius totals: reset by a plain 'prius' row and increased by
    # every other row whose model contains 'prius'.
    prius_num = 0
    prius_ytd = 0

    for v in all_rows[1:]:
        brand, model = _split_brand_model(v[1], printset)
        selling_no = int(v[2].replace(',', ''))

        if len(v) < 5:
            # No year-to-date column: in January the monthly figure is used.
            selling_no_ytd = selling_no if month_code == '01' else 0
        elif int(month) < 201310:
            if v[4] == 'n/a':
                selling_no_ytd = 0
            else:
                selling_no_ytd = int(v[4].replace(',', ''))
        else:
            # NOTE(review): a 5-cell row in a month >= 201310 has no v[5] and
            # raises IndexError here - confirm the expected table layout.
            selling_no_ytd = int(v[5].replace(',', ''))

        if model == 'prius':
            prius_num = selling_no
            prius_ytd = selling_no_ytd
        elif 'prius' in model:
            prius_num += selling_no
            prius_ytd += selling_no_ytd
            selling_no = prius_num
            selling_no_ytd = prius_ytd
            model = 'prius'
        elif model == 'corolla/matrix':
            model = 'corolla'
        elif model in ('ram p/u', 'ram', 'p/u'):
            model = 'p/u'

        entry_key = str(brand) + str(model) + str(month)
        # Build a new dict per row instead of mutating one shared instance.
        ds.save_entity(entry_key, {
            'brand': brand,
            'model': model,
            'month': month,
            'car_type': _car_type_for(model),
            'selling_no': selling_no,
            'selling_no_ytd': selling_no_ytd,
        })

    _save_monthly_totals(month)
Ejemplo n.º 2
0
def _locate_tweet(tweet):
    """Return ``{'coordinate', 'place'}`` for *tweet*, or None to skip it."""
    if tweet.place is not None and tweet.place['country_code'] == "US":
        # Has a place: use the mean of the first bounding-box ring's points.
        coordinate_list = tweet.place['bounding_box']['coordinates'][0]
        num = len(coordinate_list)
        if num == 0:
            # An empty ring would otherwise divide by zero.
            return None
        sum_lng = sum_lat = 0
        for co in coordinate_list:
            sum_lng += co[0]
            sum_lat += co[1]
        return {
            'coordinate': [float(sum_lng) / num, float(sum_lat) / num],
            'place': tweet.place['full_name'],
        }

    if tweet.coordinates is not None:
        # Has only a coordinate: reverse-geocode it to get the place.
        coordinate = tweet.coordinates['coordinates']
        if not is_coordinate_in_us(coordinate):
            return None
        place = get_place_reverse_geocode(coordinate)
        if place is None:
            return None
        return {'coordinate': coordinate, 'place': place}

    if tweet.user is not None:
        # Has only the user's profile location, which may not be in the US.
        # A location of None is treated like an empty one instead of
        # raising AttributeError on .strip().
        location = (tweet.user.location or "").strip().encode('utf-8', 'ignore')
        if location == "":
            return None
        coordinate = get_coordinate_geocode(location)
        if coordinate is None or not is_coordinate_in_us(coordinate):
            return None
        return {'place': location, 'coordinate': coordinate}

    return None


def get_save_tweet(debug=True):
    """Search recent tweets per vehicle and save the ones located in the US.

    For every search term, tweets newer than the stored tweet id are fetched
    through ``api.GetSearch``. Each tweet that is not truncated and that can
    be given a US coordinate and place is saved through ``ds.save_entity``.
    Finally a tweet id is stored for the next run.

    Args:
        debug: When true, search ``TESTING_V`` instead of
            ``ALL_TOP_V_SEARCH``.

    Returns:
        None. All results are written through ``ds``.
    """
    search_tweet_id = 0
    tweet_id_entity = ds.get_tweet_id('tweet_id')
    if tweet_id_entity is not None:
        search_tweet_id = tweet_id_entity.saved_id
        print("####Got saved tweet id %d" % search_tweet_id)

    # Start from the stored id so a run that finds no tweets does not reset
    # it to 0.
    save_tweet_id = search_tweet_id
    target_count = 50
    target_vehicles = TESTING_V if debug else ALL_TOP_V_SEARCH
    tweet = None

    for query_str in target_vehicles:
        tweets_list = api.GetSearch(term=query_str, lang="en",
                                    count=target_count, result_type='recent',
                                    since_id=search_tweet_id)
        # The entity key uses the second word of the search term; a one-word
        # term falls back to the term itself instead of raising IndexError.
        query_words = query_str.split()
        key_word = query_words[1] if len(query_words) > 1 else query_str.strip()

        for tweet in tweets_list:
            if tweet.truncated:
                print("####got truncated tweet")
                continue

            ret_dict = _locate_tweet(tweet)
            if ret_dict is None or ret_dict['coordinate'] == []:
                continue

            ret_dict['model'] = query_str
            tweet_date = datetime.strptime(
                tweet.created_at, '%a %b %d %H:%M:%S +0000 %Y'
            ).replace(tzinfo=pytz.UTC)
            ret_dict['month'] = tweet_date.strftime("%Y%m")
            ret_dict['text'] = tweet.text.strip().encode("utf-8", 'ignore')
            ret_dict['tag'] = 'UNKNOWN'

            # Key = model word + first five letters of the text + coordinate.
            additional_str = "".join(re.findall("[a-zA-Z]+", ret_dict['text']))
            entry_key = key_word + additional_str[:5] \
                + str(ret_dict['coordinate'][0]) + str(ret_dict['coordinate'][1])
            print(ret_dict)
            ds.save_entity(entry_key, ret_dict)

        # Remember the last tweet iterated so far, after each search term.
        # NOTE(review): with result_type='recent' the last item is presumably
        # the oldest of the batch - confirm whether the highest id was meant.
        if tweet is not None:
            save_tweet_id = tweet.id

    print("####save the last tweet id to %d" % save_tweet_id)
    ds.save_tweet_id('tweet_id', save_tweet_id)