Beispiel #1
0
def dedup_sqoot_data_hard(firsttime=False):
    '''
    Summary: Further dedup coupons by checking deals under common fields vs. their locations.
    '''
    last_deduphard_end_time = read_sqoot_log('deduphard')
    deduphard_start_time = datetime.now(pytz.utc)

    describe_section("dedup_sqoot_data_hard IS BEGINNING..", show_time())

    # Grab all active deals on display to users for deduping.
    deals_to_dedup = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                               is_duplicate=False, online=False, status='considered-active')
    if (not firsttime) and last_deduphard_end_time:
        # If not first time, further filter down to only the newly added unique deals for deduping.
        deals_to_dedup = deals_to_dedup.filter(date_added__gt=last_deduphard_end_time)

    crosscheck_by_field(deals_to_dedup, 'coupon_directlink')
    crosscheck_by_field(deals_to_dedup, 'merchant_name')
    print "FINISHED DEDUPING HARD....", show_time()

    deduphard_end_time = datetime.now(pytz.utc)
    write_sqoot_log('deduphard', deduphard_start_time, deduphard_end_time)
    print '\n'
    print "GOOD NEWS! dedup_sqoot_data_hard IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
Beispiel #2
0
def savedown_sqoot_data():
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }
    print "\nSQOOT DATA LOAD STARTING..", show_time()

    categories_array = requests.get(SQOOT_API_URL + 'categories', params=request_parameters, timeout=5).json()['categories']
    categories_dict = establish_categories_dict(categories_array)
    reorganized_categories_array = reorganize_categories_list(categories_array)
    for category_dict in reorganized_categories_array:
        get_or_create_category(category_dict, categories_dict)

    # loading coupons and merchants
    describe_section("CHECKING THE LATEST DEAL DATA FROM SQOOT..", show_time())
    request_parameters['per_page'] = ITEMS_PER_PAGE
    active_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
    page_count = int(math.ceil(active_deal_count / float(request_parameters['per_page'])))

    print '%s deals detected, estimating %s pages to iterate' % (active_deal_count, page_count), show_time()

    describe_section("STARTING TO DOWNLOAD SQOOT DEALS..", show_time())

    sqoot_file = open("sqoot_output.json", "w")
    sqoot_file.write("[")
    for p in range(page_count):
        request_parameters['page'] = p + 1
        print '## Fetching page %s...' % (p + 1), show_time()
        response_in_json = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()
        sqoot_file.write(json.dumps(response_in_json))
        sqoot_file.write(",")
    sqoot_file.write("]")
    sqoot_file.flush()
    sqoot_file.close()
Beispiel #3
0
def run_thru_full_cycle(args):
    '''
    Summary: Run the whole sqoot maintenance pipeline in one go:
             refresh, clean out, validate, then hard dedup.

    Args:
        args: Container of option strings. If it contains 'firsttime', every
              stage is called with firsttime=True; otherwise with firsttime=False.
    '''
    firsttime = 'firsttime' in args
    describe_section("FULLCYCLE STARTING..", show_time())

    # The stages run strictly in this order.
    pipeline = (
        refresh_sqoot_data,
        clean_out_sqoot_data,
        validate_sqoot_data,
        dedup_sqoot_data_hard,
    )
    for stage in pipeline:
        stage(firsttime=firsttime)

    describe_section("ALL DONE!! :)", show_time())
Beispiel #4
0
def savedown_sqoot_data():
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }
    print "\nSQOOT DATA LOAD STARTING..", show_time()

    categories_array = requests.get(SQOOT_API_URL + 'categories',
                                    params=request_parameters,
                                    timeout=5).json()['categories']
    categories_dict = establish_categories_dict(categories_array)
    reorganized_categories_array = reorganize_categories_list(categories_array)
    for category_dict in reorganized_categories_array:
        get_or_create_category(category_dict, categories_dict)

    # loading coupons and merchants
    describe_section("CHECKING THE LATEST DEAL DATA FROM SQOOT..", show_time())
    request_parameters['per_page'] = ITEMS_PER_PAGE
    active_deal_count = requests.get(SQOOT_API_URL + 'deals',
                                     params=request_parameters,
                                     timeout=5).json()['query']['total']
    page_count = int(
        math.ceil(active_deal_count / float(request_parameters['per_page'])))

    print '%s deals detected, estimating %s pages to iterate' % (
        active_deal_count, page_count), show_time()

    describe_section("STARTING TO DOWNLOAD SQOOT DEALS..", show_time())

    sqoot_file = open("sqoot_output.json", "w")
    sqoot_file.write("[")
    for p in range(page_count):
        request_parameters['page'] = p + 1
        print '## Fetching page %s...' % (p + 1), show_time()
        response_in_json = requests.get(SQOOT_API_URL + 'deals',
                                        params=request_parameters,
                                        timeout=5).json()
        sqoot_file.write(json.dumps(response_in_json))
        sqoot_file.write(",")
    sqoot_file.write("]")
    sqoot_file.flush()
    sqoot_file.close()
Beispiel #5
0
def validate_sqoot_data(firsttime=False, pulseonly=False):
    '''
    Summary: Fetch a deal page and validate deal information and availabilty.
    '''

    last_validate_end_time = read_sqoot_log('validate')
    validate_start_time = datetime.now(pytz.utc)
    describe_section("validate_sqoot_data IS BEGINNING..", show_time())
    all_active_deals_on_display = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                            is_duplicate=False, online=False)\
                                                    .filter(Q(status='unconfirmed') | Q(status='considered-active'))
    print "...VALIDATING", len(all_active_deals_on_display), "DEALS:"

    validators = Pool(15)
    validators.map(go_validate, zip(list(all_active_deals_on_display), repeat(last_validate_end_time), repeat(firsttime), repeat(pulseonly)))

    print "FINISHED VALIDATING....", show_time()

    validate_end_time = datetime.now(pytz.utc)
    write_sqoot_log('validate', validate_start_time, validate_end_time)
    print '\n'
    print "GOOD NEWS! validate_sqoot_data IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
Beispiel #6
0
def clean_out_sqoot_data(firsttime=False):
    '''
    Summary: Internal garbage collection cycle that finds and soft-deletes
             (sets is_deleted=True on) irrelevant and stale local sqoot coupons
             and their merchants.

    Args:
        firsttime: When True, the last refresh start time is not read from the
                   log, so the "not modified since last refresh" staleness
                   criterion is skipped in the second and third steps.

    Note:
    * First, find all true duplicate deals (is_duplicate=True, related_deal__isnull=True)
      and soft-delete them.
    * Second, find all folded deals (is_duplicate=True, related_deal__isnull=False)
      that are stale, and soft-delete them. "Stale" means any of: last_modified
      older than the last refresh start (marked 'implied-inactive'),
      status 'confirmed-inactive', or end date in the past.
    * Third, find all unique deals (is_duplicate=False) that are stale; for those that
      still have non-deleted deals pointing at them via related_deal, call
      reassign_representative_deal; then soft-delete them.
    * Fourth, soft-delete every affected merchant that has no non-deleted sqoot coupon left.
    * A delete_object signal is sent for each coupon soft-deleted in steps one to three.
    * The start and end time of the run are written to the sqoot log under 'cleanout'.
    '''
    from core.signals import delete_object

    # None on first-time runs (or when the log returns nothing): disables the
    # last_modified-based criterion everywhere below.
    last_refresh_start_time = read_sqoot_log('refresh') if firsttime == False else None
    cleanout_start_time = datetime.now(pytz.utc)
    describe_section("clean_out_sqoot_data IS BEGINNING..", show_time())
    affected_merchant_list = [] # collect a list of merchant pks whose coupons are being soft-deleted

    # First
    true_duplicate_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                     is_duplicate=True, related_deal__isnull=True)
    # NOTE(review): the copy is taken before update() flips is_deleted, presumably so
    # the same coupons can still be iterated for signalling afterwards — confirm that
    # the copied queryset keeps the pre-update rows.
    deals_for_update = copy(true_duplicate_deals)
    affected_merchant_list += [c.merchant.pk for c in true_duplicate_deals]
    true_duplicate_deals.update(is_deleted=True)
    # triggering deletion of duplicated coupons from search index
    for coupon in deals_for_update:
        print 'Deleted %s' % coupon.id
        delete_object.send(sender=Coupon, instance=coupon)
    print '~*~*~*~*~*~*~*~*~* First finished ~*~*~*~*~*~*~*~*~*'

    # Second
    # Select stale folded deals; the last_modified criterion only applies when a
    # last refresh start time is available.
    if last_refresh_start_time:
        folded_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                 is_duplicate=True, related_deal__isnull=False)\
                                         .filter(Q(last_modified__lt=last_refresh_start_time)\
                                               | Q(status='confirmed-inactive')\
                                               | Q(end__lt=datetime.now(pytz.utc)))
    else:
        folded_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                 is_duplicate=True, related_deal__isnull=False)\
                                         .filter(Q(status='confirmed-inactive')\
                                               | Q(end__lt=datetime.now(pytz.utc)))
    affected_merchant_list += [c.merchant.pk for c in folded_deals]

    # The pks to signal are collected before the update() calls below set
    # is_deleted=True on these rows (the base queryset filters on is_deleted=False).
    deals_to_signal = []
    deals_to_signal += [c.pk for c in folded_deals.filter(Q(status='confirmed-inactive')
                                                        | Q(end__lt=datetime.now(pytz.utc)))]
    if last_refresh_start_time:
        deals_to_signal += [c.pk for c in folded_deals.filter(last_modified__lt=last_refresh_start_time)]
        # Deals untouched since the last refresh are additionally marked 'implied-inactive'.
        folded_deals.filter(last_modified__lt=last_refresh_start_time).update(status='implied-inactive', is_deleted=True)
    folded_deals.filter(status='confirmed-inactive').update(is_deleted=True)
    folded_deals.filter(end__lt=datetime.now(pytz.utc)).update(is_deleted=True)

    # A deal can match more than one criterion; de-duplicate so it is signalled once.
    deals_to_signal = list(set(deals_to_signal))
    for coupon in Coupon.all_objects.filter(pk__in=deals_to_signal):
        print 'Deleted %s' % coupon.id
        delete_object.send(sender=Coupon, instance=coupon)
    print '~*~*~*~*~*~*~*~*~* Second finished ~*~*~*~*~*~*~*~*~*'

    # Third (Second -> Third; the order matters)
    # Same staleness criteria as the second step, applied to non-duplicate deals.
    if last_refresh_start_time:
        non_dup_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False, is_duplicate=False)\
                                          .filter(Q(last_modified__lt=last_refresh_start_time)\
                                                | Q(status='confirmed-inactive')\
                                                | Q(end__lt=datetime.now(pytz.utc)))
    else:
        non_dup_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False, is_duplicate=False)\
                                          .filter(Q(status='confirmed-inactive')\
                                                | Q(end__lt=datetime.now(pytz.utc)))
    affected_merchant_list += [c.merchant.pk for c in non_dup_deals]
    # Stale unique deals that still have at least one non-deleted coupon referencing
    # them through related_deal; each one is handed to reassign_representative_deal
    # before being soft-deleted. Note: this issues one count() query per stale deal.
    deals_with_folded_deals = [c.pk for c in non_dup_deals if Coupon.all_objects.filter(related_deal=c, is_deleted=False).count() != 0]
    for i in deals_with_folded_deals:
        reassign_representative_deal(Coupon.all_objects.get(pk=i))

    # As in the second step: collect pks first, then soft-delete, then signal.
    deals_to_signal = []
    deals_to_signal += [c.pk for c in non_dup_deals.filter(Q(status='confirmed-inactive')
                                                         | Q(end__lt=datetime.now(pytz.utc)))]
    if last_refresh_start_time:
        deals_to_signal += [c.pk for c in non_dup_deals.filter(last_modified__lt=last_refresh_start_time)]
        non_dup_deals.filter(last_modified__lt=last_refresh_start_time).update(status='implied-inactive', is_deleted=True)
    non_dup_deals.filter(status='confirmed-inactive').update(is_deleted=True)
    non_dup_deals.filter(end__lt=datetime.now(pytz.utc)).update(is_deleted=True)

    deals_to_signal = list(set(deals_to_signal))
    for coupon in Coupon.all_objects.filter(pk__in=deals_to_signal):
        print 'Deleted %s' % coupon.id
        delete_object.send(sender=Coupon, instance=coupon)
    print '~*~*~*~*~*~*~*~*~* Third finished ~*~*~*~*~*~*~*~*~*'

    # Fourth
    # Only merchants touched by the three steps above are examined; a merchant is
    # soft-deleted when it has zero non-deleted sqoot coupons left.
    affected_merchant_list = list(set(affected_merchant_list))
    inactive_merchant_list = []
    for m_pk in affected_merchant_list:
        miq = Merchant.all_objects.get(pk=m_pk) # miq == merchant-in-question
        num_of_active_coupons_from_miq = Coupon.all_objects.filter(ref_id_source='sqoot',\
                                                                   merchant=miq, is_deleted=False).count()
        if num_of_active_coupons_from_miq:
            continue
        else:
            inactive_merchant_list.append(miq.pk)
    Merchant.all_objects.filter(pk__in=inactive_merchant_list).update(is_deleted=True)
    cleanout_end_time = datetime.now(pytz.utc)
    write_sqoot_log('cleanout', cleanout_start_time, cleanout_end_time)
    print '\n'
    print "GOOD NEWS! cleanout_sqoot_data IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
Beispiel #7
0
def refresh_sqoot_data(indirectload=False, firsttime=False):
    '''
    Summary: Iterate through Sqoot's entire coupon payload and download and update accordingly.
    '''
    last_refresh_start_time = read_sqoot_log('refresh')
    refresh_start_time = datetime.now(pytz.utc) # Use UTC time to compare & update coupon's 'last_modified' field
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }
    print "\nSQOOT DATA LOAD STARTING..", show_time()

    describe_section("ESTABLISHING CATEGORY DICTIONARY..", show_time())
    request_try = 1
    while True:
        try:
            categories_array = requests.get(SQOOT_API_URL + 'categories', params=request_parameters, timeout=5).json()['categories']
            request_try = 1
            break
        except:
            print "Request timed out after 5 seconds. Let's wait another 5 seconds and try again."
            time.sleep(5)
            request_try += 1
            print "Trying for the {} time...".format(request_try)
    categories_dict = establish_categories_dict(categories_array) # Returns a dict with child: parent categories
    reorganized_categories_array = reorganize_categories_list(categories_array) # list of dict with 'category_name', and 'category_slug'
    for category_dict in reorganized_categories_array:
        get_or_create_category(category_dict, categories_dict)

    # loading coupons and merchants
    describe_section("CHECKING THE LATEST DEAL DATA FROM SQOOT..", show_time())
    request_parameters['per_page'] = ITEMS_PER_PAGE
    while True:
        try:
            sqoot_active_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
            request_try = 1
            break
        except:
            print "Request timed out after 5 seconds. Let's wait another 5 seconds and try again."
            time.sleep(5)
            request_try += 1
            print "Trying for the {} time...".format(request_try)
    page_count = int(math.ceil(sqoot_active_deal_count / float(request_parameters['per_page'])))
    print '%s deals detected, estimating %s pages to iterate' % (sqoot_active_deal_count, page_count), show_time()

    describe_section("STARTING TO DOWNLOAD SQOOT DEALS..", show_time())
    # Since there's only one country & dealtype for all sqoot deals - no need to check it for each coupon
    country_model       = get_or_create_country()
    dealtype_model      = get_or_create_dealtype()

    sqoot_output_deals = None
    if indirectload:
        sqoot_output_deals = json.loads(open("sqoot_output.json","r").read())

    for p in range(page_count):
        request_parameters['page'] = p + 1
        print "\n"
        print '## Fetching page {} out of {}...'.format(p + 1, page_count), show_time()
        print "\n"

        if indirectload:
            response_in_json = sqoot_output_deals[p]
        else:
            while True:
                try:
                    response_in_json = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()
                    request_try = 1
                    break
                except:
                    print "Request timed out after 5 seconds. Let's wait another 5 seconds and try again."
                    time.sleep(5)
                    request_try += 1
                    print "Trying for the {} time...".format(request_try)

        active_coupon_ids = [] # List of sqoot coupon ids to hold all active deal ids per page, as set 'page' request_parameters.
        deals_data = response_in_json['deals']
        for deal_data in deals_data:
            sqoot_coupon_id = int(deal_data['deal']['id'])
            active_coupon_ids.append(sqoot_coupon_id)

            deal_last_updated = parse(deal_data['deal']['updated_at']+'+0000')
            if (not firsttime) and last_refresh_start_time and (deal_last_updated < last_refresh_start_time):
                continue

            is_online_bool = deal_data['deal']['online']
            merchant_data_dict = deal_data['deal']['merchant']
            update_coupon_data(deal_data, categories_dict, merchant_data_dict, is_online_bool, dealtype_model, country_model)
            print '-' * 60

            reset_db_queries()

        Coupon.all_objects.filter(ref_id_source='sqoot', ref_id__in=active_coupon_ids).update(last_modified=datetime.now(pytz.utc))
        refresh_end_time = datetime.now(pytz.utc)

    write_sqoot_log('refresh', refresh_start_time, refresh_end_time)
    print '\n'
    print "GOOD NEWS! refresh_sqoot_data IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
    return refresh_start_time, refresh_end_time
Beispiel #8
0
def analyze_sqoot_deals():
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }

    # describe_section("Retrieving the latest categories..\n")
    # categories_array = requests.get(SQOOT_API_URL + 'categories', params=request_parameters).json()['categories']
    # category_slugs = [c['category']['slug'] for c in categories_array]

    describe_section("Retrieving the latest providers..\n")
    providers_array = requests.get(SQOOT_API_URL + 'providers', params=request_parameters, timeout=5).json()['providers']
    provider_slugs = [c['provider']['slug'] for c in providers_array]

    describe_section("Importing the latest 50 US cities..\n")
    target_cities = top_50_us_cities_dict

    describe_section("Checking total sqoot deals available..\n")
    total_deals_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']

    TARGET_RADIUS = 50 # miles
    request_parameters['radius'] = TARGET_RADIUS
    describe_section("Checking sqoot deals currently available in {} mi radius of the following cities..\n".format(TARGET_RADIUS))
    for city in target_cities:
        request_parameters['location'] = target_cities[city]
        per_city_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
        print city, ': ', per_city_deal_count
    print 'total sqoot deal count: ', total_deals_count

    del request_parameters['location']

    describe_section("Preparing to check deal availablity from the following providers..\n")
    for p in provider_slugs:
        print p

    for p in provider_slugs:
        request_parameters['provider_slugs'] = p
        per_p_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
        if per_p_deal_count < 100:
            print "total deals available from {} too small: {}".format(p, per_p_deal_count)
            print "Skipping.."
            continue
        else:
            describe_section("Checking deals from {} for each city..\n".format(p))

        for city in target_cities:
            request_parameters['location'] = target_cities[city]
            per_city_and_p_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
            print city, ': ', per_city_and_p_deal_count
        print 'total {} deal count:  {}'.format(p, per_p_deal_count)
        del request_parameters['location']
Beispiel #9
0
def analyze_sqoot_deals():
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }

    # describe_section("Retrieving the latest categories..\n")
    # categories_array = requests.get(SQOOT_API_URL + 'categories', params=request_parameters).json()['categories']
    # category_slugs = [c['category']['slug'] for c in categories_array]

    describe_section("Retrieving the latest providers..\n")
    providers_array = requests.get(SQOOT_API_URL + 'providers',
                                   params=request_parameters,
                                   timeout=5).json()['providers']
    provider_slugs = [c['provider']['slug'] for c in providers_array]

    describe_section("Importing the latest 50 US cities..\n")
    target_cities = top_50_us_cities_dict

    describe_section("Checking total sqoot deals available..\n")
    total_deals_count = requests.get(SQOOT_API_URL + 'deals',
                                     params=request_parameters,
                                     timeout=5).json()['query']['total']

    TARGET_RADIUS = 50  # miles
    request_parameters['radius'] = TARGET_RADIUS
    describe_section(
        "Checking sqoot deals currently available in {} mi radius of the following cities..\n"
        .format(TARGET_RADIUS))
    for city in target_cities:
        request_parameters['location'] = target_cities[city]
        per_city_deal_count = requests.get(SQOOT_API_URL + 'deals',
                                           params=request_parameters,
                                           timeout=5).json()['query']['total']
        print city, ': ', per_city_deal_count
    print 'total sqoot deal count: ', total_deals_count

    del request_parameters['location']

    describe_section(
        "Preparing to check deal availablity from the following providers..\n")
    for p in provider_slugs:
        print p

    for p in provider_slugs:
        request_parameters['provider_slugs'] = p
        per_p_deal_count = requests.get(SQOOT_API_URL + 'deals',
                                        params=request_parameters,
                                        timeout=5).json()['query']['total']
        if per_p_deal_count < 100:
            print "total deals available from {} too small: {}".format(
                p, per_p_deal_count)
            print "Skipping.."
            continue
        else:
            describe_section(
                "Checking deals from {} for each city..\n".format(p))

        for city in target_cities:
            request_parameters['location'] = target_cities[city]
            per_city_and_p_deal_count = requests.get(
                SQOOT_API_URL + 'deals', params=request_parameters,
                timeout=5).json()['query']['total']
            print city, ': ', per_city_and_p_deal_count
        print 'total {} deal count:  {}'.format(p, per_p_deal_count)
        del request_parameters['location']