Beispiel #1
0
def build_context(args):
    '''
    Build the dictionary of variables used throughout the script.

    ``args`` comes from parse_args. It is updated in place and returned
    (no copy is made), so the caller's mapping gains the new keys.

    Keys read from ``args``: 'query_date', 's3_input', 'filebase', 'token'.
    Keys written: 'currentdate', 'volume_directory', 'log', 'token' (only
    when it was empty), 'droplet', 'droplet_id', 'input', 'auth',
    's3_bucket', 's3_key', 's3_path', 's3_log', 's3_log_done', 's3_auth'.

    Raises:
        ValueError: if 's3_input' does not contain 's3://'.
        IndexError: if no droplet reports this machine's IP address.
    '''
    context = args

    # Validate first so a bad flag fails before any network call is made
    # and before ``args`` has been modified.
    if 's3://' not in context['s3_input']:
        raise ValueError("Improperly formatted -s3 or --s3-input flag")

    currentdate = context['query_date']
    # Single clock read so year and month cannot straddle a month boundary.
    now = datetime.datetime.now()
    currentyear = now.strftime("%Y")
    currentmonth = now.strftime("%m")
    input_filename = os.path.basename(context['s3_input'])
    output_base = context['filebase'] + '__' + currentdate + '__' + \
                  input_filename.replace('.csv', '')

    # local stuff
    context['currentdate'] = currentdate
    context['volume_directory'] = 'pylogs/'
    context['log'] = os.path.join(
        context['volume_directory'],
        output_base + '.log'
    )

    # digital ocean
    if not context['token']:
        # Fall back to the environment when no token was passed in.
        context['token'] = os.environ.get('DO_TOKEN')
    manager = digitalocean.Manager(token=context['token'])
    # Look up our own address once rather than once per droplet.
    local_ip = get_ip_address()
    mydrop = next(
        (drop for drop in manager.get_all_droplets()
         if drop.ip_address == local_ip),
        None
    )
    if mydrop is None:
        # IndexError is what the previous list-indexing lookup raised.
        raise IndexError(
            'no droplet found with ip address {}'.format(local_ip)
        )
    context['droplet'] = mydrop
    context['droplet_id'] = mydrop.id

    # s3 stuff
    context['input'] = download_from_s3(context['s3_input'], new_dir='pylogs/')
    context['auth'] = 'pylogs/{}__{}__tokens.json'.format(mydrop.id, currentdate)
    context['s3_bucket'] = s3.get_bucket(context['s3_input'])
    # Everything before the first 'input/' segment of the input location.
    context['s3_key'] = context['s3_input'].split('input/')[0]
    context['s3_path'] = os.path.join(
        context['s3_key'],
        'output/user_meta_many/'
    )
    # NOTE(review): concatenation assumes s3.get_bucket returns the bucket
    # name as a string - confirm against the s3 helper.
    context['s3_log'] = os.path.join(
        's3://' + context['s3_bucket'], 'logs',
        output_base + '.log'
    )
    context['s3_log_done'] = os.path.join(
        context['s3_key'],
        'logs/user_meta_many/', currentyear, currentmonth,
        output_base + '.log'
    )
    context['s3_auth'] = os.path.join(
        's3://' + context['s3_bucket'], 'tokens/used',
        os.path.basename(context['auth'])
    )

    return context
Beispiel #2
0
def crawl(settings):
    """
    Queue every index page in the configured range and process them with
    a pool of worker threads.

    Reads settings['threading']['num_threads'] and
    settings['leboncoin']['start_page'] / ['end_page']; ``settings`` is also
    passed to postgres.get_connection and s3.get_bucket. Pages are produced
    with range(), so 'end_page' itself is not queued.

    If the postgres connection cannot be opened, the failure is logged and
    the function returns without crawling.
    """
    logger.info('crawl started')
    try:
        connection = postgres.get_connection(settings)
    except Exception:
        # Narrower than a bare except so Ctrl-C / SystemExit still
        # propagate; logger.exception records the traceback.
        logger.exception('could not connect to postgres db')
        return

    bucket = s3.get_bucket(settings)
    queue = Queue(maxsize=0)
    # The session and the connection are shared by all workers.
    session = requests.session()
    for thread_index in range(settings['threading']['num_threads']):
        worker = Thread(
            target=process_index,
            args=(queue, thread_index, session, connection, bucket)
        )
        # Daemon threads do not keep the process alive once crawl returns.
        worker.daemon = True
        logger.info('thread %s > starting', thread_index)
        worker.start()

    first_page = settings['leboncoin']['start_page']
    last_page = settings['leboncoin']['end_page']
    for index_page in range(first_page, last_page):
        queue.put('http://www.leboncoin.fr/ventes_immobilieres/offres/?o={page}'.format(page=index_page))

    # Blocks until every queued URL is marked done - presumably
    # process_index calls queue.task_done(); verify in that function.
    queue.join()
    logger.info('crawl ended')
#!/usr/bin/env python
import s3

# Deletes every object under the archive root whose second path segment
# (archive/<sha1-or-channel>/...) is not listed in `keep`.
bucket_name = "d.defold.com"
archive_root = "archive"
bucket = s3.get_bucket(bucket_name)

# Release sha1s and channel names that must survive the cleanup.
keep = [
    "5791ee6d96b87e50eee5acd70abaa4026fefef28",  #1.2.170
    "4ebe7a1d548eae2398717ed46f9d7d1b103d5503",  #1.2.169
    "e22f6d2f81e7c53ebcbfefe703ff22ce5da252c0",  #1.2.168
    "96f7a5e4f617d5f6f4645f30a3e6ff656689435d",  #1.2.167
    "5295afb3878441fb12f497df8831148525dcfb10",  #1.2.166
    "6fac6e80f09ab297093e3ff65a7f45ad56e06e33",  #1.2.165
    "ff34def383f372b1f302916374310bd498105384",  #1.2.171 beta
    "a98007b48691529b59fb099fc369d81518059d00",  #1.2.171 editor-alpha (stable)
    "stable",
    "beta",
    "alpha",
    "dev",
    "editor-alpha"
]

# The trailing slash stops the prefix from also matching sibling prefixes
# such as "archive-old/...", whose keys must never be deleted by this script.
for key in bucket.list(prefix=archive_root + "/"):
    parts = key.name.split("/")
    # Only act on keys shaped like "<archive_root>/<segment>[/...]"; anything
    # else (including a bare "archive/" placeholder) is left alone.
    if len(parts) < 2 or parts[0] != archive_root or not parts[1]:
        continue
    sha1 = parts[1]
    if sha1 not in keep:
        print("Deleting %s" % key.name)
        key.delete()