Code example #1
def upload_rewards_from_html_database(html_database, offset=0, chunk_size=10000, output_failed_projects=True):
    if output_failed_projects:
        Rewards.failed_projects_output_file = open('failed_reward_project_ids.txt', 'wb')
    print("Getting existing rewards")
    t1 = time.time()
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    existing_project_ids = set([row['projectid'] for row in kickstarter_db.query(
        'select distinct projectid from reward')])
    print('\ttook {0}'.format(time.time() - t1))

    while True:
        # have to reconnect or else it will lose connection after a while
        kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
        print("Getting html from database, from {0} to {1}".format(offset, offset + chunk_size))
        t1 = time.time()
        results = html_database.query('select projectid, html, url from reward_html limit {0}, {1}'.format(offset, chunk_size))
        print('\ttook {0}'.format(time.time() - t1))
        print("Parsing html")
        t1 = time.time()
        rewards = []
        # see if the offset is past the end of the database
        end_of_database = True
        for index, row in enumerate(results):
            end_of_database = False
            if row['projectid'] not in existing_project_ids:
                r = Rewards(row['html'], row['projectid']).get_rewards()
                if r is not None:
                    rewards += r
        # break if offset is past the end of the database
        if end_of_database:
            print("Nothing to parse: end of database")
            break
        print('\ttook {0}'.format(time.time() - t1))
        print("Uploading projects")
        t1 = time.time()
        kickstarter_db['reward'].insert_many(ensure=False, rows=rewards)
        print('\ttook {0}'.format(time.time() - t1))

        offset += chunk_size

    if output_failed_projects:
        Rewards.failed_projects_output_file.close()
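A minimal driver for the loader above, for context. Pairing the intermediate database with the reward_html table is an assumption; the original call site is not shown.

# Hypothetical usage sketch: the intermediate DB is assumed to hold reward_html.
html_db = db_connections.get_intermediate_db()
upload_rewards_from_html_database(html_db, offset=0, chunk_size=10000)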
Code example #2
def update_backers_db_from_queue(project_queue, fail_queue, process_id):
    ff_profile = os.getcwd() + "\\lib\\quick_firefox"
    browser = splinter.Browser('firefox', profile=ff_profile)
    while True:
        p = project_queue.get()
        if p is None:  # signal that there are no more projects
            browser.quit()
            project_queue.task_done()
            break
        url = p['url']
        projectid = p['id']
        try:
            backers = get_backers_from_url(url=url,
                                           max_wait=30,
                                           browser=browser)
            data = tablib.Dataset()
            data.headers = ['projectid', 'userid', 'name', 'raw_location']
            for backer in backers:
                row = (projectid, backer['id'], backer['name'],
                       backer['raw_location'])
                data.append(row)
            db = db_connections.get_fungrosencrantz_schema(
                schema='kickstarter')
            db_connections.uploadOutputFile(data=data, db=db, table='backer')
            del db
            print("Success: " + url)
        except:
            fail_queue.put(url)
            print("Failed: " + url)
        project_queue.task_done()
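The worker above expects a joinable queue of project rows (each carrying 'url' and 'id') and one None sentinel per worker. A plausible launcher, sketched under those assumptions; the process count and the use of get_projects_with_null_backerid as the feed are illustrative, not from the source.

import multiprocessing

if __name__ == '__main__':  # guard required for multiprocessing on Windows
    project_queue = multiprocessing.JoinableQueue()
    fail_queue = multiprocessing.Queue()
    num_workers = 4  # assumed worker count
    workers = [multiprocessing.Process(target=update_backers_db_from_queue,
                                       args=(project_queue, fail_queue, i))
               for i in range(num_workers)]
    for w in workers:
        w.start()
    for p in get_projects_with_null_backerid():  # rows with 'id' and 'url'; see examples #10/#11
        project_queue.put(dict(p))
    for _ in workers:
        project_queue.put(None)  # one sentinel per worker signals shutdown
    project_queue.join()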
Code example #3
def select():
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    t0 = time.time()
    q = [x for x in db.query('select * from html limit 300')]
    logging.debug('reading {} files took {} seconds'.format(
        len(q),
        time.time() - t0))
Code example #4
def run(start_date, end_date, file_name):
    """

    :param start_date: Consider projects that have a start date after this date (format YYYY/MM/DD)
    :param end_date: Consider projects that have a start date before this date (format YYYY/MM/DD)
    :param file_name: Output file name.
    :return:
    """
    queries = {
        'project':
        """select * from project left join location on project.location_slug = location.slug
    where start_date between '{}' and '{}';""".format(start_date, end_date),
        'reward':
        """select * from reward join (project left join location on project.location_slug = location.slug) on reward.projectid = project.id
    where start_date between '{}' and '{}';""".format(start_date, end_date),
        'update':
        """select * from `update` join (project left join location on project.location_slug = location.slug) on `update`.projectid = project.id
    where start_date between '{}' and '{}';""".format(start_date, end_date)
    }

    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    writer = pd.ExcelWriter(file_name + '.xlsx',
                            engine='xlsxwriter',
                            options={'strings_to_urls': False})

    for table in queries:
        print("downloading {}".format(table))
        df = pd.read_sql(queries[table], db.executable)
        print("inserting into spreadsheet")
        df.to_excel(writer, table)
    print("saving spreadsheet")
    writer.save()
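For reference, a hypothetical invocation; the dates follow the YYYY/MM/DD format the docstring specifies, and the values are illustrative.

run(start_date='2015/01/01', end_date='2015/12/31', file_name='projects_2015')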
Code example #5
def generate_statistics_file(table, columns, table_columns=None):
    db = db_connections.get_fungrosencrantz_schema(traditional=True)
    for c in columns:
        if table_columns is None:
            assert c in db[table].columns
        else:
            assert c in table_columns
    q, headers = generate_full_statistics_query(table=table, columns=columns)

    data = []
    try:
        for i, row in enumerate(db.query(q)):
            row['Name'] = columns[i]
            data.append(row)
    except:
        print(q)
        time.sleep(0.2)
        raise
    tablib_data_unordered = tablib.Dataset()
    tablib_data_unordered.dict = data

    tablib_data = tablib.Dataset()
    for header in ['Name'] + headers:
        tablib_data.append_col(tablib_data_unordered[header], header=header)

    tablib_data.headers = ['Name'] + headers
    with open('{0}_statistics.xlsx'.format(table), 'wb') as f:
        f.write(tablib_data.xlsx)
Code example #6
def download_update_pages(skip_existing=True, only_updated_projects=False):
    db = db_connections.get_fungrosencrantz_schema(
        'kickstarter_new')  # TODO: change after move

    orig_dir = os.getcwd()
    os.chdir('/mnt/data/scrape/kickstarter_updates')

    with open('all_urls', 'w') as f:
        for row in db.query('select url from all_files'):
            url = row['url']
            if url.endswith('/'):
                f.write(url + 'updates' + '\n')
            else:
                f.write(url + '/updates' + '\n')

    if only_updated_projects:
        raise NotImplementedError()
    else:
        if skip_existing:
            subprocess.call(
                'wget -i all_urls --no-clobber --force-directories --output-file=wget.log',
                shell=True)
        else:
            subprocess.call(
                'wget -i all_urls --timestamping --force-directories --output-file=wget.log',
                shell=True)

    os.chdir(orig_dir)
Code example #7
def scrape_location(page_source):
    tree = lxml.html.fromstring(page_source)
    locations = dict()
    location_sections = tree.xpath('//div[@class="project-location"]/a')

    # this comes from the website
    LOCATION_KEYS = [
        'name', 'short_name', 'country', 'id', 'is_root', 'state', 'urls',
        'type', 'displayable_name', 'slug'
    ]
    for l_section in location_sections:
        location = json.loads(s=l_section.attrib['data-location'])
        # compare as sets: key order from the parsed JSON is not guaranteed
        if set(location.keys()) != set(LOCATION_KEYS):
            print(location.keys())
            raise Exception("Bad location")
        location['status'] = location['state']
        del location['state']
        del location['urls']
        name = location['displayable_name']
        locations[name] = location
    db = db_connections.get_fungrosencrantz_schema(schema='kickstarter',
                                                   traditional=True)
    db_connections.uploadOutputFile(data=locations.values(),
                                    db=db,
                                    table='location',
                                    strict=True)
Code example #8
def run():
    from utils.useful_functions import ensure_directory
    sql = "select id as project_id, goal, amount_pledged, backer_count, category, year(start_date) as start_year, currency from project  where currency is not Null"
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    df = pd.read_sql(sql=sql, con=db.executable)
    convert_kickstarter_currency_df_to_usd(df, replace=True)
    base_directory = '/usr/share/nginx/html/crowdfunding/my/scott/kickstarter/goal_vs_actual'
    original_directory = os.getcwd()
    os.chdir(base_directory)
    data_directory = 'analysis_output'

    # def start_date_to_str(row):
    #    return row['start_date'].isoformat()
    # df['start_date'] = df.apply(start_date_to_str, axis=1)
    mapping = dict()
    for year_category, year_category_df in df.groupby(
        ['start_year', 'category']):
        year = year_category[0]
        category = year_category[1]
        cur_directory = '{}/{}'.format(data_directory, year)
        ensure_directory(cur_directory)
        output_file = '{}/{}.json'.format(cur_directory, category)
        if year not in mapping.keys():
            mapping[year] = dict()
        mapping[year][category] = output_file
        with open(output_file, 'w') as f:
            json.dump(year_category_df.to_dict(orient='records'), f)

    ensure_directory(data_directory)
    with open('{}/mapping.json'.format(data_directory), 'w') as f:
        json.dump(mapping, f)
    os.chdir(original_directory)
Code example #9
def delete_redundant_directories():
    db = db_connections.get_fungrosencrantz_schema('crowdrise')

    urls = [x['url'] + '/*' for x in db.query('select url from fundraiser where url not like "%/fundraiser/%";')]

    urls_split = split_array_into_chunks(urls, 10000)

    for chunk in urls_split:
        with open('directories_to_remove', 'w') as f:
            f.write(" ".join(chunk))
        subprocess.call('rm -Rf `cat directories_to_remove`', shell=True)
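split_array_into_chunks is imported from utils.useful_functions elsewhere in these examples but never shown. A minimal sketch of the behavior the call sites rely on (an assumption; the real helper may differ):

def split_array_into_chunks(array, chunk_size):
    # Yield consecutive slices of at most chunk_size items each.
    for i in range(0, len(array), chunk_size):
        yield array[i:i + chunk_size]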
Code example #10
def get_projects_with_null_backerid(offset=0,
                                    limit=1000,
                                    project_table='project',
                                    backer_table='backer'):
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    q = """SELECT id, url FROM {0} left join backer on {0}.id = {1}.projectid
            where projectid is null and backers_count > 50
            order by backers_count asc""".format(project_table, backer_table)
    if limit is not None:
        q = q + "\nlimit {0}, {1}".format(offset, limit)
    return db.query(q)
Code example #11
def get_projects_with_null_backerid(offset=0,
                                    limit=1000,
                                    project_table='project',
                                    backer_table='backer'):
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    results = db.query(
        """SELECT id, url FROM {0} left join backer on {0}.id = {1}.projectid
                            where projectid is null and backers_count > 0
                            limit {2}, {3}""".format(project_table,
                                                     backer_table, offset,
                                                     limit))
    return [x for x in results]
Code example #12
def download_update_pages(workers):
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # get the urls and their corresponding ids to download
    ids_for_base_urls = db_connections.get_ids_for_base_urls(
        db=kickstarter_db, table_name='project')
    webpageDownloader.urls_to_database(
        ids_for_base_urls=ids_for_base_urls,
        db_connector=db_connections.get_intermediate_db,
        html_table_name='update_html',
        url_append='/updates',
        verbose_level=2,
        auto_adjust_workers=False,
        max_workers=workers,
        chunk_size=104)
Code example #13
def upload_scraped_projects(scraped_projects,
                            backer_table='backer',
                            schema='kickstarter'):
    db = db_connections.get_fungrosencrantz_schema(schema)
    data = tablib.Dataset()
    data.headers = tuple(db[backer_table].columns)
    for i in range(len(scraped_projects)):
        row = {
            data.headers[0]: scraped_projects[i][0],
            data.headers[1]: scraped_projects[i][1],
            data.headers[2]: scraped_projects[i][2]
        }
        data.append(row)
    return data
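The function assumes each element of scraped_projects is a tuple aligned with the first three columns of the backer table (projectid, userid, name, judging by example #2). A hypothetical input, with invented values for illustration:

scraped = [
    (1000001, 'user_a', 'Backer A'),  # assumed column order: projectid, userid, name
    (1000001, 'user_b', 'Backer B'),
]
data = upload_scraped_projects(scraped)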
Code example #14
def download_reward_pages(workers):
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    ids_for_base_urls = db_connections.get_ids_for_base_urls(
        db=kickstarter_db,
        table_name='project',
        query_append="",
        scrape_table_name='reward',
        scrape_table_column_name='projectid')
    webpageDownloader.urls_to_database(ids_for_base_urls,
                                       db_connections.get_intermediate_db,
                                       'reward_html',
                                       url_append='/rewards',
                                       chunk_size=104,
                                       max_workers=workers,
                                       verbose_level=2)
Code example #15
def query_to_excel(query, chunk_size=10, outfile='output.xlsx', schema='Kiva'):
    from utils.useful_functions import split_array_into_chunks
    writer = pd.ExcelWriter(outfile,
                            engine='xlsxwriter',
                            options={'strings_to_urls': False})
    db = db_connections.get_fungrosencrantz_schema(schema)
    df = pd.read_sql(query, con=db.executable)
    print("done downloading")
    sheet_num = 0
    for df_chunk in split_array_into_chunks(df, chunk_size=chunk_size):
        sheet_num += 1
        if len(df_chunk) == 0:
            break
        df_chunk.to_excel(writer, sheet_name='sheet{}'.format(sheet_num))
    writer.save()
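A hypothetical call, splitting a large Kiva result across sheets to stay under Excel's roughly 1,048,576-row per-sheet limit; the query and sizes are illustrative.

query_to_excel(query='select * from loan',
               chunk_size=500000,
               outfile='loans.xlsx',
               schema='Kiva')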
Code example #16
def output_founder_graphs():
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    fig = plt.figure(figsize=(8, 6))

    query = """
        SELECT
          collapsed_projects_by_founder                  AS `num_projects_by_founder`,
          sum(num_successful) `num_successful`,
          sum(num_failed) `num_failed`,
          sum(num_other) `num_other`,
          sum(num_successful)/sum(projects_by_founder) `success_rate`,
          count(*) `num_founders`
        FROM (
               SELECT
                 founder_id,
                 count(*)                            AS projects_by_founder,
                 #IF(count(*) <= 10, count(*), 11) AS collapsed_projects_by_founder,
                 count(*) AS collapsed_projects_by_founder,
                 sum(status = 'successful')          AS num_successful,
                 sum(status = 'failed') `num_failed`,
                 sum(status not in ('successful', 'failed')) `num_other`
               FROM project
                 JOIN location ON project.location_slug = location.slug
               WHERE location.country = 'US' AND start_date < (SELECT min(start_date)
                                FROM project
                                WHERE status = 'live')
               GROUP BY founder_id) AS t1
        GROUP BY collapsed_projects_by_founder;
    """
    df = pd.read_sql(query, db.engine)

    y = df['num_founders'].values
    x = df['num_projects_by_founder'].values
    # size = np.sqrt(df['num_founders'].values * 200)

    ax = fig.add_axes((.18, .2, .80, .75))
    plot = ax.scatter(x, y, marker='o', c=df['success_rate'], cmap='gray_r')
    fig.colorbar(plot, label='Success Rate')

    plt.title('Number of Projects Made by a Founder vs Number of Founders')
    plt.xlabel('Number of Projects Made by Founder')
    plt.ylabel('Number of Founders')

    ax.set_yscale('log')

    plt.savefig('plot_founder_distribution.png', dpi=350)
Code example #17
def download_small_backers(workers):
    html_db = db_connections.get_intermediate_db()
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # get the urls and their corresponding ids to download
    ids_for_base_urls = db_connections.get_ids_for_base_urls(
        db=kickstarter_db,
        table_name='project',
        query_append='and backers_count <= 50 and backers_count > 0',
        scrape_table_name='backer',
        scrape_table_column_name='projectid')
    webpageDownloader.urls_to_database(
        ids_for_base_urls=ids_for_base_urls,
        db_connector=db_connections.get_intermediate_db,
        html_table_name='backer_html',
        url_append='/backers',
        verbose_level=2,
        auto_adjust_workers=False,
        max_workers=workers)
Code example #18
def clean_sitemap_table(db=None):
    if db is None:
        db = db_connections.get_fungrosencrantz_schema('crowdrise')
    q = """UPDATE sitemap
SET loc=left(loc, char_length(loc)-1)
WHERE right(loc, 1) = '/'
LIMIT 100000000000;"""
    db.query(q)

    q = """UPDATE sitemap
SET loc=REPLACE(loc, 'http:', 'https:')
WHERE loc LIKE 'http:%'
LIMIT 100000000000;"""
    db.query(q)

    q = """DELETE FROM sitemap
WHERE loc='https://www.crowdrise.com';"""
    db.query(q)
Code example #19
def download_plot_data():
    db = db_connections.get_fungrosencrantz_schema('kickstarter')

    query = """
        SELECT
          concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
          backer_count, new_backers, repeat_backers, amount_pledged, currency,
          start_date, category, status, goal, founder_id
        FROM project
          JOIN location ON project.location_slug = location.slug
        WHERE location.country = 'US'
          AND NOT (year(start_date) = year(now()) AND quarter(start_date) = quarter(now()))
        order by rand(1)
    """
    full_df = pd.read_sql(query, db.engine, parse_dates=['start_date'])
    useful_functions.convert_currency(
        full_df, currency_column='currency', money_column='amount_pledged', date_column='start_date')
    del full_df['currency']
    del full_df['start_date']

    full_df.to_pickle('plot_data.pickle')
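Reading the result back for analysis is then symmetric; a sketch using the file name written above:

import pandas as pd

plot_df = pd.read_pickle('plot_data.pickle')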
Code example #20
def upload_community_data(community_data=None):
    if community_data is None:
        with open('community_data.pickle', 'rb') as f:
            community_data = pickle.load(f)

    project_data = []
    backers_in_city_data = []
    backers_in_country_data = []
    for project in community_data:
        backers_in_city_data += project.pop('backers_by_city')
        backers_in_country_data += project.pop('backers_by_country')
        project_data.append(project)

    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # db_connections.uploadOutputFile(data=project_data, db=db, table='project')
    db_connections.uploadOutputFile(data=backers_in_city_data,
                                    db=db,
                                    table='backers_in_city')
    db_connections.uploadOutputFile(data=backers_in_country_data,
                                    db=db,
                                    table='backers_in_country')
Code example #21
def run():
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    chunk_size = 100

    html_data = []
    for chunk_index, chunk in enumerate(
            split_array_into_chunks([
                x['file_name'] for x in db.query(
                    '''select file_name from all_files left join html on all_files.file_name = html.url
where html.url is null;''')
            ],
                                    chunk_size=chunk_size)):
        t0 = time.time()
        for file_index, file_path in enumerate(chunk):
            cur_index = chunk_index * chunk_size + file_index
            try:
                with open(file_path, 'rb') as f:
                    f_read = f.read()
            except IOError:
                logging.error(traceback.format_exc())
                try:
                    with open(file_path.replace('E:\\', ''), 'rb') as f:
                        f_read = f.read()
                except IOError:
                    with open('files_not_found', 'a') as f:
                        f.write(file_path + '\n')
                    continue
            # logging.debug('{}: {}'.format(cur_index, file_path))
            html_data.append(
                dict(
                    url=file_path,
                    html=f_read,
                    last_scrape=time.gmtime(os.path.getmtime(file_path)),
                ))
        db['html'].insert_many(html_data, ensure=False)
        logging.debug('inserted through {}; took {}'.format(
            cur_index,
            time.time() - t0))
        html_data = []
Code example #22
def get_all_pages_to_download(db=None):
    if db is None:
        db = db_connections.get_fungrosencrantz_schema('crowdrise')
    q = """SELECT
    CONCAT(t2.loc,
            '/fundraiser/',
            REPLACE(t2.loc,
                'https://www.crowdrise.com/',
                '')) AS loc
FROM
    sitemap AS t1
        JOIN
    sitemap AS t2 ON t1.loc = t2.loc
        AND t1.category = 'users'
        AND t2.category = 'fundraisers'
UNION
SELECT loc FROM sitemap;
    """
    logging.info('Querying database')
    r = db.query(q)
    logging.info('Writing URLs to file')
    with open('pages_to_download.txt', 'w') as f:
        for row in r:
            f.write('{}\n'.format(row['loc']))
Code example #23
def get_main_page_html():
    import splinter
    import os
    import time
    import codecs  # needed for codecs.open below

    ff_profile = os.getcwd() + "\\lib\\quick_firefox"
    browser = splinter.Browser('firefox', profile=ff_profile)

    for i in range(5):
        try:
            browser.visit(
                'https://www.kickstarter.com/discover/advanced?sort=newest')
            break
        except:
            print("Trying to load page again...")

    with codecs.open('output_before.html', 'wb', encoding='utf-8') as f:
        f.write(browser.html)

    num_reloads = 0
    while browser.title.find("We're sorry, but something went wrong") != -1:
        time.sleep(2)
        browser.reload()
        num_reloads += 1
        print("Oh no!")
        if num_reloads > 5: break

    # to make sure we don't load too many urls
    print("Getting projects in the big database")
    project_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    all_project_urls = set(
        [x['url'] for x in project_db.query('select url from project')])
    print("Getting projects in the intermediate database")
    intermediate_db = db_connections.get_intermediate_db()
    urls_to_scrape = set([
        x['url']
        for x in intermediate_db.query('select url from urls_to_scrape')
    ])

    print("Starting download....")
    wait_time = 0
    wait_step = 0.05
    max_wait = 60
    max_tries = 3
    num_tries = 0
    num_hrefs = 0
    last_num_hrefs = 0
    extra_pages = None
    while True:
        orig_html_length = len(browser.html)
        button = browser.find_by_xpath(
            '//div[@class="load_more"]/a[@role="button"]')[-1]
        button.click()
        time.sleep(wait_step)
        while True:
            project_hrefs = browser.find_by_xpath(
                '//h6[@class="project-title"]/a')
            num_hrefs = len(project_hrefs)
            if num_hrefs != last_num_hrefs:
                break
            print("Waiting...")
            time.sleep(wait_step)
            wait_time += wait_step
            if wait_time >= max_wait: break
        if wait_time >= max_wait:
            num_tries += 1
            if num_tries >= max_tries:
                break
        else:
            num_tries = 0
            last_num_hrefs = num_hrefs
        print("{0} projects loaded".format(num_hrefs))
        last_href = project_hrefs[-1]['href']
        last_href = last_href.replace("?ref=newest", "")
        if last_href in all_project_urls or last_href in urls_to_scrape:
            if extra_pages is None:
                extra_pages = 3
            else:
                extra_pages += -1
            if extra_pages == 0:
                break

    page_source = browser.html
    browser.quit()
    return page_source
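The returned page source can be fed straight into an lxml-based parser such as scrape_location from example #7; that pairing is an assumption, since the original call site is not shown.

page_source = get_main_page_html()
scrape_location(page_source)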
Code example #24
import sys

if '../' not in sys.path:
    sys.path.insert(0, '../')
from unused_scripts import db_connections

db1 = db_connections.get_fungrosencrantz_schema('TestDBforScott')
db2 = db_connections.get_fungrosencrantz_schema('Kiva')

for table in db1.tables:
    original_count = next(db1.query(
        'select count(*) as cnt from {}'.format(table)))['cnt']
    new_count = next(db2.query(
        'select count(*) as cnt from {}'.format(table)))['cnt']

    if original_count != new_count:
        print(table, original_count, new_count)
Code example #25
        with open('../{}_desc_stats.latex'.format(outfile_name), 'w') as f:
            f.write(desc_df.to_latex())
        desc_df.to_csv('../{}_desc_stats.csv'.format(outfile_name))

        print(desc_df.to_latex())
        print(desc_df)


if __name__ == '__main__':
    from unused_scripts import db_connections
    import logging

    logging.basicConfig(level=logging.DEBUG)
    os.chdir('kickstarter')
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    '''
    get_descriptive_stats_latex(db=db, table="""
    (select amount_raised, donation_count, IFNULL(team_members, 1) as team_members from (
SELECT
  fundraiser.url,
  fundraiser.total_raised   AS amount_raised,
  IFNULL(donation_count, 0) AS donation_count
FROM fundraiser
  LEFT JOIN (SELECT
               url,
               count(*) AS donation_count
             FROM donation
             GROUP BY url) AS t1
    ON fundraiser.url = t1.url) as t2
Code example #26
def run():
    logging.basicConfig(level=logging.DEBUG)
    os.chdir('kickstarter')
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # get_kiva_stats(db)
    get_kickstarter_stats(db)
Code example #27
def output_graphs(category=None, category_in_title=None):
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    fig = plt.figure(figsize=(8, 6))
    if category is None:
        file_name_prepend = 'plot_all_'
        category_requirement = ' is not null '
    else:
        file_name_prepend = 'plot_{}_'.format(category.replace(' ', '').replace('&', '_and_'))
        category_requirement = ' = "{}"'.format(category)

    query = """
        SELECT
          concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
          sum(new_backers)                                   AS `new_backers`,
          sum(repeat_backers)                                AS `repeat_backers`,
          std(new_backers)                                   AS `new_backers_std`,
          std(repeat_backers)                                AS `repeat_backers_std`
        FROM project JOIN location ON project.location_slug = location.slug
        WHERE location.country = 'US'
          AND new_backers IS NOT NULL AND project.repeat_backers IS NOT NULL
          AND NOT (year(start_date) = year(now()) AND quarter(start_date) = quarter(now()))
          AND category {}
        GROUP BY year(start_date), quarter(start_date);
    """.format(category_requirement)
    df = pd.read_sql(query, db.engine)
    plot_quarterly_stacked_df(fig=fig,
                              df=df,
                              filename=file_name_prepend + 'backers_split.png',
                              xlabel='Starting Quarter',
                              ylabel='Number of Backers',
                              title='Number of Backers by Quarter{}'.format(
                                  '' if category_in_title is None else " ({})  ".format(category_in_title)),
                              x_series=df['quarter'],
                              y_series=df['repeat_backers'],
                              y_series_2=df['new_backers'],
                              y_series_labels=['Repeat Backers', 'New Backers'])

    plot_num_projects_per_day(db=db, fig=fig,
                              file_name_prepend=file_name_prepend,
                              category_requirement=category_requirement,
                              category=category_in_title, )

    query = """
    select amount_pledged, currency, start_date, concat(year(start_date), 'Q', quarter(start_date)) as `quarter`
    from project JOIN location ON project.location_slug = location.slug
    where location.country = 'US'
          AND start_date is not null and amount_pledged is not null and category {}
       AND NOT (year(start_date) = year(now()) and quarter(start_date) = quarter(now()))
    order by rand();
    """.format(category_requirement)

    pledged_df = pd.read_sql(query, db.engine, parse_dates=['start_date'])
    useful_functions.convert_currency(
        pledged_df, currency_column='currency', money_column='amount_pledged', date_column='start_date')
    del pledged_df['currency']
    del pledged_df['start_date']

    df = pledged_df.groupby(by=['quarter']).sum()
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'pledged_total.png',
                      xlabel='Starting Quarter',
                      ylabel='Amount Pledged (USD)',
                      title='Total Amount Pledged by Quarter{}'.format(
                          '' if category_in_title is None else " ({})  ".format(category_in_title)),
                      x_series=df.index,
                      y_series=df['amount_pledged'])

    df = pledged_df.groupby(by=['quarter']).mean()
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'pledged_avg.png',
                      xlabel='Starting Quarter',
                      ylabel='Amount Pledged (USD)',
                      title='Average Amount Pledged per Project by Quarter{}'.format(
                          '' if category_in_title is None else " ({})  ".format(category_in_title)),
                      x_series=df.index,
                      y_series=df['amount_pledged'])

    query = """
    SELECT
      concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
      avg(backer_count)                                  AS `mean_backers`,
      sum(backer_count)                                  AS `total_backers`
    FROM project JOIN location ON project.location_slug = location.slug
    WHERE location.country = 'US'
          AND start_date IS NOT NULL and category {}
       AND NOT (year(start_date) = year(now()) and quarter(start_date) = quarter(now()))
    GROUP BY year(start_date), quarter(start_date)
    """.format(category_requirement)

    df = pd.read_sql(query, db.engine)
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'backers_avg.png',
                      xlabel='Starting Quarter',
                      ylabel='Number of Backers',
                      title='Average Number of Backers per Project by Quarter{}'.format(
                          '' if category_in_title is None else " ({})  ".format(category_in_title)),
                      x_series=df['quarter'],
                      y_series=df['mean_backers'])

    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'backers_total.png',
                      xlabel='Starting Quarter',
                      ylabel='Number of Backers',
                      title='Total Number of Backers by Quarter{}'.format(
                          '' if category_in_title is None else " ({})  ".format(category_in_title)),
                      x_series=df['quarter'],
                      y_series=df['total_backers'])

    query = """
    SELECT
      concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
      count(*) `count`
    FROM project JOIN location ON project.location_slug = location.slug
    WHERE location.country = 'US'
          AND start_date is not null and category {}
       AND NOT (year(start_date) = year(now()) and quarter(start_date) = quarter(now()))
    GROUP BY year(start_date), quarter(start_date)
    """.format(category_requirement)
    df = pd.read_sql(query, db.engine)
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'projects_total_quarterly.png',
                      xlabel='Starting Quarter',
                      ylabel='Number of Projects',
                      title='Number of Projects Started Each Quarter{}'.format(
                          '' if category_in_title is None else " ({})  ".format(category_in_title)),
                      x_series=df['quarter'],
                      y_series=df['count'])
Code example #28
def scrape_new_html(limit=20, url_comment_id=dict(), test_url=None):
    theta_conn = db_connections.get_theta_postgres_db()
    theta_cur = theta_conn.cursor()
    theta_cur.execute('set search_path = "backend"')
    if test_url is not None:
        theta_cur.execute(
            "select loc, html from html where loc = '{}';".format(test_url))
    else:
        theta_cur.execute("""
            SELECT
              html.loc
            , html
            FROM html
              JOIN sitemap
                ON html.loc = sitemap.loc
            WHERE (last_scrape IS NULL OR lastmod > last_scrape)
                  AND html IS NOT NULL
                  --AND NOT ('fundraisers' = ANY (categories))
                  AND NOT ('static' = ALL (categories) OR html.loc = 'https://www.crowdrise.com')
            limit {};""".format(limit))
    html_data = theta_cur.fetchall()
    if len(html_data) == 0:
        theta_cur.close()
        theta_conn.close()
        return True
    all_data = dict(fundraiser=[],
                    user=[],
                    charity=[],
                    event=[],
                    special_user=[],
                    front_page_redirect=[],
                    user_project=[],
                    charity_event=[],
                    team=[],
                    donation=[])
    scraped_urls = []
    for url, html in html_data:
        scraped_urls.append(url)
        try:
            # root = lxml.html.fromstring(lxml.html.tostring(lxml.html.fromstring(html.encode('latin1'))).decode('utf8'))
            try:
                root = lxml.html.fromstring(
                    html.encode('latin1').decode('utf8'))
            except UnicodeDecodeError:
                logging.warning(
                    'unicode decode error for url "{}"'.format(url))
                theta_conn, theta_cur = keep_theta_conn_alive(
                    theta_conn, theta_cur)
                theta_cur.execute(
                    'insert into html_bad_encoding values (%s) on CONFLICT DO NOTHING ;',
                    [(url, )])
                theta_conn.commit()
                root = lxml.html.fromstring(
                    html.encode('latin1').decode('utf8', errors='ignore'))
            try:
                page_type = CrowdriseScraper.get_page_type(root)
            except NotImplementedError:
                theta_conn, theta_cur = keep_theta_conn_alive(
                    theta_conn, theta_cur)
                theta_cur.executemany(
                    "insert into unknown_page_type values (%s) on CONFLICT DO NOTHING;",
                    [(url, )])
                theta_conn.commit()
                continue
            page_data = CrowdriseScraper.get_crowdrise_data(
                page_type,
                root,
                url,
                latest_comment_id=url_comment_id.get(url))
            if page_data is not None:
                # file_data['file_path'] = cur_file_name
                page_data['url'] = url
                page_data['true_url'] = root.xpath(
                    '//meta[@property="og:url"]')[0].attrib['content'].replace(
                        'https://', '').replace('http://', '')
                page_data['base_true_url'] = None

                # file_data['last_scrape'] = time.gmtime(os.path.getmtime(cur_file_name))

                # handle data that requires its own table - eg the fundraisers each user has
                if 'projects' in page_data.keys():
                    projects = page_data.pop('projects')
                    all_data['user_project'] += [{
                        'username':
                        page_data['username'],
                        'project':
                        'www.crowdrise.com' + x
                    } for x in projects]
                if 'events' in page_data.keys():
                    events = page_data.pop('events')
                    all_data['charity_event'] += [{
                        'charity':
                        page_data['url'],
                        'event':
                        'www.crowdrise.com' + x
                    } for x in events]
                if 'team_members' in page_data.keys():
                    team_members = page_data.pop('team_members')
                    all_data['team'] += team_members

                if 'donations' in page_data.keys():
                    donations = page_data.pop('donations')
                    all_data['donation'] += donations

                all_data[page_type].append(page_data)
        except:
            print('failed on url "{}"'.format(url))
            logging.error('failed on url "{}"'.format(url))
            raise
    all_data['user_project'] = [
        x for x in all_data['user_project']
        if re.match(CROWDRISE_URL_RE, 'https://' + x['project'])
    ]
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    db_connections.multi_table_upload(data=all_data,
                                      db=db,
                                      ensure=True,
                                      process_num=None,
                                      chunk_size=3000)
    scrape_time = time.time()

    # update table with new entries
    db.query('truncate table _recently_updated')
    db.executable.execute(
        'insert ignore into _recently_updated values (%s, %s)',
        [(x, scrape_time) for x in scraped_urls])
    db.executable.execute("""
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        SELECT
          fundraiser.url,
          CASE WHEN fundraiser_url IS NULL # individual fundraiser
            THEN fundraiser.username
          ELSE # team fundraiser
            '' # give team total raised for fundraiser, then use `team` to give individual contributions
          END,
          coalesce(team_total_raised, total_raised),
          NULL,
          _recently_updated.last_scrape_unix,
          'fundraiser'
        FROM fundraiser
          join _recently_updated on _recently_updated.url = fundraiser.url
          LEFT JOIN team ON fundraiser.url = team.fundraiser_url
        GROUP BY fundraiser.url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select fundraiser_url, username, amount_raised, goal, _recently_updated.last_scrape_unix, 'team' from team
        join _recently_updated on _recently_updated.url = team.fundraiser_url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select charity.url, '', money_raised, null, _recently_updated.last_scrape_unix, 'charity' from charity
        join _recently_updated on _recently_updated.url = charity.url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select event.url, '', amount_raised, goal, _recently_updated.last_scrape_unix, 'event' from event
        join _recently_updated on _recently_updated.url = event.url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select user.url, username, money_raised, null, _recently_updated.last_scrape_unix, 'user' from user
        join _recently_updated on _recently_updated.url = user.url;
        """)

    q = """
    update html
    set last_scrape = to_timestamp({})
    where loc in ({});""".format(
        scrape_time, ", ".join(["'" + x + "'" for x in scraped_urls]))
    theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
    theta_cur.execute(q)

    if test_url is None and limit != 0:
        theta_conn.commit()

    theta_cur.close()
    theta_conn.close()
    if len(html_data) < limit or test_url is not None:
        return False
    else:
        return True
Code example #29
def get_aggregate_funding_trend_df(status='successful',
                                   min_data_points=4,
                                   limit=None):
    db = db_connections.get_fungrosencrantz_schema(schema='kickstarter')
    sql = """
SELECT
    projectid,
    funding_trend.amount_pledged,
    project.currency,
    funding_trend.update_count,
    funding_trend.comment_count,
    funding_trend.backer_count,
    DATEDIFF(date_added, start_date) AS `day`,
    goal
FROM
    funding_trend
        JOIN
    project ON funding_trend.projectid = project.id
WHERE
    project.status = "{}"
ORDER BY projectid
{}
    """.format(status, 'limit {}'.format(limit) if limit is not None else "")
    logging.info('Acquiring data')
    complete_dataframe = pd.read_sql(sql, db.executable)

    logging.info('Converting currencies to USD')
    convert_kickstarter_currency_df_to_usd(complete_dataframe)
    complete_dataframe['percent_of_goal'] = np.divide(
        complete_dataframe['amount_pledged'], complete_dataframe['goal']) * 100
    del complete_dataframe['currency']

    # add interpolation on the various columns per project_id
    logging.info('Interpolating...')
    grouped = complete_dataframe.groupby('projectid')
    filled_df = None
    for projectid, sub_df in grouped:
        if 0 not in sub_df['day'].values:
            day_0_df = pd.DataFrame(
                [[projectid, 0, 0, 0, 0, 0, sub_df['goal'].values[0], 0]],
                columns=sub_df.columns)
            sub_df = day_0_df.append(sub_df)
        if len(sub_df) <= min_data_points:
            continue
        interp_functions = dict(
            percent_of_goal=interp1d(sub_df['day'], sub_df['percent_of_goal']),
            amount_pledged=interp1d(sub_df['day'], sub_df['amount_pledged']),
            update_count=interp1d(sub_df['day'], sub_df['update_count']),
            comment_count=interp1d(sub_df['day'], sub_df['comment_count']),
            backer_count=interp1d(sub_df['day'], sub_df['backer_count']))
        data = dict()
        num_days = sub_df['day'].max() + 1
        days_to_evaluate_for = range(num_days)
        data['projectid'] = pd.Series([projectid] * num_days)
        data['amount_pledged'] = pd.Series(
            interp_functions['amount_pledged'](days_to_evaluate_for))
        data['update_count'] = pd.Series([
            int(round(x))
            for x in interp_functions['update_count'](days_to_evaluate_for)
        ])
        data['comment_count'] = pd.Series([
            int(round(x))
            for x in interp_functions['comment_count'](days_to_evaluate_for)
        ])
        data['backer_count'] = pd.Series([
            int(round(x))
            for x in interp_functions['backer_count'](days_to_evaluate_for)
        ])
        data['day'] = pd.Series(days_to_evaluate_for)
        data['goal'] = pd.Series([sub_df['goal'].values[0]] * num_days)
        data['percent_of_goal'] = pd.Series(
            interp_functions['percent_of_goal'](days_to_evaluate_for))
        partial_filled_df = pd.DataFrame.from_dict(data=data)
        if filled_df is None:
            filled_df = partial_filled_df
        else:
            filled_df = filled_df.append(partial_filled_df)

    del filled_df['projectid']
    del filled_df['goal']

    logging.info('Grouping and taking median')
    grouped = filled_df.groupby('day')
    grouped_df = grouped.median()
    grouped_df['count'] = grouped.size()

    return grouped_df
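A hypothetical consumer of the aggregate: plot the median funding trajectory by campaign day (matplotlib is assumed available; the column and index names are as built above).

import matplotlib.pyplot as plt

grouped_df = get_aggregate_funding_trend_df(status='successful')
plt.plot(grouped_df.index, grouped_df['percent_of_goal'])
plt.xlabel('Day of campaign')
plt.ylabel('Median percent of goal reached')
plt.savefig('median_funding_trend.png')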
Code example #30
def run():
    db = db_connections.get_fungrosencrantz_schema('Kiva')
    api = KivaAPI()
    upload_new_loans_and_loan_lenders(db, api, from_file='missing_loan_data.json', update=False)