def harvest_citations(dois=None):
    # dois may be passed as a comma-separated string; None means harvest all
    if dois is not None:
        dois = dois.split(',')
    db = db_connect()
    get_citations(db, dois)
Example #2
def fetch_trips(args):
	"""Calculates the OD flows from all o cells to all d cells for a given time-slice
	hour using timedist for start time weighting.
	
	Args:
		args: a 3-tuple o, d, where o is a list of the origin cell ids and
		d a list of destination cell ids
		10min interval of the day
	Returns:
		A list of trips 
	"""

	o, d = args #arguments are passed as tuple due to pool.map() limitations
	start = time.time()

	conn = util.db_connect()
	cur = conn.cursor()

	#fetch all trips
	with open("SQL/03_Scaling_OD/fetch_trips.sql", 'r') as sql_file:
		cur.execute(sql_file.read(),
					{	"weekdays": WEEKDAYS,
						"speed": SPEED,
						"orig_cells": o,
						"dest_cells": d
					}
					)

	result = []
	for origin, destination, trip_scale_factor, start_time, end_time, start_interval_end in cur.fetchall():
		result.append(((origin,destination), (trip_scale_factor, start_time, end_time, start_interval_end)))

	conn.close()
	return result
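A minimal sketch of how fetch_trips might be fanned out over a worker pool (hypothetical: cell_chunks is a list of lists of cell ids; each (o, d) pair is packed into a single tuple because pool.map() passes one argument per call):

from multiprocessing import Pool

# Build one task per pair of origin/destination chunks.
args = [(o, d) for o in cell_chunks for d in cell_chunks]

pool = Pool()
results = pool.map(fetch_trips, args)
pool.close()
pool.join()

# Flatten the per-task lists into one list of ((origin, destination), values) tuples.
trips = [item for chunk in results for item in chunk]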
def harvest_all():
    """Harvest commits, citations, mentions, metadata"""
    db = db_connect()
    dois = None
    get_commits()
    get_citations(db, dois)
    get_mentions(since_version=None)
    list_records(dois)
def build_df(adm_level, iso, extent_year):

    print('starting loss for adm level {}'.format(adm_level))

    conn = util.db_connect()

    field_list = util.level_lkp(adm_level)
    field_text = ', '.join(field_list)

    sql = 'SELECT {}, thresh, year, sum(area) as area FROM loss '.format(field_text)

    if iso:
        sql += "WHERE iso = '{}' ".format(iso)

    sql += 'GROUP BY {}, thresh, year'.format(field_text)

    df = pd.read_sql(sql, conn)
    df = util.add_lookup(df, adm_level, conn)

    # Create expression to come up with a combined field name
    # if iso, just Country, if adm1, Country_Adm1_Name, etc
    df['Country_Index'] = eval(util.country_text_lookup(adm_level))
    
    # rename the 'year' column to the name that we'll need for our summary output table
    column_name_dict = {0: 'Country', 1: 'Country_Subnat1', 2: 'Country_Subnat1_Subnat2'}
    summary_col_name = column_name_dict[adm_level]
    
    df.rename(columns = {'year': summary_col_name}, inplace=True)
    
    is_first = True

    for thresh in [10, 15, 20, 25, 30, 50, 75]:
        df_subset = df[df.thresh == thresh].copy()       
        df_subset['All areas are in hectares'] = 'TREE COVER LOSS (>{}% CANOPY COVER)'.format(thresh)
 
        df_pivot = df_subset.pivot_table(index=['Country_Index', 'All areas are in hectares'], columns=summary_col_name, values='area')
        df_pivot['TOTAL 2001-2017'] = df_pivot.sum(axis=1)
        
        df_pivot = df_pivot.unstack('All areas are in hectares')
        df_pivot = df_pivot.swaplevel(0,1, axis=1)
        del df_pivot.index.name

        if is_first:
            output_df = df_pivot.copy()
            is_first = False

        else:
            output_df = pd.concat([output_df, df_pivot], axis=1, join_axes=[output_df.index])

    sheet_name_dict = {0: 'Country', 1: 'Subnat1', 2: 'Subnat2'}
    sheet_name = 'Loss (2001-2017) by {}'.format(sheet_name_dict[adm_level])

    return sheet_name, output_df
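A sketch of how the returned (sheet_name, output_df) pairs might be written to the summary workbook with pandas (the file name and the loop over admin levels are assumptions, not part of the original code):

import pandas as pd

# Hypothetical driver: one sheet per admin level.
with pd.ExcelWriter('loss_summary.xlsx') as writer:
    for adm_level in [0, 1, 2]:
        sheet_name, df = build_df(adm_level, iso=None, extent_year=None)
        df.to_excel(writer, sheet_name=sheet_name)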
def build_df(adm_level, iso, extent_year):

    print('starting extent{} for adm level {}'.format(extent_year, adm_level))

    field_list = util.level_lkp(adm_level)
    field_text = ', '.join(field_list)

    sql = 'SELECT {}, thresh, sum(area) as area FROM extent{} '.format(field_text, extent_year)

    if iso:
        sql += "WHERE iso = '{}' ".format(iso)

    sql += 'GROUP BY {}, thresh'.format(field_text)

    conn = util.db_connect()
    df = pd.read_sql(sql, conn)
    df = util.add_lookup(df, adm_level, conn)
    
    # remove thresh 0 values
    df = df[df.thresh != 0]

    # Create expression to come up with a combined field name
    # if iso, just Country, if adm1, Country_Adm1_Name, etc
    df['Country_Index'] = eval(util.country_text_lookup(adm_level))

    # Group by Country and thresh, sum area
    df = df.groupby(['Country_Index', 'thresh'])['area'].sum().reset_index()

    # Add larger index for merged column in output excel sheet
    df['All areas are in hectares'] = 'TREE COVER ({}) BY PERCENT CANOPY COVER'.format(extent_year)
    
    # convert int thresh to labeled thresh percent
    df['thresh'] = df.apply(lambda row: '>{}%'.format(str(row['thresh'])), axis=1)
    
    # rename the 'thresh' column to the name that we'll need for our summary output table
    column_name_dict = {0: 'Country', 1: 'Country_Subnat1', 2: 'Country_Subnat1_Subnat2'}
    summary_col_name = column_name_dict[adm_level]
    
    df.rename(columns = {'thresh': summary_col_name}, inplace=True)
    
    # pivot so that each thresh value becomes a column (thresh 0 was already dropped above)
    df_pivot = df.pivot_table(index=['Country_Index', 'All areas are in hectares'], columns=summary_col_name, values='area')
    
    df_pivot = df_pivot.unstack('All areas are in hectares')
    df_pivot = df_pivot.swaplevel(0,1, axis=1)
    del df_pivot.index.name

    sheet_name_dict = {0: 'Country', 1: 'Subnat1', 2: 'Subnat2'}
    sheet_name = 'Extent ({}) by {}'.format(extent_year, sheet_name_dict[adm_level])

    return sheet_name, df_pivot
Example #6
def build_df(adm_level, iso, extent_year):

    print('starting gain for adm level {}'.format(adm_level))

    field_list = util.level_lkp(adm_level)
    field_text = ', '.join(field_list)

    sql = 'SELECT {}, sum(area) as area FROM gain '.format(field_text)

    if iso:
        sql += "WHERE iso = '{}' ".format(iso)

    sql += 'GROUP BY {}'.format(field_text)

    conn = util.db_connect()
    df = pd.read_sql(sql, conn)
    df = util.add_lookup(df, adm_level, conn)

    # raw gain areas appear to be in square meters; convert to hectares for the output spreadsheet
    df['area'] = df.area / 10000

    # Create expression to come up with a combined field name
    # if iso, just Country, if adm1, Country_Adm1_Name, etc
    df['Country_Index'] = eval(util.country_text_lookup(adm_level))

    # group and sum just in case, and to remove additional columns
    df = df.groupby(['Country_Index'])['area'].sum().reset_index()

    # Add larger index for merged column in output excel sheet
    df['All areas are in hectares'] = 'TREE COVER GAIN'
    df['Country'] = '(>50% CANOPY COVER)'

    # Pivot so that these columns (which all have the same values) go to the top of the DF as indices
    df = df.pivot_table(columns=['All areas are in hectares', 'Country'],
                        index='Country_Index')

    # remove extraneous area level
    df.columns = df.columns.droplevel(0)

    # remove the index name level as well
    del df.index.name

    sheet_name_dict = {0: 'Country', 1: 'Subnat1', 2: 'Subnat2'}
    sheet_type = sheet_name_dict[adm_level]

    sheet_name = 'Gain (2001-2012) by {}'.format(sheet_type)

    return sheet_name, df
Example #7
def upload_taz(feature):
	"""Uploads a TAZ polygon to the database.
	Args:
		feature: a geojson feature dict containing a TAZ polygon
	"""

	conn = util.db_connect()
	cur = conn.cursor()

	taz_id, linestr = util.parse_taz(feature)

	sql = "	INSERT INTO taz (taz_id, geom) \
			SELECT %(taz_id)s, ST_SetSRID(ST_MakePolygon(ST_GeomFromText(%(linestr)s)),4326);"
	cur.execute(sql, {"taz_id": taz_id, "linestr": linestr})
	conn.commit()
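A sketch of feeding the uploader from a GeoJSON FeatureCollection (the file name is an assumption):

import json

with open("taz.geojson") as f:
    collection = json.load(f)

# Upload every TAZ polygon feature in the collection.
for feature in collection["features"]:
    upload_taz(feature)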
Example #8
def fetch_timedist():
	"""Fetches the time distribution of trips from the database by counting 
	the number of starting trips in every 10min interval of the day.

	Returns:
		A list of size 24*6 containing the number of trips that start in every 
		10min interval of the day.
	"""

	conn = util.db_connect()
	cur = conn.cursor()

	with open("SQL/03_Scaling_OD/trip_timedist.sql", 'r') as sql_file:
		cur.execute(sql_file.read(),
					{	"weekdays": WEEKDAYS,
						"speed": SPEED,
						"maxinterval": MAX_INTERVAL
					}
					)

	timedist = [0] * (24*6) #trip counts per 10min interval; 0 if no trips start in an interval
	for interval, count in cur.fetchall():
		timedist[int(interval)] = count
	conn.close()
	return timedist
Example #9
def init():
	global conn, cur
	conn = util.db_connect()
	cur = conn.cursor()

# companion test fixture from the same example (appears to belong to a unittest.TestCase)
def setUp(self):
	self.conn = util.db_connect()
	self.cur = self.conn.cursor()
import psycopg2

import util, config  # local modules

util.db_login()
conn = util.db_connect()
cur = conn.cursor()

# create backup copy eant_pos_original that keeps all antennas even when eant_pos is clustered
print("Creating backup table eant_pos_original (takes a while)...")
cur.execute("DROP TABLE IF EXISTS eant_pos_original CASCADE")
conn.commit()
cur.execute("CREATE TABLE eant_pos_original (LIKE eant_pos);")
cur.execute("ALTER TABLE eant_pos_original ADD CONSTRAINT eant_pos_original_pkey PRIMARY KEY (id);")
cur.execute("INSERT INTO eant_pos_original SELECT * FROM eant_pos;")
conn.commit()


# create backup copy ehomebase_original before clustering
print("Creating backup table ehomebase_original (takes a while)...")
cur.execute("DROP TABLE IF EXISTS ehomebase_original CASCADE")
conn.commit()
cur.execute("CREATE TABLE ehomebase_original (LIKE ehomebase);")
cur.execute("ALTER TABLE ehomebase_original ADD CONSTRAINT ehomebase_original_pkey PRIMARY KEY (id);")
cur.execute("INSERT INTO ehomebase_original SELECT * FROM ehomebase;")
conn.commit()
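The two backup blocks above repeat the same pattern; a small helper could fold them together (a sketch, reusing the psycopg2 connection opened above and assuming each source table has an id primary-key column):

def backup_table(conn, source, backup):
    """Create <backup> as a structural copy of <source> and copy all rows into it."""
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS {0} CASCADE".format(backup))
    cur.execute("CREATE TABLE {0} (LIKE {1});".format(backup, source))
    cur.execute("ALTER TABLE {0} ADD CONSTRAINT {0}_pkey PRIMARY KEY (id);".format(backup))
    cur.execute("INSERT INTO {0} SELECT * FROM {1};".format(backup, source))
    conn.commit()

backup_table(conn, "eant_pos", "eant_pos_original")
backup_table(conn, "ehomebase", "ehomebase_original")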
"""
https://developer.github.com/v3/repos/commits/
"""
import requests
import concurrent.futures
import os
import re
from dateutil import parser
import datetime
from util import db_connect
import logging
import pymongo
db = db_connect()

logger = logging.getLogger(__name__)


class APIRateLimitExceeded(Exception):
    pass


class Repo:
    def __init__(self, url):
        self.url = url
        self.name = re.match(r'https?://github.com/(.*?)/?$', url).group(1)
        self.new_commits = None
        try:
            last_commit_date = db.commit.find({'repositoryURL': url}).sort('date', pymongo.DESCENDING)[0]['date']
            # add one second
            self.synced_until = (parser.parse(last_commit_date) + datetime.timedelta(0, 1)).isoformat()[0:-6] + 'Z'
        except IndexError:
            # no commits stored yet for this repository; start from scratch (assumed fallback)
            self.synced_until = None
Example #13
def calculate_od(args):
	"""Calculates the OD flows for one OD-pair and inserts into od table.

	Args:
		args: tuple od, values where od is a tuple of origin and destination cell ids (o,d)
			and values a tuple (trip_scale_factor, start_time, end_time, start_interval_end)
	"""

	global timedist

	od, values = args
	o, d = od
	trip_scale_factors, start_times, end_times, start_interval_ends = [list(x) for x in zip(*values)]

	#calculate time distribution
	no_trips = [0] * 24*6 #10min intervals
	for i in range(0, len(start_times)):
		start_interval_length = start_interval_ends[i] - start_times[i] #uncertainty in start time in minutes
		if start_interval_length < 0:
			#This trip must have been made with a speed higher than
			#50km/h since the computed end time is before the start
			#time. Skipping...
			continue
		elif start_interval_length < MAX_INTERVAL: #only count trips for time dist with precise trip start info
			interval = int(((start_interval_ends[i]/60 - start_interval_length/2) % (24*60))/10) #10min intervals
			no_trips[interval] += 1

	#calculate OD flows
	flows = numpy.array([0.0] * 24)
	for i in range(0, len(start_times)):
		start_interval = int(((start_times[i]/60) % (24*60))/10) #10min intervals
		end_interval = int(((start_interval_ends[i]/60) % (24*60))/10) #10min intervals
		scale_factor = trip_scale_factors[i]
		if scale_factor == 0:
			scale_factor = 1 #no scale factor for this user, count as 1 trip

		weight = [0.0] * 24*6 #weights for the trip in 10min intervals
		weight_function = lambda trips, dist: scale_factor*float(trips)/float(sum(dist[start_interval:end_interval+1])) #scale * trips/total_trips
		if sum(no_trips[start_interval:end_interval+1]) >= OD_SPECIFIC_TIMEDIST_THRESHOLD: #enough time dist for this OD info available
			weight[start_interval:end_interval+1] = [weight_function(trips, no_trips) for trips in no_trips[start_interval:end_interval+1]]
		else: #otherwise use timedist for all trips
			weight[start_interval:end_interval+1] = [weight_function(trips, timedist) for trips in timedist[start_interval:end_interval+1]]
		weight_hours = numpy.array([sum(weight[6*hour:6*hour+6]) for hour in range(0,24)])
		flows += weight_hours #add this (scaled) trip to the od flow

	#Upload OD flows to DB
	data = []
	for interval in range(0,24):
		if flows[interval] > 0:
			data.append((o,d,interval,flows[interval]))

	conn = util.db_connect()
	cur = conn.cursor()
	rows = [cur.mogrify("(%s, %s, %s, %s)", row).decode() for row in data] #decode: mogrify returns bytes under Python 3

	if len(rows) > 0:
		sql = "INSERT INTO od (orig_cell, dest_cell, interval, flow) \
			   VALUES " + ", ".join(rows) + ";"
		cur.execute(sql)
		conn.commit()
	cur.close()

	end = time.time()
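A minimal sketch of how calculate_od might be driven, assuming trips is the flattened output of fetch_trips (see the sketch after that function) and that the pool uses the fork start method so workers see the timedist global:

import collections
from multiprocessing import Pool

timedist = fetch_timedist()              # global start-time distribution used above

grouped = collections.defaultdict(list)  # collect all trip values per OD pair
for od, values in trips:
    grouped[od].append(values)

pool = Pool()                            # forked workers inherit the timedist global
pool.map(calculate_od, grouped.items())  # each item is ((o, d), [values, ...])
pool.close()
pool.join()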
def initiate_connection(username, cluster):
    """ Initiate connection with Redshift cluster

    @param username: master username from replay.yaml
    @param cluster: cluster dictionary
    """

    response = None
    logger = logging.getLogger("SimpleReplayLogger")

    if cluster.get("is_serverless"):
        secret_name = get_secret(cluster.get('secret_name'),
                                 cluster.get("region"))
        response = {
            'DbUser': secret_name["admin_username"],
            'DbPassword': secret_name["admin_password"]
        }
    else:
        rs_client = client('redshift', region_name=cluster.get("region"))
        # get response from redshift to get cluster credentials using provided cluster info
        try:
            response = rs_client.get_cluster_credentials(
                DbUser=username,
                DbName=cluster.get("database"),
                ClusterIdentifier=cluster.get("id"),
                DurationSeconds=900,
                AutoCreate=False,
            )
        except rs_client.exceptions.ClusterNotFoundFault:
            logger.error(
                f"Cluster {cluster.get('id')} not found. Please confirm cluster endpoint, account, and region."
            )
            exit(-1)
        except Exception as e:
            logger.error(
                f"Unable to connect to Redshift. Confirm IAM permissions include Redshift::GetClusterCredentials."
                f" {e}")
            exit(-1)

    if response is None or response.get('DbPassword') is None:
        # bail out: the connection parameters below require valid credentials
        logger.error(f"Failed to retrieve credentials for user {username}")
        exit(-1)

    # define cluster string/dict
    cluster_string = {
        "username": response["DbUser"],
        "password": response["DbPassword"],
        "host": cluster.get("host"),
        "port": cluster.get("port"),
        "database": cluster.get("database"),
    }

    conn = None
    try:
        logger.info(f"Connecting to {cluster.get('id')}")
        conn = db_connect(
            host=cluster_string["host"],
            port=int(cluster_string["port"]),
            username=cluster_string["username"],
            password=cluster_string["password"],
            database=cluster_string["database"])  # yield to reuse connection
        yield conn
    except redshift_connector.error.Error as e:
        logger.error(
            f"Unable to connect to Redshift. Please confirm credentials. {e} ")
        exit(-1)
    except Exception as e:
        logger.error(f"Unable to connect to Redshift. {e}")
        exit(-1)
    finally:
        if conn is not None:
            conn.close()
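Since the function yields the connection, it is presumably consumed as a context manager in the original module; a usage sketch under that assumption (the cluster details below are hypothetical):

from contextlib import contextmanager

db_connection = contextmanager(initiate_connection)

cluster = {
    "is_serverless": False,
    "id": "my-cluster",
    "host": "my-cluster.example.us-east-1.redshift.amazonaws.com",
    "port": 5439,
    "database": "dev",
    "region": "us-east-1",
}

with db_connection("master_user", cluster) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT current_user")
    print(cursor.fetchone())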