import datetime
import re
import urllib2

from BeautifulSoup import BeautifulSoup
from sqlalchemy.ext.sqlsoup import SqlSoup

import dbconfig


def scrape_vimeo(query):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcnow()

    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)

    page = urllib2.urlopen(query).read()
    soup = BeautifulSoup(page)
    count_regex = re.compile(r'\+ Browse all ([0-9.]+)K videos')

    # license_codes is expected at module level (the CC license codes
    # to scrape).
    for license_code in license_codes:
        license = 'http://creativecommons.org/licenses/%s/3.0/' % license_code
        count_href = '/creativecommons/' + license_code
        count_text = soup.find('a', 'more', href=count_href).string
        match = count_regex.match(count_text)
        if match:
            # Vimeo notates the number of videos as [N]K, where N may be
            # a floating point number, but the true count is always
            # integral, so multiply first, then convert to an integer.
            real_count = int(float(match.group(1)) * 1000)

            # Build a db row
            row = {}
            row['utc_time_stamp'] = utc_datetime
            row['site'] = 'http://vimeo.com/'
            row['license_uri'] = license
            row['count'] = real_count

            # Insert count into site_specific table
            db.site_specific.insert(**row)
            db.flush()
        else:
            raise Exception("No count found at Vimeo for license: " + license)
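# Every scraper in this collection writes to the same site_specific
# table through SqlSoup. None of the scripts define that table, so the
# following is only a sketch inferred from the row dicts above; the
# real column types and key may differ.
from sqlalchemy import (MetaData, Table, Column, Integer, BigInteger,
                        DateTime, String)

metadata = MetaData()

site_specific = Table(
    'site_specific', metadata,
    Column('id', Integer, primary_key=True),  # assumed surrogate key
    Column('utc_time_stamp', DateTime),       # when the scrape ran
    Column('site', String(255)),              # e.g. 'http://vimeo.com/'
    Column('license_uri', String(255)),       # CC license/mark URI
    Column('count', BigInteger),              # number of works found
)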
import csv
import glob
import os

from sqlalchemy.ext.sqlsoup import SqlSoup

import dbconfig


def main():
    db = SqlSoup(dbconfig.dburl)
    for filename in sorted(glob.glob(FLICKR_DATA_BASE_PATH + '/*.csv')):
        if 'cumulative' in filename:
            continue
        utc_date_time = filename2utc_datetime(filename)
        csv_fd = csv.reader(open(filename))
        print 'Importing', filename,
        for row in csv_fd:
            importable = csv_row2dict(row, utc_date_time)
            db.site_specific.insert(**importable)
        db.flush()
        # Since that worked, rename the file so it is not imported twice.
        os.rename(filename, filename + '.imported')
        print '...done.'
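# The importer above relies on two module helpers that are not shown.
# These are only plausible sketches: the filename layout (YYYY-MM-DD.csv)
# and the CSV column order (license_uri, count) are assumptions, not
# taken from the original module.
import datetime
import os


def filename2utc_datetime(filename):
    # Hypothetical: parse a YYYY-MM-DD date out of the CSV filename.
    stem = os.path.basename(filename).split('.')[0]
    return datetime.datetime.strptime(stem, '%Y-%m-%d')


def csv_row2dict(row, utc_date_time):
    # Hypothetical: each row is (license_uri, count).
    return {'utc_time_stamp': utc_date_time,
            'site': 'http://www.flickr.com/',
            'license_uri': row[0],
            'count': int(row[1])}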
import csv

import sqlalchemy
import sqlalchemy.ext.sqlsoup
from sqlalchemy import create_engine, MetaData
from sqlalchemy.ext.sqlsoup import SqlSoup


class GenericFillData(object):
    def __init__(self):
        self.azienda_destinazione = "latelier"
        self.database = "promogest_db"
        self.port = "5432"
        self.user = "******"
        self.password = "******"
        self.host = "localhost"
        self.file_csv = "aliquota_iva.csv"
        self.mainSchema = "promogest2"
        self.connection()

    def connection(self):
        engine = create_engine('postgres://%s:%s@%s:%s/%s'
                               % (self.user, self.password,
                                  self.host, self.port, self.database),
                               encoding='utf-8',
                               convert_unicode=True)
        engine.echo = True
        meta = MetaData(engine)
        self.pg_db_dest = SqlSoup(meta)
        self.pg_db_dest.schema = self.azienda_destinazione
        self.readFile()

    def readFile(self):
        spamReader = csv.reader(open(self.file_csv),
                                delimiter=';', quotechar='"')
        self.fillDataContact(spamReader)

    def fillDataContact(self, spamReader):
        for row in spamReader:
            _art = self.pg_db_dest.aliquota_iva()
            _art.id = row[0]
            _art.denominazione_breve = row[1]
            _art.denominazione = row[2]
            _art.percentuale = row[3]
            _art.percentuale_detrazione = row[4]
            _art.descrizione_detrazione = row[5]
            _art.id_tipo = row[6]
            sqlalchemy.ext.sqlsoup.Session.add(_art)
            sqlalchemy.ext.sqlsoup.Session.commit()
        self.pg_db_dest.flush()
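# The aliquota_iva.csv layout implied by the column assignments above,
# shown as a sample line. The values are made up for illustration; only
# the field order comes from the code.
#
#   id;denominazione_breve;denominazione;percentuale;percentuale_detrazione;descrizione_detrazione;id_tipo
#   1;"20";"Aliquota 20%";20;0;"";1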
import datetime

from sqlalchemy.ext.sqlsoup import SqlSoup

import dbconfig


def main(infd, unix_time, dry_run=False):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcfromtimestamp(unix_time)

    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)

    # Scrape the results we just wgetted
    license2count = parse(infd)

    # Prepare any remaining DB columns, and write a CSV summary file
    extra_data = {'utc_time_stamp': utc_datetime,
                  'site': 'http://www.flickr.com/'}
    importable = []
    csv_values = {}
    license2flickr = dict((v, k) for k, v in flickr2license.items())
    for key in license2count:
        row = {}
        row.update(extra_data)
        row['count'] = license2count[key]
        row['license_uri'] = key
        importable.append(row)
        csv_values[license2flickr[key]] = row['count']

    if dry_run:
        print importable
        print csv_values
    else:
        # Write data to CSV file (named so it does not shadow the csv
        # module, unlike the original).
        csv_file = open('./data/%s.csv.imported' % utc_datetime.date(), 'w')
        for license, count in csv_values.items():
            csv_file.write('%s,%d\n' % (license, count))
        csv_file.close()

        # Write data to database
        counts = {}
        for row in importable:
            db.site_specific.insert(**row)
            db.flush()
            counts[row['license_uri'].split('/')[4]] = row['count']

        # Sort by license code
        lic_sorted = sorted(counts)
        # Join counts (sorted) with a comma
        cnt_sorted = ','.join(map(str, [counts[key] for key in lic_sorted]))

        # Write csv data to the big historical file. WARNING: this
        # presupposes that license version and jurisdiction never change
        # on Flickr.
        hist = open('./data/counts-historical-Flickr.csv', 'a')
        hist.write(str(utc_datetime.date()) + ',Flickr,2.0,Unported,'
                   + cnt_sorted + '\n')
        hist.close()
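# parse() and flickr2license are defined elsewhere in the module. A
# plausible sketch of the mapping, assuming its keys are Flickr's
# documented numeric license ids and its values are CC 2.0 URIs; the
# real table may differ.
flickr2license = {
    '1': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
    '2': 'http://creativecommons.org/licenses/by-nc/2.0/',
    '3': 'http://creativecommons.org/licenses/by-nc-nd/2.0/',
    '4': 'http://creativecommons.org/licenses/by/2.0/',
    '5': 'http://creativecommons.org/licenses/by-sa/2.0/',
    '6': 'http://creativecommons.org/licenses/by-nd/2.0/',
}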
def main(query):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcnow()

    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)

    # Scrape for PDM count
    count = scrape_europeana_pdm(query)

    # Build a db row
    row = {}
    row['utc_time_stamp'] = utc_datetime
    row['site'] = 'http://europeana.eu/'
    row['license_uri'] = 'http://creativecommons.org/publicdomain/mark/1.0/'
    row['count'] = count

    # Insert count into site_specific table
    db.site_specific.insert(**row)
    db.flush()
def main(license, query):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcnow()

    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)

    # Scrape YouTube for the count under the given license.
    count = scrape_youtube(query)

    # Build a db row
    row = {}
    row['utc_time_stamp'] = utc_datetime
    row['site'] = 'http://www.youtube.com/'
    row['license_uri'] = license
    row['count'] = count

    # Insert count into site_specific table
    db.site_specific.insert(**row)
    db.flush()
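# The Europeana and YouTube entry points above are the same
# timestamp/insert/flush pattern with different constants. A sketch of
# the shared helper they could both call; insert_count is hypothetical,
# not part of the original scripts, and assumes the same module context
# (datetime, SqlSoup, dbconfig) as the scrapers above.
def insert_count(site, license_uri, count):
    db = SqlSoup(dbconfig.dburl)
    db.site_specific.insert(
        utc_time_stamp=datetime.datetime.utcnow(),
        site=site,
        license_uri=license_uri,
        count=count)
    db.flush()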
from datetime import datetime

from sqlalchemy.ext.sqlsoup import SqlSoup

import taskconfig


class ZoomDatabase(object):
    """
    Database access layer for jobs to interact with djangozoom.
    """
    LOG_INFO = "i"
    LOG_WARN = "w"
    LOG_ERROR = "e"
    LOG_STEP_BEGIN = "sb"
    LOG_STEP_END = "se"
    LOG_JOB_BEGIN = "jb"
    LOG_JOB_END = "je"

    VALID_LOG = (LOG_INFO, LOG_WARN, LOG_ERROR, LOG_STEP_BEGIN,
                 LOG_STEP_END, LOG_JOB_BEGIN, LOG_JOB_END)

    def __init__(self, db, job_id):
        self._db = db
        self._soup = SqlSoup(db)
        self._job_id = job_id

    def log(self, message, log_type=LOG_INFO):
        """Log a message against this job to the database for the UI.

        :param message: A string to log
        :param log_type: An ad hoc string denoting one of the log types.
            Symbolic names are available on this class.
        """
        if log_type not in self.VALID_LOG:
            raise ValueError("Invalid log type %r" % log_type)
        self._soup.dz2_log.insert(
            job_id=self._job_id,
            logtype=log_type,
            message=message,
            timestamp=datetime.utcnow())
        self._soup.session.commit()

    def flush(self):
        """Flush any changes sqlsoup has accumulated (write updates to
        the DB)."""
        self._soup.flush()

    def add_bundle(self, bundle_name, code_revision=None):
        """
        Store the app's bundle location (assuming this zoomdb's job has
        an associated project).

        :param bundle_name: The name of the application bundle.
        :param code_revision: The code revision the bundle was created from.
        """
        if code_revision is None:
            code_revision = ""

        CODE_REVISION_FIELD_LENGTH = 512
        if len(code_revision) > CODE_REVISION_FIELD_LENGTH:
            code_revision = (code_revision[0:(CODE_REVISION_FIELD_LENGTH - 3)]
                             + "...")

        bundle = self._soup.dz2_appbundle.insert(
            project_id=self.get_project_id(),
            bundle_name=bundle_name,
            code_revision=code_revision,
            creation_date=datetime.utcnow())
        self._soup.session.commit()
        return bundle

    def get_bundle(self, bundle_id):
        """Retrieve a bundle by database id."""
        return self._soup.dz2_appbundle.filter(
            self._soup.dz2_appbundle.id == bundle_id).one()

    def get_all_bundles(self):
        """Get a list of all app_bundles for the current job's project."""
        return list(self._soup.dz2_appbundle.filter(
            self._soup.dz2_appbundle.project_id == self.get_project_id()))

    def add_worker(self, bundle_id, instance_id, server_ip, server_port):
        """Record a worker added for an application.

        :param bundle_id: The bundle database id that the worker is running.
        :param instance_id: The ec2 instance id of the server.
        :param server_ip: The server ip address.
        :param server_port: The port on the server that the app is
            listening on.
        """
        worker = self._soup.dz2_appserverdeployment.insert(
            project_id=self.get_project_id(),
            bundle_id=bundle_id,
            server_instance_id=instance_id,
            server_ip=server_ip,
            server_port=server_port,
            creation_date=datetime.utcnow())
        self._soup.session.commit()
        return worker

    def search_workers(self, bundle_ids=None, active=True):
        """
        Get workers in this job's project matching the supplied criteria.

        :param bundle_ids: Bundle DB ids to match; if None, any bundles
            in the app are included.
        :param active: If True (the default), limit results to currently
            active workers.
        """
        asd = self._soup.dz2_appserverdeployment
        #bundle = self._soup.dz2_appbundle
        qs = asd.filter(asd.project_id == self.get_project_id())
        # # same thing, but spelled out entirely explicitly,
        # # including the association table:
        # session.query(Article).join(
        #     (article_keywords,
        #      Articles.id == article_keywords.c.article_id),
        #     (Keyword, Keyword.id == article_keywords.c.keyword_id)
        # )
        # if bundle_names:
        #     qs = qs.join((bundle,
        #                   asd.bundle_id == bundle.id)).filter(
        #         bundle.bundle_name._in(bundle_names))
        # SR 3/4/11: for some reason the above doesn't work. F**k it,
        # I'll filter in Python. Sorry.
        if active is not None:
            if active:
                qs = qs.filter(asd.deactivation_date == None)
            else:
                qs = qs.filter(asd.deactivation_date != None)
        result = list(qs)
        if bundle_ids:
            result = filter(lambda b: b.bundle_id in bundle_ids, result)
        return result

    def search_allproject_workers(self, active=True, ip_address=None):
        """
        Get workers from any project, not just my get_project_id()
        project, matching the provided criteria.
        """
        asd = self._soup.dz2_appserverdeployment
        qs = asd
        if active is not None:
            if active:
                qs = qs.filter(asd.deactivation_date == None)
            else:
                qs = qs.filter(asd.deactivation_date != None)
        if ip_address is not None:
            qs = qs.filter(asd.server_ip == ip_address)
        return list(qs)

    def get_project_workers(self):
        """Get all AppServerDeployments for this job's project.

        Note that this includes inactive deployments."""
        return list(self._soup.dz2_appserverdeployment.filter(
            self._soup.dz2_appserverdeployment.project_id ==
            self.get_project_id()))

    def get_project_worker_by_id(self, deployment_id):
        """Get an AppServerDeployment record matching the given id."""
        return self._soup.dz2_appserverdeployment.filter(
            self._soup.dz2_appserverdeployment.project_id ==
            self.get_project_id()).filter(
                self._soup.dz2_appserverdeployment.id == deployment_id).one()

    def get_job(self):
        """Get the Job row."""
        if not hasattr(self, "_job"):
            self._job = self._soup.dz2_job.filter(
                self._soup.dz2_job.id == self._job_id).one()
        return self._job

    def get_job_id(self):
        """Get my job ID."""
        return self._job_id

    def get_project(self):
        """Get the project row."""
        if not hasattr(self, "_project"):
            self._project = self._soup.dz2_project.filter(
                self._soup.dz2_project.id == self.get_project_id()).one()
        return self._project

    def get_project_id(self):
        """Get the project_id associated with this zoomdb's job."""
        return self.get_job().project_id

    def add_config_guess(self, field, value, is_primary, basis):
        """Add a config guess for the user to review/confirm."""
        self._soup.dz2_configguess.insert(
            project_id=self.get_project_id(),
            field=field,
            value=value,
            is_primary=is_primary,
            basis=basis)
        self._soup.session.commit()

    def get_project_virtual_hosts(self):
        """Get virtual hostname strings for this project.

        Always includes at least the one canonical vhost name."""
        project = self.get_project()
        canonical_vhost_name = \
            taskconfig.CANONICAL_VIRTUAL_HOST_FORMAT % (
                taskconfig.PROJECT_SYSID_FORMAT % project.id)

        result = [canonical_vhost_name]

        if project.hostname_slug:
            result.append("%s.%s" % (project.hostname_slug,
                                     taskconfig.CUSTOMER_DNS_ROOT_DOMAIN))

        # Query the DB for matching VirtualHostname records
        vhosts = self._soup.dz2_virtualhostname.filter(
            self._soup.dz2_virtualhostname.project_id == project.id)

        for vh in vhosts:
            if vh.is_wildcard:
                result.append("*.%s" % vh.hostname)
            result.append(vh.hostname)

        return result

    def mark_postgis_enabled(self):
        project = self.get_project()
        project.database_type = "postgresql-gis"
        self._soup.session.commit()
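# A minimal usage sketch for ZoomDatabase, assuming a SqlSoup-compatible
# database URL and an existing dz2_job row; the URL and job id are made
# up for illustration.
zoomdb = ZoomDatabase("postgres://user:pass@localhost/djangozoom", job_id=42)
zoomdb.log("Starting deployment", ZoomDatabase.LOG_JOB_BEGIN)
for worker in zoomdb.search_workers(active=True):
    zoomdb.log("Active worker at %s:%s" % (worker.server_ip,
                                           worker.server_port))
zoomdb.log("Deployment finished", ZoomDatabase.LOG_JOB_END)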
import csv

import sqlalchemy
import sqlalchemy.ext.sqlsoup
from sqlalchemy import create_engine, MetaData
from sqlalchemy.ext.sqlsoup import SqlSoup


class GenericFillData(object):
    def __init__(self):
        self.azienda_destinazione = "elisir"
        self.database = "promogest_db"
        self.port = "5432"
        self.user = "******"
        self.password = "******"
        self.host = "localhost"
        self.file_csv = "coppola_cli.csv"
        self.mainSchema = "promogest2"
        self.connection()

    def connection(self):
        engine = create_engine('postgres://%s:%s@%s:%s/%s'
                               % (self.user, self.password,
                                  self.host, self.port, self.database),
                               encoding='utf-8',
                               convert_unicode=True)
        engine.echo = True
        meta = MetaData(engine)
        self.pg_db_dest = SqlSoup(meta)
        self.pg_db_dest.schema = self.azienda_destinazione
        self.readFile()

    def readFile(self):
        spamReader = csv.reader(open(self.file_csv),
                                delimiter=';', quotechar='"')
        self.fillDataContact(spamReader)

    def fillDataContact(self, spamReader):
        for row in spamReader:
            _contatti = self.pg_db_dest.contatto()
            nomecognome = self.name(row[1])
            _contatti.tipo_contatto = "generico"
            _contatti.nome = nomecognome[1]
            _contatti.cognome = nomecognome[0]
            sqlalchemy.ext.sqlsoup.Session.add(_contatti)
            # Commit now so _contatti.id is populated for the recapiti.
            sqlalchemy.ext.sqlsoup.Session.commit()

            # (column index, recapito type, skip when the field is empty)
            recapito_fields = [
                (3, "Indirizzo", False),
                (4, "CAP", False),
                (5, "Citta'", False),
                (6, "Provincia", False),
                (7, "Telefono", True),
                (8, "Telefono", True),
                (9, "Fax", True),
                (10, "Info", True),
                (11, "Info", True),
            ]
            for index, tipo, skip_if_empty in recapito_fields:
                if skip_if_empty and row[index] == '':
                    continue
                _recapiti = self.pg_db_dest.recapito()
                _recapiti.recapito = row[index]
                _recapiti.tipo_recapito = tipo
                _recapiti.id_contatto = _contatti.id
                sqlalchemy.ext.sqlsoup.Session.add(_recapiti)

            sqlalchemy.ext.sqlsoup.Session.commit()
        self.pg_db_dest.flush()

    def name(self, data):
        rowlist = data.split(" ")
        if len(rowlist) == 4:
            cognome = rowlist[0] + " " + rowlist[1]
            nome = rowlist[2] + " " + rowlist[3]
        elif len(rowlist) == 3:
            cognome = rowlist[0] + " " + rowlist[1]
            nome = rowlist[2]
        elif len(rowlist) == 2:
            cognome = rowlist[0]
            nome = rowlist[1]
        elif len(rowlist) == 1:
            cognome = rowlist[0]
            nome = None
        else:
            # More than four tokens: keep the whole string as the surname.
            cognome = " ".join(rowlist)
            nome = None
        return (cognome, nome)
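# How the name() heuristic splits a full name into (cognome, nome) for
# the cases it handles; the example names are made up.
#
#   name("Rossi Mario")          -> ("Rossi", "Mario")
#   name("De Luca Mario")        -> ("De Luca", "Mario")
#   name("De Luca Gian Marco")   -> ("De Luca", "Gian Marco")
#   name("Rossi")                -> ("Rossi", None)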