def scrape_vimeo(query):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcnow()
    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)

    page = urllib2.urlopen(query).read()
    soup = BeautifulSoup(page)
    count_regex = re.compile(r'\+ Browse all ([0-9.]+)K videos')

    for license_code in license_codes:
        license = 'http://creativecommons.org/licenses/%s/3.0/' % license_code
        count_href = '/creativecommons/' + license_code
        count_text = soup.find('a', 'more', href=count_href).string
        match = count_regex.match(count_text)
        if match:
            count = match.group(1)
            # Vimeo notates the number of videos as [N]K, where N may be a
            # floating point number, but the true count is always an integer,
            # so multiply by 1000 and convert to an integer.
            real_count = int(float(count) * 1000)
            # Build a db row
            row = {}
            row['utc_time_stamp'] = utc_datetime
            row['site'] = 'http://vimeo.com/'
            row['license_uri'] = license
            row['count'] = real_count
            # Insert the count into the site_specific table
            db.site_specific.insert(**row)
            db.flush()
        else:
            raise Exception("No count found at Vimeo for license: " + license)
def main():
    db = SqlSoup(dbconfig.dburl)
    for filename in sorted(glob.glob(FLICKR_DATA_BASE_PATH + '/*.csv')):
        if 'cumulative' in filename:
            continue
        utc_date_time = filename2utc_datetime(filename)
        csv_fd = csv.reader(open(filename))
        print 'Importing', filename,
        for row in csv_fd:
            importable = csv_row2dict(row, utc_date_time)
            db.site_specific.insert(**importable)
            db.flush()
        # Since the import succeeded, mark the file as imported by renaming it
        os.rename(filename, filename + '.imported')
        print '...done.'
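# filename2utc_datetime() is used above but not shown. A plausible sketch,
# assuming filenames such as '/path/to/2011-03-04.csv' (the naming scheme is
# an assumption, not confirmed by this source):
def filename2utc_datetime_sketch(filename):
    import datetime
    import os
    date_part = os.path.basename(filename).split('.')[0]  # e.g. '2011-03-04'
    return datetime.datetime.strptime(date_part, '%Y-%m-%d')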
class GenericFillData(object):
    def __init__(self):
        self.azienda_destinazione = "latelier"
        self.database = "promogest_db"
        self.port = "5432"
        self.user = "******"
        self.password = "******"
        self.host = "localhost"
        self.file_csv = "aliquota_iva.csv"
        self.mainSchema = "promogest2"
        self.connection()

    def connection(self):
        engine = create_engine('postgres://%s:%s@%s:%s/%s'
                               % (self.user, self.password, self.host,
                                  self.port, self.database),
                               encoding='utf-8',
                               convert_unicode=True)

        engine.echo = True
        meta = MetaData(engine)
        self.pg_db_dest = SqlSoup(meta)
        self.pg_db_dest.schema = self.azienda_destinazione
        self.readFile()
    
    def readFile(self):
        spamReader = csv.reader(open(self.file_csv), delimiter=';', quotechar='"')
        self.fillDataContact(spamReader)
    
    def fillDataContact(self, spamReader):
        for row in spamReader:
            _art = self.pg_db_dest.aliquota_iva()
            _art.id = row[0]
            _art.denominazione_breve = row[1]
            _art.denominazione = row[2]
            _art.percentuale = row[3]
            _art.percentuale_detrazione = row[4]
            _art.descrizione_detrazione = row[5]
            _art.id_tipo = row[6]

            sqlalchemy.ext.sqlsoup.Session.add(_art)
            sqlalchemy.ext.sqlsoup.Session.commit()

        self.pg_db_dest.flush()
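# For reference, fillDataContact() above expects a semicolon-delimited CSV
# whose columns map positionally onto aliquota_iva; the sample line below is
# made up for illustration:
#
#   id;denominazione_breve;denominazione;percentuale;percentuale_detrazione;descrizione_detrazione;id_tipo
#   1;22;IVA 22%;22;0;;1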
def main(infd, unix_time, dry_run = False):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcfromtimestamp(unix_time)
    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)
    # Scrape the results we just wgetted
    license2count = parse(infd)
    # Prepare any remaining DB columns, and write a CSV summary file
    extra_data = {
        'utc_time_stamp': utc_datetime,
        'site': 'http://www.flickr.com/'}
    importable = []
    csv_values = {}
    license2flickr = dict((v,k) for k,v in flickr2license.items())
    for key in license2count:
        row = {}
        row.update(extra_data)
        row['count'] = license2count[key]
        row['license_uri'] = key
        importable.append(row)
        csv_values[license2flickr[key]] = row['count']
    if dry_run:
        print importable
        print csv_values
    else:
        # Write data to a CSV file (named so as not to shadow the csv module)
        csv_file = open('./data/%s.csv.imported' % utc_datetime.date(), 'w')
        for license, count in csv_values.items():
            csv_file.write('%s,%d\n' % (license, count))
        csv_file.close()
        # Write data to database
        counts = {}
        for row in importable:
            db.site_specific.insert(**row)
            db.flush()
            # The license code is the fifth path segment of the URI,
            # e.g. 'by-sa' in http://creativecommons.org/licenses/by-sa/3.0/
            counts[row['license_uri'].split('/')[4]] = row['count']

        # Sort by license code
        lic_sorted = sorted(counts)
        # Join counts (sorted) with a comma
        cnt_sorted = ','.join(map(str, [counts[key] for key in lic_sorted]))
        # Write csv data to big historical file.  WARNING: this presupposes
        # that license version and jurisdiction never change on Flickr
        hist = open('./data/counts-historical-Flickr.csv', 'a')
        hist.write(str(utc_datetime.date()) + ',Flickr,2.0,Unported,' + cnt_sorted + '\n')
        hist.close()
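# Each line appended to counts-historical-Flickr.csv has the form
# date,Flickr,2.0,Unported followed by the counts sorted by license code
# (the codes and counts below are illustrative only):
#
#   2011-03-04,Flickr,2.0,Unported,100,200,300,400,500,600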
def main(query):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcnow()
    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)
    # Scrape for PDM count
    count = scrape_europeana_pdm(query)

    # Build a db row
    row = {}
    row['utc_time_stamp'] = utc_datetime
    row['site'] = 'http://europeana.eu/'
    row['license_uri'] = 'http://creativecommons.org/publicdomain/mark/1.0/'
    row['count'] = count

    # Insert count into site_specific table
    db.site_specific.insert(**row)
    db.flush()
def main(license, query):
    # Extract a UTC datetime object
    utc_datetime = datetime.datetime.utcnow()
    # Connect to the DB; if we can't, this will fail anyway.
    db = SqlSoup(dbconfig.dburl)
    # Scrape YouTube for the count (CC BY or all videos, depending on the query).
    count = scrape_youtube(query)

    # Build a db row
    row = {}
    row['utc_time_stamp'] = utc_datetime
    row['site'] = 'http://www.youtube.com/'
    row['license_uri'] = license
    row['count'] = count

    # Insert count into site_specific table
    db.site_specific.insert(**row)
    db.flush()
class ZoomDatabase(object):
    """
    Database access layer for jobs to interact with djangozoom.
    """
    LOG_INFO = "i"
    LOG_WARN = "w"
    LOG_ERROR = "e"
    LOG_STEP_BEGIN = "sb"
    LOG_STEP_END = "se"
    LOG_JOB_BEGIN = "jb"
    LOG_JOB_END = "je"

    VALID_LOG = (LOG_INFO, LOG_WARN, LOG_ERROR, LOG_STEP_BEGIN, LOG_STEP_END,
                 LOG_JOB_BEGIN, LOG_JOB_END)

    def __init__(self, db, job_id):
        self._db = db
        self._soup = SqlSoup(db)
        self._job_id = job_id

    def log(self, message, log_type=LOG_INFO):
        """Log a message against this job to the database for the UI.

        :param message: A string to log
        :param log_type: An ad hoc string denoting one of the log types.
                         Symbolic names are available on this class.
        """
        if log_type not in self.VALID_LOG:
            raise ValueError("Invalid log type %r" % log_type)

        self._soup.dz2_log.insert(
            job_id=self._job_id,
            logtype=log_type,
            message=message,
            timestamp=datetime.utcnow())
        self._soup.session.commit()

    def flush(self):
        """
        Flush any changes sqlsoup has accumulated (write updates to the DB).
        """
        self._soup.flush()

    def add_bundle(self, bundle_name, code_revision=None):
        """
        Store the app's bundle location (assuming this zoomdb's job has an
        associated project).

        :param bundle_name: The name of the application bundle.
        :param code_revision: The code revision the bundle was created from.
        """

        if code_revision is None:
            code_revision = ""

        CODE_REVISION_FIELD_LENGTH = 512
        if len(code_revision) > CODE_REVISION_FIELD_LENGTH:
            code_revision = (code_revision[0:(CODE_REVISION_FIELD_LENGTH - 3)]
                             + "...")

        bundle = self._soup.dz2_appbundle.insert(
            project_id=self.get_project_id(),
            bundle_name=bundle_name,
            code_revision=code_revision,
            creation_date=datetime.utcnow())
        self._soup.session.commit()
        return bundle

    def get_bundle(self, bundle_id):
        """Retrieve a bundle by database id.
        """
        return self._soup.dz2_appbundle.filter(
            self._soup.dz2_appbundle.id == bundle_id).one()

    def get_all_bundles(self):
        """Get a list of all app_bundles for the current job's project."""
        return list(self._soup.dz2_appbundle.filter(
                self._soup.dz2_appbundle.project_id == self.get_project_id()))

    def add_worker(
        self, bundle_id, instance_id, server_ip, server_port):
        """Record a worker added for an application.

        :param app_db_id: Application/Project database id
        :param bundle_id: The bundle database id, that the worker is running.
        :param instance_id: The ec2 instance id of the server.
        :param server_ip: The server ip address.
        :param server_port: The port on the server, the app is listening on.
        """
        worker = self._soup.dz2_appserverdeployment.insert(
            project_id=self.get_project_id(),
            bundle_id=bundle_id,
            server_instance_id=instance_id,
            server_ip=server_ip,
            server_port=server_port,
            creation_date=datetime.utcnow())
        self._soup.session.commit()
        return worker

    def search_workers(self, bundle_ids=None, active=True):
        """
        Get workers in this job's project matching the supplied criteria.

        :param bundle_ids: Bundle DB ids to match; if None, any bundle in
                           the project is included.
        :param active: If True (the default), limit results to currently
                       active workers.
        """
        asd = self._soup.dz2_appserverdeployment
        #bundle = self._soup.dz2_appbundle
        qs = asd.filter(asd.project_id == self.get_project_id())

        # if bundle_names:
        #     qs = qs.join((bundle,
        #                   asd.bundle_id == bundle.id)).filter(
        #         bundle.bundle_name.in_(bundle_names))
        # SR 3/4/11: for some reason the above doesn't work, so bundle
        # filtering is done in Python below instead.

        if active is not None:
            if active:
                qs = qs.filter(asd.deactivation_date == None)
            else:
                qs = qs.filter(asd.deactivation_date != None)

        result = list(qs)

        if bundle_ids:
            result = filter(lambda b: b.bundle_id in bundle_ids, result)

        return result

    def search_allproject_workers(self, active=True, ip_address=None):
        """
        Get workers from any project, not just my get_project_id() project,
        matching the provided criteria.
        """
        asd = self._soup.dz2_appserverdeployment

        qs = asd

        if active is not None:
            if active:
                qs = qs.filter(asd.deactivation_date == None)
            else:
                qs = qs.filter(asd.deactivation_date != None)

        if ip_address is not None:
            qs = qs.filter(asd.server_ip == ip_address)

        return list(qs)

    def get_project_workers(self):
        """Get all AppServerDeployments for this job's project.
        Note that this includes inactive deployments."""
        return list(self._soup.dz2_appserverdeployment.filter(
                self._soup.dz2_appserverdeployment.project_id ==
                self.get_project_id()))

    def get_project_worker_by_id(self, deployment_id):
        """Get an AppServerDeployment record matching the given id."""
        return self._soup.dz2_appserverdeployment.filter(
            self._soup.dz2_appserverdeployment.project_id ==
            self.get_project_id()).filter(
            self._soup.dz2_appserverdeployment.id ==
            deployment_id).one()

    def get_job(self):
        """Get the Job row."""
        if not hasattr(self, "_job"):
            self._job = self._soup.dz2_job.filter(
                self._soup.dz2_job.id == self._job_id).one()
        return self._job

    def get_job_id(self):
        """Get my job ID."""
        return self._job_id

    def get_project(self):
        """Get the project row."""
        if not hasattr(self, "_project"):
            self._project = self._soup.dz2_project.filter(
                self._soup.dz2_project.id == self.get_project_id()).one()
        return self._project

    def get_project_id(self):
        """Get the project_id associated with this zoomdb's job."""
        return self.get_job().project_id

    def add_config_guess(self, field, value, is_primary, basis):
        """Add a config guess for the user to review/confirm."""
        self._soup.dz2_configguess.insert(
            project_id=self.get_project_id(),
            field=field,
            value=value,
            is_primary=is_primary,
            basis=basis)
        self._soup.session.commit()

    def get_project_virtual_hosts(self):
        """Get virtual hostname strings for this project. Always includes at
        least the one canonical vhost name."""
        project = self.get_project()

        canonical_vhost_name = \
            taskconfig.CANONICAL_VIRTUAL_HOST_FORMAT % (
            taskconfig.PROJECT_SYSID_FORMAT % project.id)

        result = [canonical_vhost_name]

        if project.hostname_slug:
            result.append("%s.%s" % (project.hostname_slug,
                                     taskconfig.CUSTOMER_DNS_ROOT_DOMAIN))

        # query the DB for matching VirtualHostname records
        vhosts = self._soup.dz2_virtualhostname.filter(
            self._soup.dz2_virtualhostname.project_id ==
            project.id)

        for vh in vhosts:
            if vh.is_wildcard:
                result.append("*.%s" % vh.hostname)
            result.append(vh.hostname)

        return result

    def mark_postgis_enabled(self):
        project = self.get_project()
        project.database_type = "postgresql-gis"
        self._soup.session.commit()
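# A minimal usage sketch for ZoomDatabase. The connection URL and job id are
# placeholders (assumptions, not from the original source), and it only runs
# against a real djangozoom database:
def _demo_zoomdb():
    zoomdb = ZoomDatabase("postgresql://user:password@localhost/djangozoom",
                          job_id=42)
    zoomdb.log("Job starting", ZoomDatabase.LOG_JOB_BEGIN)
    for worker in zoomdb.search_workers(active=True):
        zoomdb.log("Active worker at %s:%s" % (worker.server_ip,
                                               worker.server_port))
    zoomdb.log("Job finished", ZoomDatabase.LOG_JOB_END)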
class GenericFillData(object):
    def __init__(self):
        self.azienda_destinazione = "elisir"
        self.database = "promogest_db"
        self.port = "5432"
        self.user = "******"
        self.password = "******"
        self.host = "localhost"
        self.file_csv = "coppola_cli.csv"
        self.mainSchema = "promogest2"
        self.connection()

    def connection(self):
        engine = create_engine('postgres://%s:%s@%s:%s/%s'
                               % (self.user, self.password, self.host,
                                  self.port, self.database),
                               encoding='utf-8',
                               convert_unicode=True)

        engine.echo = True
        meta = MetaData(engine)
        self.pg_db_dest = SqlSoup(meta)
        self.pg_db_dest.schema = self.azienda_destinazione
        self.readFile()
    
    def readFile(self):
        spamReader = csv.reader(open(self.file_csv), delimiter=';', quotechar='"')
        self.fillDataContact(spamReader)
    
    def fillDataContact(self, spamReader):
        # (csv column index, tipo_recapito, skip if the column is empty)
        recapito_specs = [
            (3, "Indirizzo", False),
            (4, "CAP", False),
            (5, "Citta'", False),
            (6, "Provincia", False),
            (7, "Telefono", True),
            (8, "Telefono", True),
            (9, "Fax", True),
            (10, "Info", True),
            (11, "Info", True),
        ]
        for row in spamReader:
            _contatti = self.pg_db_dest.contatto()
            cognome, nome = self.name(row[1])
            _contatti.tipo_contatto = "generico"
            _contatti.nome = nome
            _contatti.cognome = cognome

            sqlalchemy.ext.sqlsoup.Session.add(_contatti)
            sqlalchemy.ext.sqlsoup.Session.commit()

            # One recapito row per non-empty column, per the specs above
            for index, tipo, skip_if_empty in recapito_specs:
                if skip_if_empty and row[index] == '':
                    continue
                _recapiti = self.pg_db_dest.recapito()
                _recapiti.recapito = row[index]
                _recapiti.tipo_recapito = tipo
                _recapiti.id_contatto = _contatti.id
                sqlalchemy.ext.sqlsoup.Session.add(_recapiti)

            sqlalchemy.ext.sqlsoup.Session.commit()
            self.pg_db_dest.flush()

    def name(self, data):
        """Split a full name string into a (cognome, nome) tuple."""
        rowlist = data.split(" ")
        if len(rowlist) == 4:
            cognome = rowlist[0] + " " + rowlist[1]
            nome = rowlist[2] + " " + rowlist[3]
        elif len(rowlist) == 3:
            cognome = rowlist[0] + " " + rowlist[1]
            nome = rowlist[2]
        elif len(rowlist) == 2:
            cognome = rowlist[0]
            nome = rowlist[1]
        elif len(rowlist) == 1:
            cognome = rowlist[0]
            nome = None
        else:
            # More than four words: keep the whole string as cognome
            cognome = data
            nome = None
        return (cognome, nome)
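# Illustrative expectations for GenericFillData.name(); the sample names are
# made up:
#
#   name("Rossi Mario")          -> ("Rossi", "Mario")
#   name("De Rossi Mario")       -> ("De Rossi", "Mario")
#   name("De Rossi Gian Maria")  -> ("De Rossi", "Gian Maria")
#   name("Rossi")                -> ("Rossi", None)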