def comment_info(comment):
    comment_name = u'comment_{0}'.format(comment['id'])
    if not Entity.by_name(comment_name):
        print("Caching new comment {0}".format(comment_name))
        entity = Entity(comment_name)
        entity[u'body'] = comment['body']
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(comment_name)

def issue_info(issue):
    issue_name = u'issue_{0}'.format(issue['id'])
    if not Entity.by_name(issue_name):
        print("Caching new issue {0}".format(issue_name))
        entity = Entity(issue_name)
        entity[u'title'] = issue['title']
        entity[u'number'] = issue['number']
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(issue_name)

def recent_events(days=0, limit=0):
    DBSession.commit()
    events = DBSession.query(Entity) \
        .filter(Entity.name.startswith('event\_', escape='\\')) \
        .all()
    if days > 0:
        yesterday = datetime.now() - timedelta(days=days)
        events = filter(lambda event: event['created_at'] > yesterday, events)
    events.sort(key=lambda event: event['created_at'], reverse=True)
    if len(events) > limit > 0:
        events = events[:limit]
    return events

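# A minimal usage sketch for recent_events(), assuming the module-level
# DBSession and the event entities cached by event_info() below (so the
# 'created_at' fact is a datetime). The day/limit values are arbitrary.
def print_recent_events():
    # Last week's events, newest first, capped at ten.
    for event in recent_events(days=7, limit=10):
        print event[u'type'], event[u'created_at']
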
def test_model_count(self):
    child = self.entity.children.values()[1]
    child = child.children.values()[0]
    child = child.children.values()[0]
    child = child.children.values()[1]
    table, model = get_mapped_table_model_from_entity(child)
    eq_(int(DBSession.query(model).count()), 417)

def repo_info(repo):
    repo_name = repo.get(
        'full_name', '{0}/{1}'.format(repo['owner']['login'], repo['name']))
    if not Entity.by_name(repo_name):
        print("Caching new repository {0}".format(repo_name))
        entity = Entity(repo_name)
        entity['name'] = repo['full_name']
        # Evidently you cannot set facts to None. (?)
        if not repo['description']:
            entity['description'] = u''
        else:
            entity['description'] = repo['description']
        entity['url'] = repo['html_url']
        entity['owner'] = user_info(repo['owner']).name
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(repo_name)

def user_info(user):
    user_name = u'user_{0}'.format(user['id'])
    if not Entity.by_name(user_name):
        print("Caching new user {0}".format(user_name))
        entity = Entity(user_name)
        entity['login'] = user['login']
        entity['gravatar'] = user['gravatar_id']
        entity['avatar'] = u'http://www.gravatar.com/avatar/{0}?s=200' \
            .format(user['gravatar_id'])
        # Not everyone has set a name for their account.
        if user.get('name'):
            entity[u'name'] = user['name']
        else:
            entity[u'name'] = user['login']
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(user_name)

def event_info(event):
    event_name = u'event_{0}'.format(event['id'])
    if not Entity.by_name(event_name):
        print("Caching new event {0}".format(event_name))
        entity = Entity(event_name)
        entity['name'] = event_name
        entity[u'actor'] = user_info(event['actor']).name
        try:
            entity[u'repo'] = repo_info(event['repo']['name']).name
        except Exception:
            # repo_info expects a full repo dict; fall back to the bare name.
            entity['repo'] = event['repo']['name']
        entity[u'type'] = event['type']
        entity[u'payload'] = event['payload']
        entity[u'created_at'] = datetime.strptime(event['created_at'],
                                                  '%Y-%m-%dT%H:%M:%SZ')
        if 'Comment' in event['type']:
            entity[u'comment'] = comment_info(event['payload']['comment']).name
        if 'Issue' in event['type']:
            entity['issue'] = issue_info(event['payload']['issue']).name
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(event_name)

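# A hypothetical end-to-end sketch of the caching helpers above. The payload
# shape mirrors what the accessors in event_info() and friends expect (id,
# type, actor, repo, payload, created_at); the concrete values are made up.
sample_event = {
    'id': '12345',
    'type': 'IssueCommentEvent',
    'actor': {'id': 1, 'login': 'someuser', 'gravatar_id': 'abc123'},
    'repo': {'name': 'example/repo'},
    'payload': {
        'comment': {'id': 99, 'body': 'Looks good to me.'},
        'issue': {'id': 42, 'title': 'Broken build', 'number': 7},
    },
    'created_at': '2012-01-01T12:00:00Z',
}
cached = event_info(sample_event)  # also caches the actor, comment, and issue
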
def zip_exe_handler(self, entity):
    """ Handles self-extracting zip files """
    self.log.debug("zip_exe_handler(%s)" % entity)
    entity[u'format'] = u'zip'
    dirname = os.path.dirname(entity[u'filename'])
    p = subprocess.Popen('unzip -o "%s"' % entity[u'filename'], shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         cwd=dirname)
    out, err = p.communicate()
    if err:
        self.log.error("Error unzipping: " + err)
    else:
        # Delete compressed data after extracting
        os.unlink(entity[u'filename'])
        for line in out.split('\n'):
            if line.strip().startswith('inflating'):
                extracted = os.path.join(dirname, line.strip().split()[-1])
                self.log.debug("extracted " + extracted)
                magic = utils.get_magic(extracted)
                # Create a new child Entity for each extracted file
                extracted = to_unicode(extracted)
                child = Entity.by_name(extracted)
                if not child:
                    child = Entity(name=os.path.basename(extracted))
                    child[u'filename'] = extracted
                    DBSession.add(child)
                    child.parent = entity
                    child[u'magic'] = to_unicode(magic)
                    self.log.debug("Created %s" % child)
                else:
                    child.parent = entity
                DBSession.flush()
                self.call_magic_handler(extracted, child)

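# For reference, a sketch of the `unzip -o` stdout that the handler above
# parses; Info-ZIP emits one "inflating:" line per extracted member (the
# exact spacing here is an assumption, not captured output).
sample_unzip_out = """Archive:  TRI_2008_TN_v08.exe
  inflating: TRI_2008_TN_v08.csv
  inflating: README.txt
"""
extracted = [l.strip().split()[-1] for l in sample_unzip_out.split('\n')
             if l.strip().startswith('inflating')]
assert extracted == ['TRI_2008_TN_v08.csv', 'README.txt']
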
def inject_test_knowledge():
    monster = Entity(u'Monster')
    fairy = Entity(u'Fairy')
    rjbean = Entity(u'rjbean')
    monster[u'color'] = u'Green'
    monster[u'name'] = u'Lotharrr'
    fairy[u'flies'] = True
    fairy[u'name'] = u'Bell'
    rjbean[u'name'] = u'ralph'
    rjbean[u'flies'] = False
    rjbean[u'hacks'] = True
    DBSession.add(monster)
    DBSession.add(fairy)
    DBSession.add(rjbean)
    DBSession.commit()

def test_model_first_row(self):
    child = self.entity.children.values()[1]
    child = child.children.values()[0]
    child = child.children.values()[0]
    child = child.children.values()[1]
    table, model = get_mapped_table_model_from_entity(child)
    # The model's __repr__ renders the row as a fact listing (see below).
    eq_(repr(DBSession.query(model).first()),
        """http://data.gov/download/994/csv
 * Name: A. ROSENTHAL (PTY) LTD.
 * Street_Address: P.O. BOX 44198, 65 7TH STREET, DENMYR BUILDING
 * City: LINDEN
 * State: 
 * Country: ZA
 * Postal_Code: 2104
 * Effective_Date: 08/08/1997
 * Expiration_Date: 08/08/2017
 * Standard_Order: Y""")

def __repr__(self):
    from knowledge.model import Entity, with_characteristic, Fact, DBSession
    table_name = self._sa_class_manager.mapper.mapped_table.name
    # FIXME:
    # (ProgrammingError) CASE types integer and text cannot be matched
    # LINE 5: ...ger' THEN moksha_facts.int_value WHEN 'char' THEN
    #entity = Knowledge.query(Entity).filter(Entity.facts.any(
    #    with_characteristic(u'table_name', table_name))).one()
    entity = DBSession.query(Fact).filter_by(
        key=u'table_name', char_value=table_name).one().entity
    out = ''
    out += entity.name
    for i, col in enumerate(entity[u'columns']):
        out += '\n * %s: %s' % (entity[u'column_names'][i], getattr(self, col))
    return out

def tearDown(self):
    DBSession.remove()

def test_associating_facts_get_custom_default(self):
    apple = Entity('apple')
    DBSession.add(apple)
    DBSession.commit()
    apple['foo'] = u'bar'
    eq_(apple.get('baz', 'zomg'), 'zomg')

def polymorphic_csv_populator(self, entity):
    """ Reads the CSV into Knowledge. TODO: no dynamic dialects? """
    try:
        #flush_after = asint(config.get('transaction_size', 1000))
        repo = utils.get_fact_from_parents(u'repo', entity)
        custom_dialect = self.dialects.get(repo, None)
        if not custom_dialect:
            # Nothing to see here, carry on.
            pass
        elif custom_dialect not in csv.list_dialects():
            self.log.error("Dialect '%s' not found!" % custom_dialect)
        # csv.reader needs a file object, not a filename
        csv_reader = csv.reader(open(entity[u'filename']),
                                dialect=custom_dialect or 'excel')
        columns = None
        # TODO: See if this file has already been parsed!
        # Only the header row is needed here; populate_csv handles the rest.
        for i, line in enumerate(csv_reader):
            if i == 0:
                columns = line
                # then create a Table object with the appropriate columns
                table_name = u'civx_' + unicode(uuid.uuid4()).replace('-', '')
                entity[u'table_name'] = table_name
                entity[u'column_names'] = columns
                # The actual column names behind the scenes. CIVX will
                # map them to the 'column_names'
                entity[u'columns'] = [u'col_%d' % j
                                      for j in range(len(columns))]
                #DBSession.flush()
                table, model = utils.get_mapped_table_model_from_entity(entity)
                model.__table__ = table
                civx.model.models[model] = {
                    'csv': [entity[u'filename']],
                    'columns': entity[u'columns'],
                    'tmp_csv': {},
                }
                metadata.create_all()
                continue
            break
        self.log.info("%d entries in %r table" % (
            DBSession.query(model).count(), entity[u'table_name']))
        populate_csv((
            utils.get_fact_from_parents('repo', entity),
            entity[u'filename'], model, self.engine),
            dialect=custom_dialect)
        self.log.info("%d entries in %r table" % (
            DBSession.query(model).count(), entity[u'table_name']))
        DBSession.commit()
    except Exception, e:
        self.log.error('Unable to parse file as CSV')
        self.log.exception(e)

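# A minimal sketch of wiring up a per-repo dialect for the populator above,
# assuming self.dialects maps a 'repo' fact to a dialect name registered with
# the stdlib csv module. The 'pipes' dialect and repo key are illustrative.
import csv

csv.register_dialect('pipes', delimiter='|', quoting=csv.QUOTE_MINIMAL)
dialects = {u'example.gov': 'pipes'}  # e.g. assigned to the scraper's dialects
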
def the_facts():
    knowledge_query = DBSession.query(Entity).all()
    for entity in knowledge_query:
        print to_bytes(entity), to_bytes(entity.facts.values())

def data_gov_handler(self, url):
    """ data.gov handler.

    Entity(CIVX)
     |-Entity(data.gov)
     | |-Entity(Agency)
     | |-Entity(http://www.data.gov/raw/674)
     | | |-Fact(title), ...
     | | |-Entity(http://www.epa.gov/tri/tridata/tri08/early_data/statedata/basic/TRI_2008_TN_v08.exe)
    """
    self.log.debug('data_gov_handler(%s)' % locals())
    parsed_url = urlparse(url)
    hostname = parsed_url[1].replace('www.', '')
    data_types = ('csv', 'RDF', 'xml', 'kml', 'PDF', 'shapefile', 'XLS')
    fields = ('Agency', 'Sub-Agency', 'Category', 'Date Released',
              'Date Updated', 'Time Period', 'Frequency', 'Description',
              'Data.gov Data Category Type',
              'Specialized Data Category Designation', 'Keywords',
              'Unique ID', 'Citation', 'Agency Program Page',
              'Agency Data Series Page', 'Unit of Analysis', 'Granularity',
              'Geographic Coverage', 'Collection Mode',
              'Data Collection Instrument', 'Data Dictionary/Variable List',
              'Technical Documentation', 'Additional Metadata')

    # Our top-level data.gov entity
    data_gov = Entity.by_name('data.gov')
    if not data_gov:
        data_gov = Entity(name=u'data.gov')
        DBSession.add(data_gov)
        root = Entity.by_name(u'CIVX')
        if not root:
            root = Entity(name=u'CIVX')
            DBSession.add(root)
        data_gov.parent = root

    # See if this entity already exists
    #~ entity = Entity.by_name(url)
    #~ if entity:
    #~     self.log.info('Entity(%r) already exists; skipping.' % url)
    #~     return

    soup = self.get_soup(url)

    # If this is a raw data profile, grab the title of the dataset
    if '/raw/' in url:
        # Create a new Entity for this URL
        title = soup.find('h2', {'id': 'datasetName'}).string.decode(
            'utf-8', 'replace')
        entity = Entity(name=title)
        entity[u'url'] = url
        entity[u'repo'] = hostname
        DBSession.add(entity)
        dest = [self.config['git_dir'], hostname]

        # Extract data for each field
        for field in fields:
            data = soup.find(text=field)
            if data and data.next and data.next.next:
                data = data.next.next.string
                if data:
                    entity[unicode(field)] = data.decode('utf-8').strip()
        DBSession.flush()

        # Create separate parent Agency Entity
        if u'Agency' in entity.facts:
            agency = Entity(name=entity[u'Agency'])
            agency.parent = data_gov
            parent = agency
            DBSession.add(agency)
            dest.append(entity[u'Agency'])
            if u'Sub-Agency' in entity.facts:
                subagency = Entity(name=entity[u'Sub-Agency'])
                subagency.parent = agency
                parent = subagency
                DBSession.add(subagency)
                dest.append(entity[u'Sub-Agency'])
        DBSession.flush()

        # Have the URL be the child of the agency or sub-agency
        entity.parent = parent

        # Elegant repo paths: data.gov/Agency[/Sub-Agency]/title/filename
        dest.append(entity.name)
        dest = os.path.join(*dest)
        if not os.path.isdir(dest):
            os.makedirs(dest)

        # Scrape all available raw data types
        downloads = soup.find_all('a', href=re.compile(r'^/download'))
        for button in downloads:
            data = button.string.split()[0]
            link = button['href']
            if link:
                link = urljoin('http://explore.data.gov', link)
                entity[data.lower()] = link
                parsed_link = urlparse(link)
                file_name = parsed_link[2].split('/')[-1]
                raw = self.download_file(link)
                filename = os.path.join(dest, file_name)
                shutil.move(raw, filename)
                self.log.debug("Moved %s to %s" % (raw, filename))

                # Create a new entity for this file
                file_entity = Entity(name=link)
                DBSession.add(file_entity)
                file_entity[u'filename'] = filename
                file_entity.parent = entity

                # Process this file accordingly
                self.call_magic_handler(filename, file_entity)

        # Find external map links
        map_link = soup.find('a', href=re.compile(r'^/externallink/map/'))
        if map_link:
            map_link = urllib.unquote(
                map_link.get('href', '')[18:]).split('/')[0].replace('###', '/')
            entity[u'map'] = map_link

        DBSession.flush()

    # If this is from a table of results, grab the title from this row
    else:
        self.log.debug("entity[url] = %r" % entity[u'url'])
        raise NotImplementedError(
            "Scraping titles from data.gov tables not yet supported")

def test_basic_two(self):
    apple = Entity('apple')
    eq_(apple.name, 'apple')
    DBSession.commit()

def twill_handler(self, url, links, login_func):
    """ This function uses twill to download the files defined in `links`
    and pass them to the magic handler to be processed.
    """
    self.log.debug('twill_handler(%s)' % locals())
    parsed_url = urlparse(url)
    hostname = parsed_url[1].replace('www.', '')
    parent = Entity.by_name(hostname)
    if not parent:
        parent = Entity(name=hostname)
        DBSession.add(parent)
        root = Entity.by_name(u'CIVX')
        if not root:
            root = Entity(name=u'CIVX')
            DBSession.add(root)
        parent.parent = root

    # See if this entity already exists
    entity = Entity.by_name(url)
    if entity:
        self.log.info('Entity(%r) already exists; skipping.' % url)
        return
    #DBSession.flush()

    for category, link_list in links.items():
        dest = [self.config['git_dir'], hostname]
        if len(links) == 1:
            entity = parent
        else:
            entity = Entity(name=category)
            entity[u'url'] = url
            entity[u'repo'] = hostname
            entity.parent = parent
            dest.append(entity.name)
            DBSession.add(entity)
            DBSession.flush()
        dest = os.path.join(*dest)
        if not os.path.isdir(dest):
            os.makedirs(dest)
        b = self.get_browser()
        for link in link_list:
            # We might have timed out, try to log in again.
            login_func()
            b.go(link['href'])
            # Try to pick out the filename if there is a query
            if link['href'].find('=') >= 0:
                filename = urlparse(link['href'])[-2].split('=')[1]
            else:
                filename = link['href'].split('/')[-1]
            filename = os.path.join(dest, filename)
            save_html(filename)
            file_entity = Entity(name=link.contents[0])
            file_entity[u'filename'] = filename
            file_entity[u'repo'] = hostname
            DBSession.add(file_entity)
            file_entity.parent = entity
            self.log.debug("Created entity %r (parent %r)" % (
                file_entity.name, file_entity.parent.name))
            magic = self.call_magic_handler(filename, file_entity)
            file_entity[u'magic'] = magic
            DBSession.flush()

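# A hypothetical sketch of the `links` argument that twill_handler() iterates;
# it reads both link['href'] and link.contents[0], so BeautifulSoup anchor
# tags fit. The category name and markup are made up for illustration.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/reports?id=1">Q1 Report</a>', 'html.parser')
links = {u'reports': soup.find_all('a')}
#self.twill_handler('http://example.com/data', links, login_func)
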
def consume(self, url):
    """ This method attempts to scrape a URI.

    First it tries to figure out the protocol, then tries to pull a
    hostname out of the url. Then the git repo is initialized, and we
    take a close look at the url. If the hostname is known to be tricky,
    it will have a special handler method written for it and leave from
    there. Otherwise it goes through the general path for its protocol,
    attempting to find useful data. When everything is done, the entity
    is updated and messages are sent out announcing that the scrape is
    done.
    """
    self.log.debug("PolyScraper(%s)" % url)
    start = datetime.utcnow()

    # Try to pull a protocol off the URI
    protocol_end = url.find("://")
    protocol = "http"
    if not protocol_end == -1:
        protocol = url[:protocol_end]

    parsed_url = urlparse(url)
    hostname = parsed_url[1].replace('www.', '')

    # Set a hostname if none is set.
    if not hostname:
        hostname = u"localhost"

    # See if we already know about this URL
    entity = Entity.by_name(url)
    if entity:
        self.log.info('Entity(%r) already exists' % url)
    else:
        root = Entity.by_name(u'CIVX')
        if not root:
            root = Entity(name=u'CIVX')
            DBSession.add(root)
            DBSession.flush()
        parent = Entity.by_name(hostname)
        if not parent:
            parent = Entity(name=hostname)
            DBSession.add(parent)
            parent.parent = root
            self.log.debug("Created entity %r" % parent.name)
        entity = Entity(name=url)
        DBSession.add(entity)
        # hide the exact url entity from our tree
        entity.parent = parent
        self.log.debug("Created entity %r" % entity.name)
        #self.send_message('civx.knowledge.entities.new', {
        #    'msg': 'New entity created: %s' % url
        #})
        DBSession.flush()

    # Initialize a git repo for this data source
    entity[u'repo'] = hostname
    #entity[u'url'] = url

    # Initialize the git repository for this domain
    #~ self.init_git_repo(repo=hostname)

    DBSession.flush()

    # Scrape the url (to a certain depth) for data
    num_downloads = 0

    # Provide a URL handler method that is called with each file; pass
    # in the soup entity for the link instead, so we can easily look
    # around the DOM and pull out titles, etc.
    if hostname in self.url_handlers:
        #self.url_handlers[hostname](self, soup_link, file_entity)
        self.url_handlers[hostname](self, url)
    else:
        # If we do not specifically handle this file, take a basic approach
        # based on the protocol. These could probably also be split off
        # into $protocol_handler methods.
        self.log.warning('Cannot find %s URL handler' % hostname)
        files = []
        if protocol == "ftp":
            from ftplib import FTP
            self.log.debug("FTP support is not implemented yet.")
        elif protocol == "file":
            search_path = url[protocol_end + 3:]
            local_files = []
            if os.path.isdir(search_path):
                # Find all files in directory
                for directory in os.walk(search_path):
                    dirpath = directory[0]
                    for filename in directory[2]:
                        local_files.append(os.path.join(dirpath, filename))
            else:
                local_files.append(search_path)
            dest = os.path.join(self.config['git_dir'], hostname)
            # FIXME: what about for links to epa.gov from data.gov?
            # we probably want our own epa.gov repo namespace to download
            # and extract this to
            #if not os.path.isdir(dest):
            #    self.log.debug("mkdir %s" % dest)
            #    os.makedirs(dest)
            # I think this section is deprecated and unnecessary...
            #for ext in extensions.split(','):
            #    # if link.endswith('%s' % ext) or '/%s/' % ext in link:
            #    #     entity[u'format'] = ext
            #    if ext not in civx.model.models[Entity]:
            #        civx.model.models[Entity][ext] = []
            for path in local_files:
                #raw = self.download_file(link)
                #file_name = os.path.basename(link)
                #filename = to_unicode(os.path.join(dest, file_name))
                #num_downloads += 1
                #shutil.copy(raw, filename)
                #self.log.debug("Copied %s to %s" % (raw,
                #    os.path.join(dest, file_name)))
                ##file_entity = Entity(name=os.path.basename(file_name))
                #file_entity = Entity(name=link)
                ##file_entity[u'url'] = link
                #file_entity[u'filename'] = filename
                #file_entity[u'repo'] = hostname
                #DBSession.add(file_entity)
                #file_entity.parent = entity
                ##file_entity.parent = parent
                #self.log.debug("Created entity %r (parent %r)" % (
                #    file_entity.name, file_entity.parent.name))
                file_path = os.path.split(path)[0]
                file_name = os.path.split(path)[1]
                self.log.info("%s is a local file" % file_name)
                files.append((file_path, file_name, path))
                #files.append((os.path.dirname(filename), file_name, filename))
        else:
            # Assume protocol is http
            """
            #### f = urllib2.urlopen(url)  # XXX: does this load everything into mem?
            if f.info().type == 'text/html':
                soup = self.get_soup(f.read())
            else:
                # Assume the url is a link to a direct file
                # Save the file to disk.
                # throw file at magic handlers
            ####
            """
            soup = self.get_soup(url)
            for link, soup_link in self.scrape_files_from_url(url,
                                                              soup_links=True):
                parsed_link = urlparse(link)
                file_path = '/'.join(parsed_link[2].split('/')[:-1])
                file_name = parsed_link[2].split('/')[-1]
                files.append((file_path, file_name, link))

        for (file_path, file_name, link) in files:
            dest = self.config['git_dir'] + hostname + file_path
            local = os.path.exists(link)

            # See if this file already exists
            file_entity = Entity.by_name(link)
            #file_entity = Entity.by_name(os.path.basename(file_name))
            if file_entity:
                self.log.info('Entity(%r) already exists; skipping.' % link)
                continue

            # FIXME: what about for links to epa.gov from data.gov?
            # we probably want our own epa.gov repo namespace to download
            # and extract this to
            if not os.path.isdir(dest):
                os.makedirs(dest)

            # I think this section is deprecated and unnecessary...
            for ext in extensions.split(','):
                # if link.endswith('%s' % ext) or '/%s/' % ext in link:
                #     entity[u'format'] = ext
                if ext not in civx.model.models[Entity]:
                    civx.model.models[Entity][ext] = []

            raw = self.download_file(link)
            filename = os.path.join(dest, file_name)
            num_downloads += 1
            if local:
                self.log.debug("Copied %s to %s" % (raw, filename))
                shutil.copy(raw, filename)
            else:
                self.log.debug("Moved %s to %s" % (raw, filename))
                shutil.move(raw, filename)

            #file_entity = Entity(name=os.path.basename(file_name))
            file_entity = Entity(name=link)
            #file_entity[u'url'] = link
            file_entity[u'filename'] = filename
            file_entity[u'repo'] = hostname
            DBSession.add(file_entity)
            file_entity.parent = entity
            #file_entity.parent = parent
            self.log.debug("Created entity %r (parent %r)" % (
                file_entity.name, file_entity.parent.name))

            # Determine the file magic, and call the appropriate handler
            file_entity[u'magic'] = self.call_magic_handler(filename,
                                                            file_entity)

            DBSession.flush()

    # To do this stuff we'll need to return an entity from the url handler?
    #if 'num_files' in entity.facts:
    #    num_files = int(entity['num_files'])
    #    print repr(num_files)
    #
    #    if num_files != num_downloads:
    #        self.log.info('Downloaded %d more files from previous scrape' %
    #                      (num_downloads - num_files))
    #        entity[u'num_files'] += num_downloads
    #else:
    #    entity[u'num_files'] = num_downloads

    #if u'date_added' not in entity.facts:
    #    entity[u'date_added'] = unicode(datetime.utcnow())
    #entity[u'date_last_scraped'] = unicode(datetime.utcnow())

    if 'changelog' not in entity.facts:
        entity[u'changelog'] = []
    finish = datetime.utcnow()
    changelog = {
        u'start_time': unicode(start),
        u'finish_time': unicode(finish),
        u'elapsed_time': unicode(finish - start),
        u'num_downloads': num_downloads,
        #u'num_children': len(entity.children),
        #~ u'git_commit': self.get_latest_commit_id(),
    }
    entity[u'changelog'].append(changelog)
    DBSession.commit()

    self.log.info("== Statistics ==")
    self.log.info("Scraped url: " + url)
    self.log.info("Number of downloaded files: %d" % num_downloads)

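# A minimal driver sketch for consume(), assuming a PolyScraper instance
# carrying the config, log, and url_handlers attributes referenced above
# (the no-argument constructor and URLs are illustrative assumptions).
scraper = PolyScraper()
scraper.consume('file:///srv/data/reports')  # walks the local directory tree
scraper.consume('http://example.gov/data')   # generic http scraping path
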
def test_associating_facts_unicode_by_attr(self):
    apple = Entity('apple')
    DBSession.add(apple)
    DBSession.commit()
    apple['foo'] = u'bar'
    eq_(apple.foo, 'bar')

def test_basic_one(self):
    """ Basic usage. """
    apple = Entity('apple')
    DBSession.add(apple)
    DBSession.commit()
    eq_(apple.name, 'apple')

@classmethod
def by_name(cls, name):
    """A class method that searches for entities by their name attribute."""
    return DBSession.query(cls).filter(cls.name == name).first()

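# Usage sketch for Entity.by_name(), assuming the fixtures committed by
# inject_test_knowledge() above.
fairy = Entity.by_name(u'Fairy')
assert fairy is not None and fairy[u'name'] == u'Bell'
assert Entity.by_name(u'no-such-entity') is None  # .first() returns None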