def run(self):
  # set the interval for the next run; 'delay' comes from the connector config
  delay = int(self.getConfigParam('delay'))
  self.setInterval(delay)
  self.logger().info('%s connector next run in %s s' % (self.getName(), delay))
  # log in to the Confluence XML-RPC endpoint (Server is xmlrpclib.Server)
  host = self.getConfigParam('confluence_host')
  user = self.getConfigParam('confluence_user')
  password = self.getConfigParam('confluence_pass')
  self.logger().debug('logging in to %s as %s' % (host, user))
  self.confluence = Server("http://%s/rpc/xmlrpc" % host)
  self.conf_token = self.confluence.confluence2.login(user, password)
  # build and push one feed per Confluence space
  spaces = self.confluence.confluence2.getSpaces(self.conf_token)
  n = 0
  for space in spaces:
    feed_type = 'metadata-and-url'
    #feed_type = 'incremental'
    feed = connector.Feed(feed_type)
    n += self.make_feed(feed, space)
    self.pushFeed(feed)
    feed.clear()
  self.logger().info('Congrats, work done! %d URLs have been posted to the GSA.' % n)
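# make_feed() is called above but not shown. The sketch below is one possible
# implementation, not the original: it assumes the Confluence XML-RPC
# getPages() call and the feed.addRecord() interface already used in run().
def make_feed(self, feed, space):
  # Add one metadata-and-url record per page in the given space and
  # return how many records were added.
  pages = self.confluence.confluence2.getPages(self.conf_token, space['key'])
  count = 0
  for page in pages:
    # each page summary carries the absolute URL of the rendered page
    feed.addRecord(url=page['url'], displayurl=page['url'],
                   action='add', mimetype='text/html')
    count += 1
  return count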
def run(self):
  self.logger().info('TIMER INVOKED for %s' % self.getName())
  # go get the sitemap.xml file itself
  u = self.getConfigParam('surl')
  req = urllib2.Request(u)
  response = urllib2.urlopen(req)
  content = response.read()
  # parse the sitemap and collect all of its URLs
  sitemap_urls = []
  xmldoc = xml.dom.minidom.parseString(content)
  m_node = xmldoc.getElementsByTagName('url')
  for rnode in m_node:
    rchild = rnode.childNodes
    for nodes in rchild:
      if nodes.nodeName == 'loc':
        sitemap_urls.append(nodes.childNodes[0].nodeValue)
      # if nodes.nodeName == 'lastmod':
      #   now = datetime.datetime.now()
      #   strt = nodes.childNodes[0].nodeValue
      #   lastmodtime_time = time.strptime(strt, "%Y-%m-%d")
      #   lastmodtime_date = datetime.datetime(*lastmodtime_time[:6])

  # For each URL in the sitemap, send records to the GSA in batches.
  # The batch size is set by the 'load' parameter on the config page.
  i = 0
  feed_type = 'metadata-and-url'
  #feed_type = 'incremental'
  feed = connector.Feed(feed_type)
  for url in sitemap_urls:
    if feed_type == 'metadata-and-url':
      feed.addRecord(url=url, displayurl=url, action='add', mimetype='text/html')
    else:
      content = urllib2.urlopen(url).read()
      feed.addRecord(url=url, displayurl=url, action='add',
                     mimetype='text/html', content=content)
    # If the number of URLs queued so far reaches what the GSA is expecting
    # per feed, push what we have now and reset the counter (a poor man's
    # traversal rate limit).
    if i >= float(self.getLoad()):
      self.logger().debug('Posting %s URLs to the GSA for connector [%s]' %
                          (i, self.getName()))
      self.pushFeed(feed)
      feed.clear()
      i = 0
    else:
      i = i + 1
  # push whatever is left over
  if i > 0:
    self.logger().debug(('Final posting %s URLs to the GSA '
                         'for connector [%s]') % (i, self.getName()))
    self.pushFeed(feed)
    feed.clear()
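# The commented-out 'lastmod' handling above hints at an incremental variant.
# The helper below is a hypothetical sketch, not part of the original
# connector: it keeps only URLs whose <lastmod> is on or after the previous
# run, where last_run is a datetime.datetime the connector would have to store.
import datetime
import time

def filter_urls_by_lastmod(url_nodes, last_run):
  fresh_urls = []
  for rnode in url_nodes:
    loc, lastmod = None, None
    for node in rnode.childNodes:
      if node.nodeName == 'loc':
        loc = node.childNodes[0].nodeValue
      elif node.nodeName == 'lastmod':
        # lastmod values are assumed to be plain YYYY-MM-DD dates,
        # matching the strptime format in the commented-out code above
        parsed = time.strptime(node.childNodes[0].nodeValue, "%Y-%m-%d")
        lastmod = datetime.datetime(*parsed[:6])
    # keep the URL if it has no lastmod, there is no previous run,
    # or it was modified since the last run
    if loc and (lastmod is None or last_run is None or lastmod >= last_run):
      fresh_urls.append(loc)
  return fresh_urls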
def run(self):
  # fetch the contents of the URL
  url = self.getConfigParam('url')
  req = urllib2.Request(url)
  response = urllib2.urlopen(req)
  content = response.read()
  # push it to the GSA as a content feed
  feed = connector.Feed('incremental')
  feed.addRecord(url=url, action='add', mimetype='text/html', content=content)
  self.pushFeed(feed)
def run(self):
  # fetch all the document URLs with smbcrawler
  output = smbcrawler.Crawl(self.smbconfig)
  # Download each file individually with smbclient into a temporary file,
  # then send the file contents as a content feed to the GSA.
  feed = connector.Feed('incremental')
  devnull = open(os.devnull, 'w')
  for url, doc in output.urls_map.iteritems():
    if not doc.IsFile():
      continue
    filename = doc.filename[1:]  # strip the leading slash
    mimetype = mimetypes.guess_type(url)[0] or 'application/octet-stream'
    # download the file to a temporary location and read back its contents
    tmp = tempfile.NamedTemporaryFile()
    subprocess.call(['smbclient', self.share, '-N', '-c',
                     'get %s %s' % (filename, tmp.name)],
                    stdout=devnull, stderr=devnull)
    tmp.seek(0)
    filedata = tmp.read()
    tmp.close()
    feed.addRecord(url=url, action='add', mimetype=mimetype, content=filedata)
  devnull.close()
  self.pushFeed(feed)
def getSiteData(self, site_name, url_prefix, libraries):
  if libraries is not None:
    self.logger().info('Running getSiteData for site: [' + site_name +
                       '], for [' + str(libraries.length) + '] libraries')
  else:
    self.logger().info('Running initial traversal for site: [' + site_name + ']')
  lib_dict = []
  ntlm = WindowsHttpAuthenticated(username=self.SP_DOMAIN + '\\' + self.SP_USER,
                                  password=self.SP_PASSWORD)
  url = url_prefix + '/_vti_bin/SiteData.asmx?WSDL'
  client = suds.client.Client(url, transport=ntlm)
  # First get the folder ID of every document library on the site.
  resp = client.service.GetListCollection()
  for _sList in resp.vLists._sList:
    if _sList.BaseType == "DocumentLibrary":
      folder_id = _sList.InternalName
      self.logger().info('Found [' + _sList.Title + '] of type [' +
                         _sList.BaseType + '] with Folder ID: ' + folder_id)
      # get ready to refeed all the doc URLs
      feed_type = 'metadata-and-url'
      feed = connector.Feed(feed_type)
      last_sync = None
      # See if there is already a change token saved for this library.
      if libraries is not None:
        for lib in libraries:
          if lib.attributes['id'].value == folder_id:
            last_sync = lib.attributes['last_sync'].value
            self.logger().info('Retrieved LastChangeToken from file [' +
                               last_sync + ']')
      # Then use that folder ID to page through the document list.
      ntlm = WindowsHttpAuthenticated(username=self.SP_DOMAIN + '\\' + self.SP_USER,
                                      password=self.SP_PASSWORD)
      url = url_prefix + '/_vti_bin/Lists.asmx?WSDL'
      client = suds.client.Client(url, transport=ntlm)
      # attribute used for paging
      ListItemCollectionPositionNext = ''
      while ListItemCollectionPositionNext is not None:
        query = ('<Query><OrderBy>'
                 '<FieldRef Name="Created" Ascending="TRUE" />'
                 '</OrderBy></Query>')
        viewfields = '<ViewFields Properties="TRUE"/>'
        # fetch 100 rows per cursor
        rowlimit = 100
        if ListItemCollectionPositionNext != '':
          queryoptions = (
              '<QueryOptions><IncludeMandatoryColumns>true</IncludeMandatoryColumns>'
              '<DateInUtc>TRUE</DateInUtc><ViewAttributes Scope="Recursive"/>'
              '<Paging ListItemCollectionPositionNext="%s"/>'
              '<OptimizeFor>ItemIds</OptimizeFor></QueryOptions>'
          ) % escape(ListItemCollectionPositionNext)
        else:
          queryoptions = (
              '<QueryOptions><IncludeMandatoryColumns>true</IncludeMandatoryColumns>'
              '<DateInUtc>TRUE</DateInUtc><ViewAttributes Scope="Recursive"/>'
              '<OptimizeFor>ItemIds</OptimizeFor></QueryOptions>')
        contains = ('<Contains><FieldRef Name="Status"/>'
                    '<Value Type="Text">Complete</Value></Contains>')
        client.service.GetListItemChangesSinceToken(
            folder_id, '', Raw(query), Raw(viewfields), rowlimit,
            Raw(queryoptions), last_sync, Raw(contains))
        li = client.last_received().getChild(
            "soap:Envelope").getChild("soap:Body").getChild(
                "GetListItemChangesSinceTokenResponse").getChild(
                    "GetListItemChangesSinceTokenResult").getChild("listitems")
        # Read the last change token SharePoint returned.
        changes = li.getChild('Changes')
        if changes is not None and changes.getAttribute("LastChangeToken") is not None:
          changeid = changes.getAttribute("LastChangeToken").getValue()
          self.logger().info('Found new LastChangeToken [' + changeid + ']')
        else:
          self.logger().info('LastChangeToken is None')
        rsd = li.getChild("rs:data")
        # Extract the paging cursor ListItemCollectionPositionNext.
        if rsd.getAttribute('ListItemCollectionPositionNext') is not None:
          ListItemCollectionPositionNext = rsd.getAttribute(
              'ListItemCollectionPositionNext').getValue()
          self.logger().info(
              'Found response cursor ListItemCollectionPositionNext [' +
              ListItemCollectionPositionNext + ']')
        else:
          ListItemCollectionPositionNext = None
        # Now add each returned row to the feed.
        for zrow in rsd:
          if zrow is not None:
            my_url = zrow.getAttribute("ows_EncodedAbsUrl").getValue()
            my_last_modified = zrow.getAttribute(
                "ows_Last_x0020_Modified").getValue()
            self.logger().debug('Found URL [' + my_url + ']')
            # Set all the attributes for this record
            # (TODO: set the security SPI parameters).
            feed.addRecord(url=my_url, displayurl=my_url, action='add',
                           mimetype='text/html')
          else:
            break
      # Finally, save the library name and change token so that next time
      # we know where we left off.
      lib_dict.append('<library id="' + folder_id + '" name="' + _sList.Title +
                      '" last_sync="' + changeid + '" />')
      # Flush the records to the GSA.
      self.logger().info('Transmitting [' + str(len(rsd)) + '] documents.')
      self.pushFeed(feed)
      feed.clear()
  # Return the per-library change tokens so the caller can persist them.
  return lib_dict
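# getSiteData() receives 'libraries' as parsed XML nodes and returns new
# <library .../> strings, but the persistence step is not shown. The helpers
# below are a hypothetical sketch of one way to round-trip that state with
# xml.dom.minidom; the state_file path and function names are assumptions.
import xml.dom.minidom

def save_site_state(site_name, lib_dict, state_file):
  # wrap the <library .../> entries in a <site> element and write them out
  with open(state_file, 'w') as f:
    f.write('<site name="%s">%s</site>' % (site_name, ''.join(lib_dict)))

def load_site_state(state_file):
  # returns the <library> nodes expected by getSiteData(), or None on the
  # first run so that an initial traversal is performed
  try:
    doc = xml.dom.minidom.parse(state_file)
  except IOError:
    return None
  return doc.getElementsByTagName('library')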
def run(self):
  # set the interval for the next run
  delay = int(self.getConfigParam('delay'))
  self.setInterval(delay)
  self.logger().info('%s connector next run in %s s' % (self.getName(), delay))

  self.logger().debug('reading files from %s' % JSON_LOCATION)
  json_files = [
      join(JSON_LOCATION, f) for f in os.listdir(JSON_LOCATION)
      if isfile(join(JSON_LOCATION, f)) and re.match(r'.*\.json$', f)
  ]
  self.logger().debug('total %d files' % len(json_files))

  # clear the GSA index first by sending an empty "full" feed
  self.logger().debug('sending empty "full" feed to reset GSA')
  feed = connector.Feed("full")
  self.pushFeed(feed)
  feed.clear()

  n = 0
  feed = connector.Feed('incremental')
  for fname in json_files:
    with open(fname, 'r') as f:
      pages = json.loads(f.read())
    self.logger().info("%s has %d records" % (fname, len(pages)))
    per_feed_counter = 0
    # slice the record array into feeds of at most RECORD_PER_FEED records
    for page in pages:
      url, html = (page['url'], page.get('result', None))
      if html:
        url_hash = hashlib.md5(url).hexdigest()
        feed.addRecord(url="http://reveal/recommendation/%s" % url_hash,
                       displayurl=escape(url),
                       action='add',
                       mimetype='text/html',
                       metadata={'content_source': 'Reveal',
                                 'rp_elastic_id': url_hash},
                       content=html)
        per_feed_counter += 1
        if per_feed_counter < self.RECORD_PER_FEED:
          continue
        self.pushFeed(feed)
        feed.clear()
        n += per_feed_counter
        per_feed_counter = 0
    else:
      # the for-else runs after each file: flush any leftover records
      self.pushFeed(feed)
      feed.clear()
      n += per_feed_counter
  self.logger().info('Congrats, work done! %d pages have been posted to the GSA.' % n)