Example #1
  def run(self):
    #setting interval for the next run
    delay = int(self.getConfigParam('delay'))
    self.setInterval(delay)

    # log the parameters passed into the 'run' method
    self.logger().info('%s connector next run in %s s' % (self.getName(), delay))

    # now read the Confluence connection parameters
    host = self.getConfigParam('confluence_host')
    user = self.getConfigParam('confluence_user')
    password = self.getConfigParam('confluence_pass')

    self.logger().debug(' logging %s %s:%s' % (host, user, password))

    self.confluence = Server("http://%s/rpc/xmlrpc" % host)
    self.conf_token = self.confluence.confluence2.login(user, password)
    spaces = self.confluence.confluence2.getSpaces(self.conf_token)

    n = 0
    
    for space in spaces:

      feed_type = 'metadata-and-url'
      #feed_type = 'incremental'
      feed = connector.Feed(feed_type)

      n += self.make_feed(feed, space)

      self.pushFeed(feed)

      feed.clear()

    self.logger().info('Congrats, work done! %d URLs have been posted to GSA.' % n)
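
The make_feed helper called above is not part of the snippet. A minimal sketch of what it might do, assuming the Confluence XML-RPC getPages call and reusing the Feed.addRecord signature shown in the other examples; the helper name and its use as a record counter come from the call site above, the rest is an assumption:

  def make_feed(self, feed, space):
    # Hypothetical sketch: add one metadata-and-url record per page in the
    # given Confluence space and report how many records were added.
    # Assumes getSpaces()/getPages() return dicts with 'key' and 'url' entries,
    # as the Confluence XML-RPC API documents.
    pages = self.confluence.confluence2.getPages(self.conf_token, space['key'])
    for page in pages:
      feed.addRecord(url=page['url'],
                     displayurl=page['url'],
                     action='add',
                     mimetype='text/html')
    return len(pages)
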
Example #2
    def run(self):
        # log the parameters passed into the 'run' method
        self.logger().info('TIMER INVOKED for %s ' % self.getName())
        # now go get the sitemap.xml file itself
        u = self.getConfigParam('surl')
        req = urllib2.Request(u)
        response = urllib2.urlopen(req)
        content = response.read()
        #parse out the sitemap and get all the urls
        sitemap_urls = []
        xmldoc = xml.dom.minidom.parseString(content)
        m_node = xmldoc.getElementsByTagName('url')
        for rnode in m_node:
            rchild = rnode.childNodes
            for nodes in rchild:
                if nodes.nodeName == 'loc':
                    sitemap_urls.append(nodes.childNodes[0].nodeValue)


                # if nodes.nodeName == 'lastmod':
                #     now = datetime.datetime.now()
                #     strt = nodes.childNodes[0].nodeValue
                #     lastmodtime_time = time.strptime(strt, "%Y-%m-%d")
                #     lastmodtime_date = datetime.datetime(*lastmodtime_time[:6])

        # for each URL in the sitemap, send them in batches to the GSA;
        # the batch size is specified by the 'load' parameter from the config page
        i = 0
        feed_type = 'metadata-and-url'
        #feed_type = 'incremental'
        feed = connector.Feed(feed_type)
        for url in sitemap_urls:
            if feed_type == 'metadata-and-url':
                feed.addRecord(url=url,
                               displayurl=url,
                               action='add',
                               mimetype='text/html')
            else:
                content = urllib2.urlopen(url).read()
                feed.addRecord(url=url,
                               displayurl=url,
                               action='add',
                               mimetype='text/html',
                               content=content)
            # if the number of URLs we're going to send to the GSA right now is more
            # than what it's expecting, send what we have now and reset the counter
            # after waiting 1 min (this is the poor man's traversal rate limit delay)
            if i >= float(self.getLoad()):
                self.logger().debug(
                    'Posting %s URLs to the GSA for connector [%s]' %
                    (i, self.getName()))
                self.pushFeed(feed)
                feed.clear()
                i = 0
            else:
                i = i + 1
        if i > 0:
            self.logger().debug(('Final posting %s URLs to the GSA '
                                 'for connector [%s]') % (i, self.getName()))
            self.pushFeed(feed)
            feed.clear()
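
Note that the comment above mentions waiting one minute between batches, but the snippet never actually sleeps. If that traversal rate limit is wanted, a minimal variant of the flush branch (assuming the standard time module is imported) could look like this:

            if i >= float(self.getLoad()):
                self.logger().debug(
                    'Posting %s URLs to the GSA for connector [%s]' %
                    (i, self.getName()))
                self.pushFeed(feed)
                feed.clear()
                i = 0
                time.sleep(60)  # the poor man's rate limit the comment refers to
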
Example #3
    def run(self):
        # fetch the contents of the URL
        url = self.getConfigParam('url')
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)
        content = response.read()

        # push it to the GSA as a content feed
        feed = connector.Feed('incremental')
        feed.addRecord(url=url,
                       action='add',
                       mimetype='text/html',
                       content=content)
        self.pushFeed(feed)
Example #4
  def run(self):
    # fetch all the document URLs with smbcrawler
    output = smbcrawler.Crawl(self.smbconfig)

    # now download each file individually with smbclient into a temporary file,
    # then send the file content as a content feed to the GSA
    feed = connector.Feed('incremental')
    devnull = open(os.devnull, 'w')
    for url, doc in output.urls_map.iteritems():
      if not doc.IsFile():
        continue
      filename = doc.filename[1:] # strip out initial slash
      mimetype = mimetypes.guess_type(url)[0] or 'application/octet-stream'
      # download the file to a temporary place, and read out its contents
      tmp = tempfile.NamedTemporaryFile()
      subprocess.call(['smbclient', self.share, '-N', '-c',
                       'get %s %s' % (filename, tmp.name)],
                      stdout=devnull, stderr=devnull)
      tmp.seek(0)
      filedata = tmp.read()
      tmp.close()
      feed.addRecord(url=url, action='add', mimetype=mimetype, content=filedata)
    devnull.close()
    self.pushFeed(feed)
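
smbclient's exit status is ignored above, so a failed download silently feeds an empty file to the GSA. A hedged variant of the download step that skips files smbclient could not fetch, assuming the same logger() helper as the other examples; only the return-code check is new:

      rc = subprocess.call(['smbclient', self.share, '-N', '-c',
                            'get %s %s' % (filename, tmp.name)],
                           stdout=devnull, stderr=devnull)
      if rc != 0:
        # smbclient reported a failure; skip this document instead of
        # feeding empty content
        self.logger().info('smbclient failed for %s (exit %d), skipping' % (url, rc))
        tmp.close()
        continue
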
Example #5
    def getSiteData(self, site_name, url_prefix, libraries):

        if (libraries != None):
            self.logger().info('Running getSiteData for site: [' + site_name +
                               '], for [' + str(libraries.length) +
                               '] libraries')
        else:
            self.logger().info('Running Initial Traversal for site: [' +
                               site_name + ']')

        lib_dict = []

        ntlm = WindowsHttpAuthenticated(username=self.SP_DOMAIN + '\\' +
                                        self.SP_USER,
                                        password=self.SP_PASSWORD)
        url = url_prefix + '/_vti_bin/SiteData.asmx?WSDL'
        client = suds.client.Client(url, transport=ntlm)

        # First get the FolderID number for that site
        resp = client.service.GetListCollection()
        for _sList in resp.vLists._sList:
            if _sList.BaseType == "DocumentLibrary":

                folder_id = _sList.InternalName
                self.logger().info('Found [' + _sList.Title + '] of type [' +
                                   _sList.BaseType + ']' +
                                   ' with Folder ID: ' + folder_id)

                # get ready to refeed all the doc URLs
                feed_type = 'metadata-and-url'
                feed = connector.Feed(feed_type)

                last_sync = None

                # See if there  is a change token for the current site
                if (libraries != None):
                    for lib in libraries:
                        if (lib.attributes['id'].value == folder_id):
                            last_sync = lib.attributes['last_sync'].value
                            self.logger().info(
                                'Retrieved LastChangeToken from file [' +
                                last_sync + ']')

                # then use that ID to get the document lists
                ntlm = WindowsHttpAuthenticated(username=self.SP_DOMAIN +
                                                '\\' + self.SP_USER,
                                                password=self.SP_PASSWORD)
                url = url_prefix + '/_vti_bin/Lists.asmx?WSDL'
                client = suds.client.Client(url, transport=ntlm)

                # attribute used for paging
                ListItemCollectionPositionNext = ''
                # default the change token so the <library> entry built below is
                # still valid if SharePoint returns no Changes element
                changeid = last_sync or ''
                while (ListItemCollectionPositionNext != None):
                    query = ('<Query><OrderBy>'
                             '<FieldRef Name="Created" Ascending="TRUE" />'
                             '</OrderBy></Query>')
                    viewfields = '<ViewFields Properties="TRUE"/>'
                    # get 100 rows now per cursor...
                    rowlimit = 100
                    if (ListItemCollectionPositionNext != ''):
                        queryoptions = (
                            '<QueryOptions>'
                            '<IncludeMandatoryColumns>true</IncludeMandatoryColumns>'
                            '<DateInUtc>TRUE</DateInUtc>'
                            '<ViewAttributes Scope="Recursive"/>'
                            '<Paging ListItemCollectionPositionNext="%s"/>'
                            '<OptimizeFor>ItemIds</OptimizeFor>'
                            '</QueryOptions>') % (
                                escape(ListItemCollectionPositionNext))
                    else:
                        queryoptions = (
                            '<QueryOptions>'
                            '<IncludeMandatoryColumns>true</IncludeMandatoryColumns>'
                            '<DateInUtc>TRUE</DateInUtc>'
                            '<ViewAttributes Scope="Recursive"/>'
                            '<OptimizeFor>ItemIds</OptimizeFor>'
                            '</QueryOptions>')
                    contains = ('<Contains><FieldRef Name="Status"/>'
                                '<Value Type="Text">Complete</Value>'
                                '</Contains>')

                    client.service.GetListItemChangesSinceToken(
                        folder_id, '', Raw(query), Raw(viewfields), rowlimit,
                        Raw(queryoptions), last_sync, Raw(contains))

                    li = client.last_received().getChild(
                        "soap:Envelope").getChild("soap:Body").getChild(
                            "GetListItemChangesSinceTokenResponse").getChild(
                                "GetListItemChangesSinceTokenResult").getChild(
                                    "listitems")

                    # Read the last change token from Sharepoint
                    changes = li.getChild('Changes')
                    if (changes != None):
                        if (changes.getAttribute("LastChangeToken") != None):
                            changeid = changes.getAttribute(
                                "LastChangeToken").getValue()
                            self.logger().info('Found new LastChangeToken [' +
                                               changeid + ']')
                        else:
                            self.logger().info('LastChangeToken is None')
                    else:
                        self.logger().info('LastChangeToken is None')

                    rsd = li.getChild("rs:data")
                    # extract out the cursor ListItemCollectionPositionNext
                    if (rsd.getAttribute('ListItemCollectionPositionNext') !=
                            None):
                        ListItemCollectionPositionNext = rsd.getAttribute(
                            'ListItemCollectionPositionNext').getValue()
                        self.logger().info(
                            'Found response cursor ListItemCollectionPositionNext ['
                            + ListItemCollectionPositionNext + ']')
                    else:
                        ListItemCollectionPositionNext = None
                    # now for each row returned, add that to the feed set
                    for zrow in rsd:
                        if zrow != None:
                            my_url = zrow.getAttribute(
                                "ows_EncodedAbsUrl").getValue()
                            my_last_modified = zrow.getAttribute(
                                "ows_Last_x0020_Modified").getValue()
                            self.logger().debug('Found URL [' + my_url + ']')
                            # set all the attributes for this feed (TODO: set the security SPI parameters)
                            feed.addRecord(url=my_url,
                                           displayurl=my_url,
                                           action='add',
                                           mimetype='text/html')
                        else:
                            break
                    # Finally, save the library name and change token so that the next time, we know where we left off...
                    lib_dict.append('<library id="' + folder_id + '" name="' +
                                    _sList.Title + '" last_sync="' + changeid +
                                    '" />')
                    # flush the records to the GSA
                    self.logger().info('Transmitting [' + str(len(rsd)) +
                                       '] documents.')
                    self.pushFeed(feed)
                    feed.clear()
        # return the last sync time
        return lib_dict
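
getSiteData returns lib_dict so the change tokens can be persisted between traversals; the caller then hands the parsed <library> nodes back in as the libraries argument (the code above reads lib.attributes['id'] and lib.attributes['last_sync']). A hypothetical pair of helpers for that round trip, assuming xml.dom.minidom is imported; the state file name and method names are illustrative only:

    def saveSiteState(self, site_name, lib_dict):
        # Write the <library .../> fragments under a single root element so
        # they can be re-parsed on the next run (illustrative file name).
        with open('%s_state.xml' % site_name, 'w') as f:
            f.write('<site name="%s">%s</site>' % (site_name, ''.join(lib_dict)))

    def loadSiteState(self, site_name):
        # Return the <library> DOM nodes getSiteData() expects, or None if no
        # state has been saved yet (which triggers the initial traversal path).
        try:
            doc = xml.dom.minidom.parse('%s_state.xml' % site_name)
        except IOError:
            return None
        return doc.getElementsByTagName('library')
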
Example #6
    def run(self):
        #setting interval for the next run
        delay = int(self.getConfigParam('delay'))
        self.setInterval(delay)

        # log the parameters passed into the 'run' method
        self.logger().info('%s connector next run in %s s' %
                           (self.getName(), delay))

        self.logger().debug('reading files from %s' % JSON_LOCATION)

        json_files = [
            join(JSON_LOCATION, f) for f in os.listdir(JSON_LOCATION)
            if isfile(join(JSON_LOCATION, f)) and re.match('.*\.json$', f)
        ]

        self.logger().debug('  total %d files' % len(json_files))

        # clear the GSA feed first
        self.logger().debug("sending empty 'full' feed to reset GSA")
        feed = connector.Feed("full")
        self.pushFeed(feed)
        feed.clear()

        n = 0

        feed = connector.Feed('incremental')

        for fname in json_files:
            with open(fname, 'r') as f:
                pages = json.loads(f.read())
                self.logger().info("%s: has %d records" % (fname, len(pages)))

                per_feed_counter = 0

                # slice the record array into smaller slices
                for page in pages:

                    url, html = (page['url'], page.get('result', None))

                    if (html):
                        url_hash = hashlib.md5(url).hexdigest()
                        feed.addRecord(url="http://reveal/recommendation/%s" %
                                       url_hash,
                                       displayurl=escape(url),
                                       action='add',
                                       mimetype='text/html',
                                       metadata={
                                           'content_source': 'Reveal',
                                           'rp_elastic_id': url_hash
                                       },
                                       content=html)

                        per_feed_counter += 1
                        if per_feed_counter < self.RECORD_PER_FEED:
                            continue

                    self.pushFeed(feed)
                    feed.clear()
                    n += per_feed_counter
                    per_feed_counter = 0
                else:
                    self.pushFeed(feed)
                    feed.clear()
                    n += per_feed_counter

        self.logger().info(
            'Congrats, work done! %d pages have been posted to GSA.' % n)
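
The run method above expects every *.json file under JSON_LOCATION to hold a list of page objects with a 'url' key and the fetched HTML under 'result'. A small illustrative writer for that layout; the output path and sample values are made up, only the field names come from the code above:

import json

# Illustrative input for the run() method above: a list of pages, each with a
# 'url' and the fetched HTML under 'result'. Pages without a usable 'result'
# are skipped by the loop.
sample_pages = [
    {'url': 'http://www.example.com/article/1', 'result': '<html>...</html>'},
    {'url': 'http://www.example.com/article/2'},
]

with open('/path/to/json/dir/sample.json', 'w') as f:
    json.dump(sample_pages, f)
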