Example 1

# Standard library and Django imports used by this function; the project's own
# models (Node, Subnet, Link, APClient, ...) and helpers (wifi_utils,
# data_archive, process_node, WORKER_POOL, DotTopologyPlotter) are assumed to
# be imported elsewhere in the surrounding module
import logging
import os
import time
from datetime import datetime, timedelta
from traceback import format_exc

from django.conf import settings
from django.db import transaction

def check_network_status():
  """
  Performs the network status check: refreshes node, link and subnet state
  from OLSR and pings all registered visible nodes.
  """
  # Mark all nodes, subnets and links as not visible and remove stale AP clients
  Node.objects.all().update(visible = False)
  Subnet.objects.all().update(visible = False)
  Link.objects.all().update(visible = False)
  APClient.objects.filter(last_update__lt = datetime.now() -  timedelta(minutes = 11)).delete()

  # Reset warning state for this monitoring pass
  NodeWarning.objects.all().update(source = EventSource.Monitor, dirty = False)
  Node.objects.all().update(warnings = False, conflicting_subnets = False)

  # Fetch routing tables from OLSR
  try:
    nodes, hna = wifi_utils.get_tables(settings.MONITOR_OLSR_HOST)
  except TypeError:
    logging.error("Unable to fetch routing tables from '%s'!" % settings.MONITOR_OLSR_HOST)
    return
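
  # 'nodes' maps each node IP to a routing table entry whose 'links' attribute
  # is a list of (peer_ip, lq, ilq, etx, vtime) tuples, and 'hna' maps node IPs
  # to lists of announced "subnet/cidr" strings (both formats inferred from
  # their use below)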

  # Ping nodes present in the database and visible in OLSR
  dbNodes = {}
  nodesToPing = []
  for nodeIp in nodes.keys():
    try:
      # Try to get the node from the database
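      # (get_exclusive is assumed to be a locking variant of get(), so that
      # concurrent monitor runs cannot race on the same node row)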
      n = Node.get_exclusive(ip = nodeIp)
      n.visible = True
      n.peers = len(nodes[nodeIp].links)

      # If we have succeeded, add to list (if not invalid)
      if not n.is_invalid():
        if n.awaiting_renumber:
          # Reset any status from awaiting renumber to invalid
          for notice in n.renumber_notices.all():
            try:
              rn = Node.objects.get(ip = notice.original_ip)
              if rn.status == NodeStatus.AwaitingRenumber:
                rn.status = NodeStatus.Invalid
                rn.node_type = NodeType.Unknown
                rn.awaiting_renumber = False
                rn.save()
            except Node.DoesNotExist:
              pass
            
            notice.delete()
          
          n.awaiting_renumber = False
          n.save()
        
        nodesToPing.append(nodeIp)
      else:
        n.last_seen = datetime.now()
        n.peers = len(nodes[nodeIp].links)
        
        # Create a warning since node is not registered
        NodeWarning.create(n, WarningCode.UnregisteredNode, EventSource.Monitor)
        n.save()
      
      dbNodes[nodeIp] = n
    except Node.DoesNotExist:
      # Node does not exist, create an invalid entry for it
      n = Node(ip = nodeIp, status = NodeStatus.Invalid, last_seen = datetime.now())
      n.visible = True
      n.node_type = NodeType.Unknown
      n.peers = len(nodes[nodeIp].links)
      
      # Check if there are any renumber notices for this IP address
      try:
        notice = RenumberNotice.objects.get(original_ip = nodeIp)
        n.status = NodeStatus.AwaitingRenumber
        n.node_type = notice.node.node_type
        n.awaiting_renumber = True
      except RenumberNotice.DoesNotExist:
        pass
      
      n.save(force_insert = True)
      dbNodes[nodeIp] = n

      # Create an event and append a warning since an unknown node has appeared
      NodeWarning.create(n, WarningCode.UnregisteredNode, EventSource.Monitor)
      Event.create_event(n, EventCode.UnknownNodeAppeared, '', EventSource.Monitor)
  
  # Add a warning to all nodes that have been stuck in renumbering state for over a week
  for node in Node.objects.filter(renumber_notices__renumbered_at__lt = datetime.now() - timedelta(days = 7)):
    NodeWarning.create(node, WarningCode.LongRenumber, EventSource.Monitor)
    node.save()
  
  # Mark invisible nodes as down
  for node in Node.objects.exclude(status__in = (NodeStatus.Invalid, NodeStatus.AwaitingRenumber)):
    oldStatus = node.status

    if node.ip not in dbNodes:
      if node.status == NodeStatus.New:
        node.status = NodeStatus.Pending
      elif node.status != NodeStatus.Pending:
        node.status = NodeStatus.Down
      node.save()

    if oldStatus in (NodeStatus.Up, NodeStatus.Visible, NodeStatus.Duped) and node.status == NodeStatus.Down:
      Event.create_event(node, EventCode.NodeDown, '', EventSource.Monitor)
      
      # Invalidate uptime credit for this node
      node.uptime_last = None
      node.save()
  
  # Generate timestamp and snapshot identifier
  timestamp = datetime.now()
  snapshot_id = int(time.time())
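  # (snapshot_id keys all topology archive entries recorded during this pass)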
  
  # Setup all node peerings
  for nodeIp, node in nodes.iteritems():
    n = dbNodes[nodeIp]
    n.redundancy_link = False
    links = []
    
    # Find old VPN server peers
    old_vpn_peers = set([p.dst for p in n.get_peers().filter(dst__vpn_server = True)])

    for peerIp, lq, ilq, etx, vtime in node.links:
      try:
        l = Link.objects.get(src = n, dst = dbNodes[peerIp])
      except Link.DoesNotExist:
        l = Link(src = n, dst = dbNodes[peerIp])
      
      l.lq = float(lq)
      l.ilq = float(ilq)
      l.etx = float(etx)
      l.vtime = vtime
      l.visible = True
      l.save()
      links.append(l)
      
      # Check if any of the peers has never peered with us before
      if n.is_adjacency_important() and l.dst.is_adjacency_important() and not n.peer_history.filter(pk = l.dst.pk).count():
        n.peer_history.add(l.dst)
        Event.create_event(n, EventCode.AdjacencyEstablished, '', EventSource.Monitor,
                           data = 'Peer node: %s' % l.dst, aggregate = False)
        Event.create_event(l.dst, EventCode.AdjacencyEstablished, '', EventSource.Monitor,
                           data = 'Peer node: %s' % n, aggregate = False)

      # Check if we have a peering with any VPN servers
      if l.dst.vpn_server:
        n.redundancy_link = True
    
    if not n.is_invalid():
      # Determine new VPN server peers
      new_vpn_peers = set([p.dst for p in n.get_peers().filter(visible = True, dst__vpn_server = True)])
      
      if old_vpn_peers != new_vpn_peers:
        for p in old_vpn_peers:
          if p not in new_vpn_peers:
            # Redundancy loss has occurred
            Event.create_event(n, EventCode.RedundancyLoss, '', EventSource.Monitor,
                               data = 'VPN server: %s' % p)
        
        for p in new_vpn_peers:
          if p not in old_vpn_peers:
            # Redundancy restoration has occurred
            Event.create_event(n, EventCode.RedundancyRestored, '', EventSource.Monitor,
                               data = 'VPN server: %s' % p)
      
      # Issue a warning when node requires peering but has none
      if n.redundancy_req and not n.redundancy_link:
        NodeWarning.create(n, WarningCode.NoRedundancy, EventSource.Monitor)
    
    n.save()
    
    # Archive topology information
    data_archive.record_topology_entry(snapshot_id, timestamp, n, links)

  # Update valid subnet status in the database
  for nodeIp, subnets in hna.iteritems():
    if nodeIp not in dbNodes:
      continue

    for subnet in subnets:
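      # Each HNA announce is a "network/prefix" string; split it into components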
      subnet, cidr = subnet.split("/")
      
      try:
        s = Subnet.objects.get(node__ip = nodeIp, subnet = subnet, cidr = int(cidr))
        s.last_seen = datetime.now()
        s.visible = True
      except Subnet.DoesNotExist:
        s = Subnet(node = dbNodes[nodeIp], subnet = subnet, cidr = int(cidr), last_seen = datetime.now())
        s.visible = True
        s.allocated = False
      
      # Save previous subnet status for later use
      old_status = s.status
      
      # Set status according to the allocation flag
      if s.allocated:
        s.status = SubnetStatus.AnnouncedOk
      else:
        s.status = SubnetStatus.NotAllocated
      
      # Check if this is a more specific prefix announce for an allocated prefix
      if s.is_more_specific() and not s.allocated:
        s.status = SubnetStatus.Subset
      
      # Check if this is a hijack
      try:
        origin = Subnet.objects.ip_filter(
          # Subnet overlaps with another one
          ip_subnet__contains = '%s/%s' % (subnet, cidr)
        ).exclude(
          # Of another node (= filter all subnets belonging to current node)
          node = s.node
        ).get(
          # That is allocated and visible
          allocated = True,
          visible = True
        )
        s.status = SubnetStatus.Hijacked
      except Subnet.DoesNotExist:
        pass
      
      # Generate an event if status has changed
      if old_status != s.status and s.status == SubnetStatus.Hijacked:
        Event.create_event(s.node, EventCode.SubnetHijacked, '', EventSource.Monitor,
                           data = 'Subnet: %s/%s\n  Allocated to: %s' % (s.subnet, s.cidr, origin.node))
      
      # Flag node entry with warnings flag for unregistered announces
      if not s.is_properly_announced():
        if s.node.border_router and not s.is_from_known_pool():
          # TODO when we have peering announce registration this should first check if
          #      the subnet is registered as a peering
          s.status = SubnetStatus.Peering
        
        if not s.node.border_router or s.status == SubnetStatus.Hijacked or s.is_from_known_pool():
          # Add a warning message for unregistered announced subnets
          NodeWarning.create(s.node, WarningCode.UnregisteredAnnounce, EventSource.Monitor)
          s.node.save()
      
      s.save()
      
      # Detect subnets that cause conflicts and raise warning flags for all involved
      # nodes
      if s.is_conflicting():
        NodeWarning.create(s.node, WarningCode.AnnounceConflict, EventSource.Monitor)
        s.node.conflicting_subnets = True
        s.node.save()
        
        for cs in s.get_conflicting_subnets():
          NodeWarning.create(cs.node, WarningCode.AnnounceConflict, EventSource.Monitor)
          cs.node.conflicting_subnets = True
          cs.node.save()
  
  # Remove subnets that were hijacked but are not visible anymore
  for s in Subnet.objects.filter(status = SubnetStatus.Hijacked, visible = False):
    Event.create_event(s.node, EventCode.SubnetRestored, '', EventSource.Monitor, data = 'Subnet: %s/%s' % (s.subnet, s.cidr))
    s.delete()
  
  # Remove (or change their status) subnets that are not visible
  Subnet.objects.filter(allocated = False, visible = False).delete()
  Subnet.objects.filter(allocated = True, visible = False).update(status = SubnetStatus.NotAnnounced)
  
  for subnet in Subnet.objects.filter(status = SubnetStatus.NotAnnounced, node__visible = True):
    NodeWarning.create(subnet.node, WarningCode.OwnNotAnnounced, EventSource.Monitor)
    subnet.node.save()
  
  # Remove invisible unknown nodes
  for node in Node.objects.filter(status = NodeStatus.Invalid, visible = False):
    # Create an event since an unknown node has disappeared
    Event.create_event(node, EventCode.UnknownNodeDisappeared, '', EventSource.Monitor)
  
  Node.objects.filter(status__in = (NodeStatus.Invalid, NodeStatus.AwaitingRenumber), visible = False).delete()
  
  # Remove invisible links
  Link.objects.filter(visible = False).delete()
  
  # Add nodes to topology map and generate output
  if not getattr(settings, 'MONITOR_DISABLE_GRAPHS', None):
    # Only generate topology when graphing is not disabled
    topology = DotTopologyPlotter()
    for node in dbNodes.values():
      topology.addNode(node)
    topology.save(os.path.join(settings.GRAPH_DIR, 'network_topology.png'), os.path.join(settings.GRAPH_DIR, 'network_topology.dot'))

  # Ping the nodes to prepare information for later node processing
  varsize_results = {}
  results, dupes = wifi_utils.ping_hosts(10, nodesToPing)
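  # Ping again with several payload sizes; 8 bytes are presumably subtracted to
  # account for the ICMP header, so the total packet matches the listed size.
  # The fourth field of each per-host result row is recorded for each size
  # (hosts missing from the results get None)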
  for packet_size in (100, 500, 1000, 1480):
    r, d = wifi_utils.ping_hosts(10, nodesToPing, packet_size - 8)
    for node_ip in nodesToPing:
      varsize_results.setdefault(node_ip, []).append(r[node_ip][3] if node_ip in r else None)
  
  if getattr(settings, 'MONITOR_DISABLE_MULTIPROCESSING', None):
    # Multiprocessing is disabled (the MONITOR_DISABLE_MULTIPROCESSING option is usually
    # used for debugging purposes, where a single process is preferred)
    for node_ip in nodesToPing:
      process_node(node_ip, results.get(node_ip), node_ip in dupes, nodes[node_ip].links, varsize_results.get(node_ip))
    
    # Commit the transaction here since we do everything in the same session
    transaction.commit()
  else:
    # We MUST commit the current transaction here, because we will be processing
    # some transactions in parallel and must ensure that this transaction that has
    # modified the nodes is committed. Otherwise this will deadlock!
    transaction.commit()
    
    worker_results = []
    for node_ip in nodesToPing:
      worker_results.append(
        WORKER_POOL.apply_async(process_node, (node_ip, results.get(node_ip), node_ip in dupes, nodes[node_ip].links, varsize_results.get(node_ip)))
      )
    
    # Wait for all workers to finish processing
    objects = {}
    for result in worker_results:
      try:
        k, v = result.get()
        objects[k] = v
      except Exception:
        logging.warning(format_exc())
    
    # When GC debugging is enabled make some additional computations
    if getattr(settings, 'MONITOR_ENABLE_GC_DEBUG', None):
      global _MAX_GC_OBJCOUNT
      objcount = sum(objects.values())
      
      if '_MAX_GC_OBJCOUNT' not in globals():
        _MAX_GC_OBJCOUNT = objcount
      
      logging.debug("GC object count: %d %s" % (objcount, "!M" if objcount > _MAX_GC_OBJCOUNT else ""))
      _MAX_GC_OBJCOUNT = max(_MAX_GC_OBJCOUNT, objcount)