Code Example #1
File: pull_live.py Project: epireve/recordedfuture
def main():
    options, args = parse_arguments()
    query = build_query(options, args)

    api = RFAPI(options.token)

    substitute_fields = ["attributes"]
    output_columns = [
        "id",
        "momentum",
        "positive",
        "negative",
        "canonical.id",
        "type",
        "document.id",
        "document.published",
        "document.downloaded",
        "start",
        "stop",
        "document.url",
        "document.title",
        "document.sourceId.id",
        "document.sourceId.name",
        "document.sourceId.media_type",
        "document.sourceId.topic",
        "document.sourceId.country",
        "fragment",
        "attributes",
    ]
    entity_columns = ["id", "name", "hits", "type", "momentum", "attributes"]

    out = csv.DictWriter(sys.stdout, output_columns, extrasaction="ignore")

    if query.get("aggregate") or query.get("output", {}).get("count"):
        res = api.query(query)
        print res
        return
    else:
        if options.header:
            out.writerow(dict(zip(output_columns, output_columns)))
        if options.entityfile:
            entityout = csv.DictWriter(open(options.entityfile, "w"), entity_columns, extrasaction="ignore")
            entityout.writerow(dict(zip(entity_columns, entity_columns)))

    for res in api.paged_query(query):
        for i in res["instances"]:
            i["positive"] = i.get("attributes", {}).get("positive", 0.0)
            i["negative"] = i.get("attributes", {}).get("negative", 0.0)
            out.writerow(encode_instance(flatten_instance(i, res["entities"], substitute_fields)))

        if options.entityfile:
            entities = pack_entity_attributes(res["entities"], entity_columns)
            for e in entities:
                # Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(encode_instance(flatten_instance(e, res["entities"], [])))

        if not options.page:
            break
Code Example #2
File: pull_live.py Project: reggiev/api
def main():
    options, args = parse_arguments()
    query = build_query(options, args)

    api = RFAPI(options.token)

    substitute_fields = ['attributes']
    output_columns = [
        'id', 'momentum', 'positive', 'negative', 'canonical.id', 'type',
        'document.id', 'document.published', 'document.downloaded', 'start',
        'stop', 'document.url', 'document.title', 'document.sourceId.id',
        'document.sourceId.name', 'document.sourceId.media_type',
        'document.sourceId.topic', 'document.sourceId.country', 'fragment',
        'attributes'
    ]
    entity_columns = ['id', 'name', 'hits', 'type', 'momentum', 'attributes']

    out = csv.DictWriter(sys.stdout, output_columns, extrasaction='ignore')

    if query.get('aggregate') or query.get('output', {}).get('count'):
        res = api.query(query)
        print res
        return

    if options.header:
        out.writerow(dict(zip(output_columns, output_columns)))
    if options.entityfile:
        entityout = csv.DictWriter(open(options.entityfile, 'w'),
                                   entity_columns,
                                   extrasaction='ignore')
        entityout.writerow(dict(zip(entity_columns, entity_columns)))

    for res in api.paged_query(query):
        for i in res['instances']:
            i['positive'] = i.get('attributes', {}).get('positive', 0.0)
            i['negative'] = i.get('attributes', {}).get('negative', 0.0)
            out.writerow(
                encode_instance(
                    flatten_instance(i, res['entities'], substitute_fields)))

        if options.entityfile:
            entities = pack_entity_attributes(res['entities'], entity_columns)
            for e in entities:
                #Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(
                    encode_instance(flatten_instance(e, res['entities'], [])))

        if not options.page:
            break
Code Example #3
File: pull_live.py Project: Brentbin/recordedfuture
def main():
    options, args = parse_arguments()
    query = build_query(options, args)

    api = RFAPI(options.token)

    substitute_fields = ['attributes']
    output_columns = ['id', 'momentum', 'positive', 'negative', 'canonical.id',
            'type', 'document.id', 'document.published', 'document.downloaded',
            'start', 'stop', 'document.url', 'document.title',
            'document.sourceId.id', 'document.sourceId.name',
            'document.sourceId.media_type', 'document.sourceId.topic',
            'document.sourceId.country', 'fragment', 'attributes']
    entity_columns = ['id', 'name', 'hits', 'type', 'momentum', 'attributes']

    out = csv.DictWriter(sys.stdout, output_columns, extrasaction='ignore')

    if query.get('aggregate') or query.get('output', {}).get('count'):
        res = api.query(query)
        print res
        return
        
    if options.header:
        out.writerow(dict(zip(output_columns, output_columns)))
    if options.entityfile:
        entityout = csv.DictWriter(open(options.entityfile, 'w'), entity_columns, extrasaction='ignore')
        entityout.writerow(dict(zip(entity_columns, entity_columns)))

    for res in api.paged_query(query):
        for i in res['instances']:
            i['positive'] = i.get('attributes', {}).get('positive', 0.0)
            i['negative'] = i.get('attributes', {}).get('negative', 0.0)
            out.writerow(encode_instance(flatten_instance(i, res['entities'], substitute_fields)))

        if options.entityfile:
            entities = pack_entity_attributes(res['entities'], entity_columns)
            for e in entities:
                #Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(encode_instance(flatten_instance(e, res['entities'], [])))

        if not options.page:
            break
Code Example #4
def main():
    # Construct an RFAPI client object
    rfqapi = RFAPI(TOKEN)

    # Query for the metadata
    mdata_result = rfqapi.paged_query(q)

    # Loop over all the metadata types and their attributes
    for metadata in mdata_result:
        mdata_types = metadata['types']
        for md_type in mdata_types:
            # Print each root metadata type
            parent_type = ""
            if 'parent' in md_type:
                parent_type = str(md_type['parent'])
            print md_type['name'] + "(" + parent_type + ")"

            # Loop over attributes in this metadata type and print their corresponding types
            for md_attr_list in md_type['attrs']:
                print_attributes(md_attr_list)
Code Example #5
File: generate_bro_intel.py Project: eyvindn/api
    "data_group": "Hash",
    "limit": 10000,
    "attributes": [
      {
        "name": "stats.metrics.riskScore",
        "range": {
          "gte": args.hash_risk_floor
        }
      }
    ]
  },
  "output": {
    "exclude": [
      "stats.entity_lists"
    ],
    "inline_entities": True
  },
  "search_type": "scan"
}
for page in api.paged_query(hash_query):
    for ev in page['events']:
        ent = ev['attributes']['entities'][0]
        print('\t'.join([
            ent['name'], 'Intel::FILE_HASH',
            meta_source, intel_summ_link(ent['id']),
            do_notice, 'Files::IN_HASH'
            ]))
        c += 1

sys.stderr.write('Generated intel for {0} indicators.\n'.format(c))
Code Example #6
api = RFAPI(args.token)
hash_query = {
    "cluster": {
        "data_group":
        "Hash",
        "limit":
        10000,
        "attributes": [{
            "name": "stats.metrics.riskScore",
            "range": {
                "gte": args.hash_risk_floor
            }
        }]
    },
    "output": {
        "exclude": ["stats.entity_lists"],
        "inline_entities": True
    },
    "search_type": "scan"
}
for page in api.paged_query(hash_query):
    for ev in page['events']:
        ent = ev['attributes']['entities'][0]
        print('\t'.join([
            ent['name'], 'Intel::FILE_HASH', meta_source,
            intel_summ_link(ent['id']), do_notice, 'Files::IN_HASH'
        ]))
        c += 1

sys.stderr.write('Generated intel for {0} indicators.\n'.format(c))
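The two snippets above use several names that are defined elsewhere in generate_bro_intel.py and are not shown here: args (parsed command-line options), meta_source, do_notice, intel_summ_link, and the counter c. The following is a minimal, purely illustrative sketch of what such a prelude could look like; the argparse flags, the intel column values, and the intel_summ_link URL are all hypothetical.

import argparse
import sys

from RFAPI import RFAPI  # assumption: the RF API client class is importable this way

# Hypothetical command-line options matching the attributes the snippets read.
parser = argparse.ArgumentParser(description='Generate Bro/Zeek intel from Recorded Future.')
parser.add_argument('--token', required=True, help='Recorded Future API token')
parser.add_argument('--hash_risk_floor', type=int, default=65,
                    help='minimum risk score for hash indicators (illustrative default)')
args = parser.parse_args()

# Illustrative values for the Bro/Zeek intel columns in the tab-joined lines printed above.
meta_source = 'Recorded Future'
do_notice = 'T'


def intel_summ_link(rfid):
    # Hypothetical helper: build a link back to the entity page for an RFID.
    return 'https://www.recordedfuture.com/live/sc/entity/%s' % rfid


c = 0  # counter for the number of intel lines generated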
Code Example #7
class IOCEnricher(object):
    '''Enriches a list of IOCs with data from Recorded Future.
    '''

    _VALID_TYPES = ["IpAddress",
                    "Hash",
                    "InternetDomainName"]
    _INSTANCES_OR_DOCUMENTS = 'instances'
    _MALICIOUS_INDICATORS = ["compromised",
                             "malicious",
                             "suspected", 
                             "threat", 
                             "malware", 
                             "infected", 
                             "honeypot", 
                             "attacked from", 
                             "exploit", 
                             "attacks from", 
                             "bad http request from", 
                             "attack detected", 
                             "attack deteted"]
    _RELATED_ENTITY_TYPES = ['Malware',
                             'CyberVulnerability',
                             'IpAddress',
                             'Hash',
                             'InternetDomainName']
    # can be "document" also, but enrichment will take much longer
    # to pull document-level co-entities. fragment-level will use
    # extended_entities where available
    _RELATED_ENTITY_SCOPE = "fragment"
    _FEATURES = {"debug": collections.OrderedDict([("RFID", ""),
                                                   ("EntityType", ""),
                                                   ("TotalHits", 0),
                                                   ("7DayHits", 0), 
                                                   ("1DayHits", 0),
                                                   ("MaliciousHits", 0), 
                                                   ("InfoSecHits", 0),
                                                   ("PasteHits", 0), 
                                                   ("SocialMediaHits", 0)]),
                 "related": collections.OrderedDict([("RelatedMalware", []),
                                                     ("RelatedCyberVulnerability", []), 
                                                     ("RelatedIpAddress", []), 
                                                     ("RelatedInternetDomainName", []),
                                                     ("RelatedHash", []),
                                                     ("RelatedMalwareCount", 0), 
                                                     ("RelatedCyberVulnerabilityCount", 0),
                                                     ("RelatedIpAddressCount", 0), 
                                                     ("RelatedInternetDomainNameCount", 0),
                                                     ("RelatedHashCount", 0),
                                                     ("Score", 0.0)]),
                 "core": collections.OrderedDict([("Name", ""),
                                                  ("RFURL", ""),
                                                  ("MostRecent", ""),
                                                  ("MostRecentSource", ""),
                                                  ("MostRecentTitle", ""),
                                                  ("MostRecentFragment", ""),
                                                  ("MostRecentURL", ""),
                                                  ("RecentInfoSecSource", ""), 
                                                  ("RecentInfoSecTitle", ""),
                                                  ("RecentInfoSecFragment", ""), 
                                                  ("RecentInfoSecURL", ""), 
                                                  ("RecentPasteSource", ""), 
                                                  ("RecentPasteTitle", ""), 
                                                  ("RecentPasteFragment", ""), 
                                                  ("RecentPasteURL", ""),
                                                  ("RecentSocialMediaSource", ""), 
                                                  ("RecentSocialMediaTitle", ""), 
                                                  ("RecentSocialMediaFragment", ""), 
                                                  ("RecentSocialMediaURL", ""), 
                                                  ("FirstSource", ""), 
                                                  ("FirstTitle", ""), 
                                                  ("FirstFragment", ""), 
                                                  ("FirstURL", ""), 
                                                  ("FirstPublished", "")])}
    
    def __init__(self, token, iocs, entity_type, mode='core'):
        '''
        Parameters
        ----------
        token : str
            Recorded Future API token
        iocs : list or dict
            List of IOCs to enrich, or a dict mapping IOC names to their RFIDs.
        entity_type : {"IpAddress", "Hash", "InternetDomainName"}
            Name of Recorded Future entity type for IOC.
        mode : {"core", "related", "debug"}
            Subset of features to return with enrichment. "core" is default.
        '''
        self.rfqapi = RFAPI(token)
        self.response = collections.OrderedDict()
        # need all features early for scoring; they're removed later
        # need to test whether this can be avoided
        keys = self._FEATURES['core']
        keys.update(self._FEATURES['debug'])
        if mode in ('related', 'debug'):
            keys.update(self._FEATURES['related'])
        if mode not in ('core', 'related', 'debug'):
            raise ValueError('"mode" must be one of ("core", "related", "debug"). Input: %s.' % mode)
        self.mode = mode
        self.entity_type = entity_type
        if isinstance(iocs, list):
            self.iocs = self._get_rfids(iocs)
        elif isinstance(iocs, dict):
            self.iocs = iocs
        else:
            raise ValueError('"iocs" must be list or dict.')
        for ioc in self.iocs:
            new_resp = {}
            for key in keys:
                new_resp[key] = keys[key]
                if key == 'Name':
                    new_resp[key] = ioc
                elif key == 'RFID':
                    new_resp[key] = self.iocs[ioc]
                elif key == 'EntityType':
                    new_resp[key] = self.entity_type
            self.response[ioc] = new_resp
        self.keys = keys
    
    def get_keys(self, mode=None):
        '''Getter for the keys in the response.
        '''
        return [key for key in self.keys if key not in self._get_extra_features(mode)]
    
    def _get_extra_features(self, mode=None):
        if not mode:
            mode = self.mode
        extra_features = []
        if mode in ('core', 'related'):
            extra_features = self._FEATURES['debug'].keys()
        return extra_features
        
    def enrich(self):
        '''Enriches the given IOCs.
        Returns
        -------
        response : dict
            The enrichment package containing all keys requested by the "mode" parameter.
        max_index
            The most recent document "indexed" value seen across the retrieved references.
        '''
        print "    Getting all references"
        max_index = None
        for names in _chunks(self.iocs.keys(), 250):
            refs, edetails = self._get_all_references(names)
            print "      Getting enrichment from references"
            max_index_cand = self._get_enrichment(refs, edetails)
            if max_index_cand < max_index or not max_index:
                # using < here because the references are no longer retrieved all from
                # the same query, so there may be timings, so we're looking at the minimax
                max_index = max_index_cand
            print "      Getting URL and Score"
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            # Get RF URL
            if 'RFURL' in ioc_resp:
                ioc_resp['RFURL'] = _generate_rfURL_from_entity(ioc, ioc_resp.get('RFID', None))
            # Score the ref
            if 'Score' in ioc_resp:
                self.score(ioc_resp)
            # Remove unnecessary features
            extra_features = self._get_extra_features()
            for key in extra_features:
                del ioc_resp[key]
        return self.response, max_index

    def score(self, ioc_resp):
        spec_keys = ('7DayHits', '1DayHits')
        nonzero_keys = ('MaliciousHits',
                        'InfoSecHits',
                        'PasteHits',
                        'RelatedMalwareCount',
                        'RelatedCyberVulnerabilityCount',
                        'RelatedIpAddressCount',
                        'RelatedInternetDomainNameCount',
                        'RelatedHashCount')
        max_score = 0.0
        # score special keys
        if 'TotalHits' in self.keys:
            for key in filter(lambda k: k in self.keys, spec_keys):
                if ((ioc_resp[key]*2) > ioc_resp["TotalHits"]): 
                    ioc_resp['Score'] += 1
                max_score += len(spec_keys)
        # score nonzero keys
        for key in filter(lambda k: k in self.keys, nonzero_keys):
            if ioc_resp[key] > 0:
                ioc_resp['Score'] += 1
            max_score += 1
        ioc_resp['Score'] = ioc_resp['Score'] / max_score        

    def _get_enrichment(self, refs, edetails):
        max_index = None
        today = datetime.datetime.today()
        one_day_hit_string = _rfid_date_conv(today - datetime.timedelta(days=1))
        seven_day_hit_string = _rfid_date_conv(today - datetime.timedelta(days=7))
        
        # first get everything from all references
        print "    Processing references"
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        recent_pub = {"MostRecent": {},
                      "Paste": {},
                      "InfoSec": {},
                      "SocialMedia": {}}
        first_pub = {}
        for ref in refs:
            indexed = ref['document']['indexed']
            if indexed > max_index or not max_index:
                max_index = indexed
            fragment = ref['fragment'].lower()
            attrs = ref['attributes']
            source_topic = ref['document']['sourceId'].get('topic', None)
            source_media_type = ref['document']['sourceId'].get('media_type', None)
            pub_date = ref['document']['published']
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc, attrs.get('entities', []))
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [ioc for ioc in ioc_to_rfid if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)]
            # increment hit counts and get recent hits
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                # update dates
                recent_pub['MostRecent'][ioc] = self._safe_update_date(ioc_resp, 
                                                                       pub_date, 
                                                                       recent_pub['MostRecent'][ioc] if ioc in recent_pub['MostRecent'] else '', 
                                                                       'MostRecent', 
                                                                       pub_date > recent_pub['MostRecent'][ioc] if ioc in recent_pub['MostRecent'] and len(recent_pub['MostRecent'][ioc]) > 0 else True)
                first_pub[ioc] = self._safe_update_date(ioc_resp, 
                                                        pub_date, 
                                                        first_pub[ioc] if ioc in first_pub else '', 
                                                        'FirstPublished', 
                                                        pub_date < first_pub[ioc] if ioc in first_pub and len(first_pub[ioc]) > 0 else True)
                # update hit counters
                self._safe_update_hits(ioc_resp, 
                                       'TotalHits', 
                                       True)
                self._safe_update_hits(ioc_resp, 
                                       '1DayHits', 
                                       pub_date >= one_day_hit_string)
                self._safe_update_hits(ioc_resp, 
                                       '7DayHits', 
                                       pub_date >= seven_day_hit_string)
                self._safe_update_hits(ioc_resp, 
                                       'MaliciousHits', 
                                       any(term in fragment for term in self._MALICIOUS_INDICATORS))
                # update hit counters and references
                conditions = {"InfoSec": source_topic == 'KPzZAE',
                              "Paste": source_media_type == 'KDS1Zp',
                              "SocialMedia": source_media_type == 'JxSEtC'}
                for key in conditions:
                    condition = conditions[key]
                    recent_pub[key][ioc] = self._safe_update_hits_and_refs(ioc_resp,
                                                                           ref,
                                                                           key,
                                                                           condition,
                                                                           recent_pub[key][ioc] if ioc in recent_pub[key] else '',
                                                                           pub_date > recent_pub[key][ioc] if ioc in recent_pub[key] and len(recent_pub[key][ioc]) > 0 else True)
                # update references for first and recent
                self._safe_update_refs(ioc_resp, 
                                       ref, 
                                       'MostRecent', 
                                       pub_date == recent_pub['MostRecent'][ioc])
                self._safe_update_refs(ioc_resp, 
                                       ref, 
                                       'First', 
                                       pub_date == first_pub[ioc])
        # get related content at fragment scope
        if self.mode in ('debug', 'related') and self._RELATED_ENTITY_SCOPE == 'fragment':
            self._safe_get_related_entities_from_frags(refs, edetails)
        # get related content at document scope
        if self.mode in ('debug', 'related') and self._RELATED_ENTITY_SCOPE == 'document':
            # print "Getting related content from documents"
            docs = self._get_docs()
            self._safe_get_related_entities_from_docs(docs)
        return max_index    

    def _safe_update_hits_and_refs(self, ioc_resp, ref, key, condition, cur_date, date_condition):
        pub_date = ref['document']['published']
        date_update = self._safe_update_date(ioc_resp, pub_date, cur_date, key, date_condition and condition)
        if condition:
            # update hits
            self._safe_update_hits(ioc_resp, key + 'Hits', condition)
            # get recent frags
            self._safe_update_refs(ioc_resp, ref, 'Recent' + key, pub_date == date_update)
        return date_update
                    
    def _safe_update_date(self, ioc_resp, date, existing_val, key, condition):
        if condition and key in ioc_resp:
            ioc_resp[key] = date
        return date if condition else existing_val
    
    def _safe_update_hits(self, ioc_resp, key, condition):
        if condition and key in ioc_resp:
            ioc_resp[key] += 1

    def _safe_update_refs(self, ioc_resp, ref, key, condition):
        if condition:
            key_suffixes = {'Source': ref['document']['sourceId']['name'].replace('\n', ' ').replace('\r', ' '),
                            'Title': ref['document']['title'].replace('\n', ' ').replace('\r', ' '), 
                            'Fragment': ref['fragment'].replace('\n', ' ').replace('\r', ' '), 
                            'URL': ref['document']['url'] if 'url' in ref['document'] else ''}
            for suffix in filter(lambda suf: key + suf in ioc_resp, key_suffixes):
                ioc_resp[key + suffix] = key_suffixes[suffix]
                    
    def _get_all_references(self, names):
        refs = []
        seen_ids = set()
        edetails = {}
        q = {"instance": {"type": "Event",
                          "limit": 25000,
                          "searchtype": "scan"}}
        q['instance']['attributes'] = [[{"name": "Event.event_fragment", 'string': names}]]
        rfids = [self.iocs[name] for name in names if self.iocs[name]]
        q['instance']['attributes'][0].append({"name": "entities",
                                               "entity": {"id": rfids}})
        # print len(self.iocs.keys()),
        for res in self.rfqapi.paged_query(q):
            refs.extend([inst for inst in res['instances'] if inst['id'] not in seen_ids])
            seen_ids.update([inst['id'] for inst in res['instances']])
            edetails.update({ eid: res['entities'][eid] for eid in res['entities'] if res['entities'][eid]['type'] in self._RELATED_ENTITY_TYPES})
        return refs, edetails

    def _get_docs(self):
        all_docs = set()
        for names in _chunks(self.iocs.keys(), 250):
            q = {"instance": {"type": "Event"},
                 "output": {"count": {"axis": [{"name": "attributes.entities",
                                                "type": [self.entity_type],
                                                "aspect": "name"},
                                                "document"],
                                      "values": [self._INSTANCES_OR_DOCUMENTS]}}}
            q['instance']['attributes'] = [[{"name": "Event.event_fragment", 'string': names}]]
            rfids = [self.iocs[name] for name in names if self.iocs[name]]
            q['instance']['attributes'][0].append({"name": "entities",
                                                   "entity": {"id": rfids}})
            res = self.rfqapi.query(q)
            counts = res["counts"][0]
            if len(counts) != 0:
                for ioc in filter(lambda i: i in self.iocs, counts):
                    docids = counts[ioc].keys()
                    self.response[ioc]['DocumentIds'] = docids
                    all_docs.update(docids)
        return list(all_docs)

    def _safe_get_related_entities_from_frags(self, refs, edetails):
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        entities_to_lookup = set()
        for ref in refs:
            related_ents = ref['attributes'].get('extended_entities', ref['attributes'].get('entities', []))
            entities_to_lookup.update([eid for eid in related_ents if eid not in edetails])
        # print "Updating entity resolution"
        edetails.update(self._resolve_related_entities(list(entities_to_lookup)))
        # print "Updated related entities"
        for ref in refs:
            fragment = ref['fragment'].lower()
            # get related entities from reference
            related_ents = ref['attributes'].get('extended_entities', ref['attributes'].get('entities', []))
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc, related_ents)
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [ioc for ioc in ioc_to_rfid if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)]
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                for ent in filter(lambda eid: eid in edetails and eid != ioc_resp['RFID'], related_ents):
                    etype, name = edetails[ent]['type'], edetails[ent]['name']
                    if name not in ioc_resp['Related' + etype]:
                        ioc_resp['Related' + etype].append(name)
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _resolve_related_entities(self, eids):
        if len(eids) == 0:
            return {}
        results = {}
        for ents in _chunks(eids, 250):
            q = {"entity": {"id": ents,
                            "limit": 1001}}
            res = self.rfqapi.query(q)
            results.update({ eid: res['entity_details'][eid] for eid in res['entity_details'] if res['entity_details'][eid]['type'] in self._RELATED_ENTITY_TYPES })
        return results

    def _safe_get_related_entities_from_docs(self, docs):
        for docids in _chunks(docs, 250):
            q = {"instance": {"type": "Event",
                              "document": {"id": docids}},
                 "output": {"count": {"axis": ["document",
                                               {"name": "attributes.entities",
                                                "type": self._RELATED_ENTITY_TYPES,
                                                "aspect": "all"}],
                                      "values": [self._INSTANCES_OR_DOCUMENTS]}}}
            res = self.rfqapi.query(q)
            counts = res['counts'][0]
            for ioc in self.response:
                ioc_resp = self.response[ioc]
                for docid in filter(lambda did: did in counts, ioc_resp['DocumentIds']):
                    for asp_name in filter(lambda n: n != 'NONE', counts[docid]):
                        name, unused, etype = rf_agg_name_parser(asp_name)
                        if name == ioc: continue
                        # update related counts
                        if name not in ioc_resp['Related' + etype]:
                            ioc_resp['Related' + etype].append(name)
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            if 'DocumentIds' not in self.keys and 'DocumentIds' in ioc_resp:
                del ioc_resp['DocumentIds']
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]
    
    def _get_rfids(self, iocs):
        new_iocs = collections.OrderedDict()
        edetails = {}
        for names in _chunks(iocs, 250):
            if len(names) == 0: continue
            q = {"entity": {"name": names, 
                            "type": self.entity_type,
                            "limit": 501}}
            res = self.rfqapi.query(q)
            if len(res['entities']) == 0: continue
            for ent in res['entities']:
                edetails[res['entity_details'][ent]['name']] = ent
        for ioc in iocs:
            new_iocs[ioc] = edetails[ioc] if ioc in edetails else None
        return new_iocs
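A minimal usage sketch for the IOCEnricher class above. The ioc_enricher module name and the RF_TOKEN placeholder are assumptions for illustration; the constructor, enrich(), and get_keys() calls follow the signatures shown in the snippet. The CSV goes to a file because enrich() prints progress messages to stdout.

import csv

from ioc_enricher import IOCEnricher  # assumption: hypothetical module name

RF_TOKEN = '...'  # placeholder for a real Recorded Future API token

# Enrich a couple of IP-address IOCs with the default "core" feature set.
iocs = ['198.51.100.7', '203.0.113.12']
enricher = IOCEnricher(RF_TOKEN, iocs, 'IpAddress', mode='core')

# enrich() returns the per-IOC feature dicts and the newest document "indexed" value seen.
response, max_index = enricher.enrich()

# Write one CSV row per IOC, reusing the key list the enricher reports for this mode.
keys = enricher.get_keys()
with open('ioc_enrichment.csv', 'wb') as fh:
    out = csv.DictWriter(fh, keys, extrasaction='ignore')
    out.writerow(dict(zip(keys, keys)))
    for ioc in response:
        out.writerow(response[ioc])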
Code Example #8
File: ioc_enricher.py Project: reggiev/api
class IOCEnricher(object):
    '''Enriches a list of IOCs with data from Recorded Future.
    '''

    _VALID_TYPES = ["IpAddress", "Hash", "InternetDomainName"]
    _INSTANCES_OR_DOCUMENTS = 'instances'
    _MALICIOUS_INDICATORS = [
        "compromised", "malicious", "suspected", "threat", "malware",
        "infected", "honeypot", "attacked from", "exploit", "attacks from",
        "bad http request from", "attack detected", "attack deteted"
    ]
    _RELATED_ENTITY_TYPES = [
        'Malware', 'CyberVulnerability', 'IpAddress', 'Hash',
        'InternetDomainName'
    ]
    # can be "document" also, but enrichment will take much longer
    # to pull document-level co-entities. fragment-level will use
    # extended_entities where available
    _RELATED_ENTITY_SCOPE = "fragment"
    _FEATURES = {
        "debug":
        collections.OrderedDict([("RFID", ""), ("EntityType", ""),
                                 ("TotalHits", 0), ("7DayHits", 0),
                                 ("1DayHits", 0), ("MaliciousHits", 0),
                                 ("InfoSecHits", 0), ("PasteHits", 0),
                                 ("SocialMediaHits", 0)]),
        "related":
        collections.OrderedDict([("RelatedMalware", []),
                                 ("RelatedCyberVulnerability", []),
                                 ("RelatedIpAddress", []),
                                 ("RelatedInternetDomainName", []),
                                 ("RelatedHash", []),
                                 ("RelatedMalwareCount", 0),
                                 ("RelatedCyberVulnerabilityCount", 0),
                                 ("RelatedIpAddressCount", 0),
                                 ("RelatedInternetDomainNameCount", 0),
                                 ("RelatedHashCount", 0), ("Score", 0.0)]),
        "core":
        collections.OrderedDict([("Name", ""), ("RFURL", ""),
                                 ("MostRecent", ""), ("MostRecentSource", ""),
                                 ("MostRecentTitle", ""),
                                 ("MostRecentFragment", ""),
                                 ("MostRecentURL", ""),
                                 ("RecentInfoSecSource", ""),
                                 ("RecentInfoSecTitle", ""),
                                 ("RecentInfoSecFragment", ""),
                                 ("RecentInfoSecURL", ""),
                                 ("RecentPasteSource", ""),
                                 ("RecentPasteTitle", ""),
                                 ("RecentPasteFragment", ""),
                                 ("RecentPasteURL", ""),
                                 ("RecentSocialMediaSource", ""),
                                 ("RecentSocialMediaTitle", ""),
                                 ("RecentSocialMediaFragment", ""),
                                 ("RecentSocialMediaURL", ""),
                                 ("FirstSource", ""), ("FirstTitle", ""),
                                 ("FirstFragment", ""), ("FirstURL", ""),
                                 ("FirstPublished", "")])
    }

    def __init__(self, token, iocs, entity_type, mode='core'):
        '''
        Parameters
        ----------
        token : str
            Recorded Future API token
        iocs : list or dict
            List of IOCs to enrich, or a dict mapping IOC names to their RFIDs.
        entity_type : {"IpAddress", "Hash", "InternetDomainName"}
            Name of Recorded Future entity type for IOC.
        mode : {"core", "related", "debug"}
            Subset of features to return with enrichment. "core" is default.
        '''
        self.rfqapi = RFAPI(token)
        self.response = collections.OrderedDict()
        # need all features early for scoring; they're removed later
        # need to test whether this can be avoided
        keys = self._FEATURES['core']
        keys.update(self._FEATURES['debug'])
        if mode in ('related', 'debug'):
            keys.update(self._FEATURES['related'])
        if mode not in ('core', 'related', 'debug'):
            raise ValueError(
                '"mode" must be one of ("core", "related", "debug"). Input: %s.'
                % mode)
        self.mode = mode
        self.entity_type = entity_type
        if isinstance(iocs, list):
            self.iocs = self._get_rfids(iocs)
        elif isinstance(iocs, dict):
            self.iocs = iocs
        else:
            raise ValueError('"iocs" must be list or dict.')
        for ioc in self.iocs:
            new_resp = {}
            for key in keys:
                new_resp[key] = keys[key]
                if key == 'Name':
                    new_resp[key] = ioc
                elif key == 'RFID':
                    new_resp[key] = self.iocs[ioc]
                elif key == 'EntityType':
                    new_resp[key] = self.entity_type
            self.response[ioc] = new_resp
        self.keys = keys

    def get_keys(self, mode=None):
        '''Getter for the keys in the response.
        '''
        return [
            key for key in self.keys
            if key not in self._get_extra_features(mode)
        ]

    def _get_extra_features(self, mode=None):
        if not mode:
            mode = self.mode
        extra_features = []
        if mode in ('core', 'related'):
            extra_features = self._FEATURES['debug'].keys()
        return extra_features

    def enrich(self):
        '''Enriches the given IOCs.
        Returns
        -------
        response : dict
            The enrichment package containing all keys requested by the "mode" parameter.
        max_index
            The most recent document "indexed" value seen across the retrieved references.
        '''
        print "    Getting all references"
        max_index = None
        for names in _chunks(self.iocs.keys(), 250):
            refs, edetails = self._get_all_references(names)
            print "      Getting enrichment from references"
            max_index_cand = self._get_enrichment(refs, edetails)
            if max_index_cand < max_index or not max_index:
                # using < here because the references are no longer retrieved all from
                # the same query, so there may be timings, so we're looking at the minimax
                max_index = max_index_cand
            print "      Getting URL and Score"
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            # Get RF URL
            if 'RFURL' in ioc_resp:
                ioc_resp['RFURL'] = _generate_rfURL_from_entity(
                    ioc, ioc_resp.get('RFID', None))
            # Score the ref
            if 'Score' in ioc_resp:
                self.score(ioc_resp)
            # Remove unnecessary features
            extra_features = self._get_extra_features()
            for key in extra_features:
                del ioc_resp[key]
        return self.response, max_index

    def score(self, ioc_resp):
        spec_keys = ('7DayHits', '1DayHits')
        nonzero_keys = ('MaliciousHits', 'InfoSecHits', 'PasteHits',
                        'RelatedMalwareCount',
                        'RelatedCyberVulnerabilityCount',
                        'RelatedIpAddressCount',
                        'RelatedInternetDomainNameCount', 'RelatedHashCount')
        max_score = 0.0
        # score special keys
        if 'TotalHits' in self.keys:
            for key in filter(lambda k: k in self.keys, spec_keys):
                if ((ioc_resp[key] * 2) > ioc_resp["TotalHits"]):
                    ioc_resp['Score'] += 1
                max_score += len(spec_keys)
        # score nonzero keys
        for key in filter(lambda k: k in self.keys, nonzero_keys):
            if ioc_resp[key] > 0:
                ioc_resp['Score'] += 1
            max_score += 1
        ioc_resp['Score'] = ioc_resp['Score'] / max_score

    def _get_enrichment(self, refs, edetails):
        max_index = None
        today = datetime.datetime.today()
        one_day_hit_string = _rfid_date_conv(today -
                                             datetime.timedelta(days=1))
        seven_day_hit_string = _rfid_date_conv(today -
                                               datetime.timedelta(days=7))

        # first get everything from all references
        print "    Processing references"
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        recent_pub = {
            "MostRecent": {},
            "Paste": {},
            "InfoSec": {},
            "SocialMedia": {}
        }
        first_pub = {}
        for ref in refs:
            indexed = ref['document']['indexed']
            if indexed > max_index or not max_index:
                max_index = indexed
            fragment = ref['fragment'].lower()
            attrs = ref['attributes']
            source_topic = ref['document']['sourceId'].get('topic', None)
            source_media_type = ref['document']['sourceId'].get(
                'media_type', None)
            pub_date = ref['document']['published']
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc,
                           attrs.get('entities', []))
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [
                ioc for ioc in ioc_to_rfid
                if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)
            ]
            # increment hit counts and get recent hits
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                # update dates
                recent_pub['MostRecent'][ioc] = self._safe_update_date(
                    ioc_resp, pub_date, recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent'] else '', 'MostRecent',
                    pub_date > recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent']
                    and len(recent_pub['MostRecent'][ioc]) > 0 else True)
                first_pub[ioc] = self._safe_update_date(
                    ioc_resp, pub_date,
                    first_pub[ioc] if ioc in first_pub else '',
                    'FirstPublished', pub_date < first_pub[ioc]
                    if ioc in first_pub and len(first_pub[ioc]) > 0 else True)
                # update hit counters
                self._safe_update_hits(ioc_resp, 'TotalHits', True)
                self._safe_update_hits(ioc_resp, '1DayHits',
                                       pub_date >= one_day_hit_string)
                self._safe_update_hits(ioc_resp, '7DayHits',
                                       pub_date >= seven_day_hit_string)
                self._safe_update_hits(
                    ioc_resp, 'MaliciousHits',
                    any(term in fragment
                        for term in self._MALICIOUS_INDICATORS))
                # update hit counters and references
                conditions = {
                    "InfoSec": source_topic == 'KPzZAE',
                    "Paste": source_media_type == 'KDS1Zp',
                    "SocialMedia": source_media_type == 'JxSEtC'
                }
                for key in conditions:
                    condition = conditions[key]
                    recent_pub[key][ioc] = self._safe_update_hits_and_refs(
                        ioc_resp, ref, key, condition,
                        recent_pub[key][ioc] if ioc in recent_pub[key] else '',
                        pub_date > recent_pub[key][ioc]
                        if ioc in recent_pub[key]
                        and len(recent_pub[key][ioc]) > 0 else True)
                # update references for first and recent
                self._safe_update_refs(
                    ioc_resp, ref, 'MostRecent',
                    pub_date == recent_pub['MostRecent'][ioc])
                self._safe_update_refs(ioc_resp, ref, 'First',
                                       pub_date == first_pub[ioc])
        # get related content at fragment scope
        if self.mode in ('debug', 'related'
                         ) and self._RELATED_ENTITY_SCOPE == 'fragment':
            self._safe_get_related_entities_from_frags(refs, edetails)
        # get related content at document scope
        if self.mode in ('debug', 'related'
                         ) and self._RELATED_ENTITY_SCOPE == 'document':
            # print "Getting related content from documents"
            docs = self._get_docs()
            self._safe_get_related_entities_from_docs(docs)
        return max_index

    def _safe_update_hits_and_refs(self, ioc_resp, ref, key, condition,
                                   cur_date, date_condition):
        pub_date = ref['document']['published']
        date_update = self._safe_update_date(ioc_resp, pub_date, cur_date, key,
                                             date_condition and condition)
        if condition:
            # update hits
            self._safe_update_hits(ioc_resp, key + 'Hits', condition)
            # get recent frags
            self._safe_update_refs(ioc_resp, ref, 'Recent' + key,
                                   pub_date == date_update)
        return date_update

    def _safe_update_date(self, ioc_resp, date, existing_val, key, condition):
        if condition and key in ioc_resp:
            ioc_resp[key] = date
        return date if condition else existing_val

    def _safe_update_hits(self, ioc_resp, key, condition):
        if condition and key in ioc_resp:
            ioc_resp[key] += 1

    def _safe_update_refs(self, ioc_resp, ref, key, condition):
        if condition:
            key_suffixes = {
                'Source':
                ref['document']['sourceId']['name'].replace('\n', ' ').replace(
                    '\r', ' '),
                'Title':
                ref['document']['title'].replace('\n', ' ').replace('\r', ' '),
                'Fragment':
                ref['fragment'].replace('\n', ' ').replace('\r', ' '),
                'URL':
                ref['document']['url'] if 'url' in ref['document'] else ''
            }
            for suffix in filter(lambda suf: key + suf in ioc_resp,
                                 key_suffixes):
                ioc_resp[key + suffix] = key_suffixes[suffix]

    def _get_all_references(self, names):
        refs = []
        seen_ids = set()
        edetails = {}
        q = {
            "instance": {
                "type": "Event",
                "limit": 25000,
                "searchtype": "scan"
            }
        }
        q['instance']['attributes'] = [[{
            "name": "Event.event_fragment",
            'string': names
        }]]
        rfids = [self.iocs[name] for name in names if self.iocs[name]]
        q['instance']['attributes'][0].append({
            "name": "entities",
            "entity": {
                "id": rfids
            }
        })
        # print len(self.iocs.keys()),
        for res in self.rfqapi.paged_query(q):
            refs.extend([
                inst for inst in res['instances'] if inst['id'] not in seen_ids
            ])
            seen_ids.update([inst['id'] for inst in res['instances']])
            edetails.update({
                eid: res['entities'][eid]
                for eid in res['entities']
                if res['entities'][eid]['type'] in self._RELATED_ENTITY_TYPES
            })
        return refs, edetails

    def _get_docs(self):
        all_docs = set()
        for names in _chunks(self.iocs.keys(), 250):
            q = {
                "instance": {
                    "type": "Event"
                },
                "output": {
                    "count": {
                        "axis": [{
                            "name": "attributes.entities",
                            "type": [self.entity_type],
                            "aspect": "name"
                        }, "document"],
                        "values": [self._INSTANCES_OR_DOCUMENTS]
                    }
                }
            }
            q['instance']['attributes'] = [[{
                "name": "Event.event_fragment",
                'string': names
            }]]
            rfids = [self.iocs[name] for name in names if self.iocs[name]]
            q['instance']['attributes'][0].append({
                "name": "entities",
                "entity": {
                    "id": rfids
                }
            })
            res = self.rfqapi.query(q)
            counts = res["counts"][0]
            if len(counts) != 0:
                for ioc in filter(lambda i: i in self.iocs, counts):
                    docids = counts[ioc].keys()
                    self.response[ioc]['DocumentIds'] = docids
                    all_docs.update(docids)
        return list(all_docs)

    def _safe_get_related_entities_from_frags(self, refs, edetails):
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        entities_to_lookup = set()
        for ref in refs:
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            entities_to_lookup.update(
                [eid for eid in related_ents if eid not in edetails])
        # print "Updating entity resolution"
        edetails.update(
            self._resolve_related_entities(list(entities_to_lookup)))
        # print "Updated related entities"
        for ref in refs:
            fragment = ref['fragment'].lower()
            # get related entities from reference
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc, related_ents)
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [
                ioc for ioc in ioc_to_rfid
                if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)
            ]
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                for ent in filter(
                        lambda eid: eid in edetails and eid != ioc_resp[
                            'RFID'], related_ents):
                    etype, name = edetails[ent]['type'], edetails[ent]['name']
                    if name not in ioc_resp['Related' + etype]:
                        ioc_resp['Related' + etype].append(name)
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _resolve_related_entities(self, eids):
        if len(eids) == 0:
            return {}
        results = {}
        for ents in _chunks(eids, 250):
            q = {"entity": {"id": ents, "limit": 1001}}
            res = self.rfqapi.query(q)
            results.update({
                eid: res['entity_details'][eid]
                for eid in res['entity_details'] if res['entity_details'][eid]
                ['type'] in self._RELATED_ENTITY_TYPES
            })
        return results

    def _safe_get_related_entities_from_docs(self, docs):
        for docids in _chunks(docs, 250):
            q = {
                "instance": {
                    "type": "Event",
                    "document": {
                        "id": docids
                    }
                },
                "output": {
                    "count": {
                        "axis": [
                            "document", {
                                "name": "attributes.entities",
                                "type": self._RELATED_ENTITY_TYPES,
                                "aspect": "all"
                            }
                        ],
                        "values": [self._INSTANCES_OR_DOCUMENTS]
                    }
                }
            }
            res = self.rfqapi.query(q)
            counts = res['counts'][0]
            for ioc in self.response:
                ioc_resp = self.response[ioc]
                for docid in filter(lambda did: did in counts,
                                    ioc_resp['DocumentIds']):
                    for asp_name in filter(lambda n: n != 'NONE',
                                           counts[docid]):
                        name, unused, etype = rf_agg_name_parser(asp_name)
                        if name == ioc: continue
                        # update related counts
                        if name not in ioc_resp['Related' + etype]:
                            ioc_resp['Related' + etype].append(name)
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            if 'DocumentIds' not in self.keys and 'DocumentIds' in ioc_resp:
                del ioc_resp['DocumentIds']
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _get_rfids(self, iocs):
        new_iocs = collections.OrderedDict()
        edetails = {}
        for names in _chunks(iocs, 250):
            if len(names) == 0: continue
            q = {
                "entity": {
                    "name": names,
                    "type": self.entity_type,
                    "limit": 501
                }
            }
            res = self.rfqapi.query(q)
            if len(res['entities']) == 0: continue
            for ent in res['entities']:
                edetails[res['entity_details'][ent]['name']] = ent
        for ioc in iocs:
            new_iocs[ioc] = edetails[ioc] if ioc in edetails else None
        return new_iocs