Example #1
0
def get_crime_data():
    """Download the crime XML feed and return it in Parker convention.

    Issues a GET request to the module-level ``url``, parses the response
    text as XML and converts the resulting tree with
    ``Parker(dict_type=dict)``.

    Returns
    -------
    The object produced by ``Parker.data`` for the parsed document.
    """
    print('Getting crime data...')
    response = requests.get(url)
    converter = Parker(dict_type=dict)
    document = fromstring(response.text)
    return converter.data(document)
Example #2
0
def get_prices(date):
    """Fetch the gzipped price document listed for *date* and convert it.

    The index page at ``URL + date + '/'`` is searched for the first
    occurrence of ``PREFIX``; the ``PATH_LEN`` characters starting at that
    position are used as the relative link of the file to download.

    Parameters
    ----------
    date : str
        Path component appended to ``URL`` to select the listing page.

    Returns
    -------
    ``None`` when ``PREFIX`` does not occur in the listing page; otherwise
    the Parker-convention conversion (plain ``dict`` based) of the
    gunzipped XML download.
    """
    listing = requests.get(URL + date + '/').text

    start = listing.find(PREFIX)
    if start < 0:
        # The listing page does not reference any matching file.
        return None

    relative_link = listing[start:start + PATH_LEN]
    download = requests.get(URL + date + '/' + relative_link)

    # The downloaded body is gzip-compressed XML.
    raw_xml = gzip.decompress(download.content)

    converter = Parker(dict_type=dict)
    document = fromstring(raw_xml)
    return converter.data(document)
Example #3
0
 def get_json_instance(node):
     """Rewrite every tag under *node* and convert the tree with Parker.

     Each element (the root first, then every descendant) has its tag
     replaced by the second value returned from
     ``self.device.convert_tag(..., dst=Tag.JSON_NAME)``.  The first
     argument of that call is the namespace previously recorded for the
     element's parent, or ``''`` when no parent is found or none was
     recorded.  The namespace returned for the element is recorded under
     ``id(element)`` for use by its children.

     NOTE(review): ``self`` is not a parameter of this function; presumably
     it is a closure defined inside a method - confirm against the
     enclosing scope.

     Returns the Parker conversion of *node* with the root preserved.
     """
     converter = Parker(xml_fromstring=_fromstring, dict_type=OrderedDict)
     ns_by_element = {}

     elements = [node]
     elements.extend(node.findall('.//'))
     for element in elements:
         # Candidate parents: elements having a child with this tag that
         # actually contain this very element among their children.
         owners = [
             candidate
             for candidate in node.findall('.//{}/..'.format(element.tag))
             if element in candidate.findall('*')
         ]

         inherited_url = ''
         if owners:
             owner_key = id(owners[0])
             if owner_key in ns_by_element:
                 inherited_url = ns_by_element[owner_key]

         ns, tag = self.device.convert_tag(inherited_url,
                                           element.tag,
                                           dst=Tag.JSON_NAME)
         ns_by_element[id(element)] = ns
         element.tag = tag

     return converter.data(node, preserve_root=True)
Example #4
0
 def get_json_instance(node):
     """Rewrite tags and text under *node*, then convert it with Parker.

     For every element yielded by ``node.iter()``:

     * the tag is replaced by the second value returned from
       ``self.device.convert_tag(..., dst=ns_spec[origin]['val_name'])``,
       whose first argument is the namespace recorded for the first
       element found to contain this one, or ``''`` if there is none or
       nothing was recorded for it;
     * the namespace returned for the element is recorded under
       ``id(element)``;
     * non-empty text is replaced by element ``[1]`` of a second
       ``convert_tag`` call made with ``src=Tag.JSON_PREFIX`` and
       ``dst=ns_spec[origin]['val_val']``.

     NOTE(review): ``self``, ``ns_spec`` and ``origin`` are not defined in
     this function; presumably they come from an enclosing scope - confirm.

     Returns the Parker conversion of *node*.
     """
     converter = Parker(xml_fromstring=_fromstring, dict_type=OrderedDict)
     ns_by_element = {}

     for element in node.iter():
         # Every element in the tree that has this element as a child.
         owners = [candidate for candidate in node.iter()
                   if element in candidate]

         inherited_url = ''
         if owners:
             owner_key = id(owners[0])
             if owner_key in ns_by_element:
                 inherited_url = ns_by_element[owner_key]

         ns, tag = self.device.convert_tag(inherited_url,
                                           element.tag,
                                           dst=ns_spec[origin]['val_name'])
         ns_by_element[id(element)] = ns
         element.tag = tag

         if element.text:
             converted = self.device.convert_tag(
                 self._url_to_prefix[ns],
                 element.text,
                 src=Tag.JSON_PREFIX,
                 dst=ns_spec[origin]['val_val'])
             element.text = converted[1]

     return converter.data(node)
Example #5
0
class Parser:
    """Class that handles the verification and parsing of DMARC
    aggregate reports.

    Attributes
    ----------
    schema : etree.XMLSchema
        The XML schema defining a DMARC aggregate report, built from
        the schema file given to the constructor.

    domains : file object
        The text file object to which a list of the domains
        encountered while parsing DMARC aggregate reports should be
        saved, or None if no such file is to be saved.  Call close()
        to release it.

    report_directory : str
        The name of the directory to which XML files containing the
        DMARC aggregate reports encountered while parsing DMARC
        aggregate reports should be saved, or None if no such files
        are to be saved.

    es_url : str
        The URL of the Elasticsearch instance where the DMARC
        aggregate reports should be written.

    es_index : str
        The index to use when writing the DMARC aggregate reports to
        Elasticsearch.

    es_region : str
        The AWS region where the Elasticsearch instance is located.

    parker : xmljson.Parker
        Converts XML to JSON using the Parker convention.  Since the
        aggregate report XSD does not define any attributes we can use
        this convention to simplify the JSON without losing any
        information.

    api_headers : dict
        The Dmarcian API authentication header, or None if no API
        token was supplied.
    """

    # The URL for the Dmarcian API call that retrieves the bulk
    # mail-sending organization (if any) associated with an IP.
    __DmarcianApiUrl = 'https://dmarcian.com/api/v1/find/source/{}'

    # The name of the authentication header required by the Dmarcian API
    __DmarcianHeaderName = 'Authorization'

    # The value of the authentication header required by the Dmarcian API
    __DmarcianHeaderValue = 'Token {}'

    # The timeout in seconds to use when retrieving API data
    __Timeout = 300

    # The payload to use when creating the Elasticsearch index where
    # DMARC aggregate reports are stored.
    __IndexPayload = {
        'mappings': {
            '_doc': {
                'properties': {
                    'policy_published': {
                        'properties': {
                            'adkim': {'type': 'text'},
                            'aspf': {'type': 'text'},
                            'domain': {'type': 'text'},
                            'fo': {'type': 'long'},
                            'p': {'type': 'text'},
                            'pct': {'type': 'long'},
                            'sp': {'type': 'text'}
                        }
                    },
                    'record': {
                        'properties': {
                            'auth_results': {
                                'properties': {
                                    'dkim': {
                                        'properties': {
                                            'domain': {'type': 'text'},
                                            'human_result': {'type': 'text'},
                                            'result': {'type': 'text'},
                                            'selector': {'type': 'text'}
                                        }
                                    },
                                    'spf': {
                                        'properties': {
                                            'domain': {'type': 'text'},
                                            'result': {'type': 'text'},
                                            'scope': {'type': 'text'}
                                        }
                                    }
                                }
                            },
                            'identifiers': {
                                'properties': {
                                    'envelope_from': {'type': 'text'},
                                    'envelope_to': {'type': 'text'},
                                    'header_from': {'type': 'text'}
                                }
                            },
                            'row': {
                                'properties': {
                                    'count': {'type': 'long'},
                                    'policy_evaluated': {
                                        'properties': {
                                            'disposition': {'type': 'text'},
                                            'dkim': {'type': 'text'},
                                            'reason': {
                                                'properties': {
                                                    'comment': {'type': 'text'},
                                                    'type': {'type': 'text'}
                                                }
                                            },
                                            'spf': {'type': 'text'}
                                        }
                                    },
                                    'source_ip': {'type': 'text'}
                                }
                            }
                        }
                    },
                    'report_metadata': {
                        'properties': {
                            'date_range': {
                                'properties': {
                                    'begin': {'type': 'long'},
                                    'end': {'type': 'long'}
                                }
                            },
                            'email': {'type': 'text'},
                            'error': {'type': 'text'},
                            'extra_contact_info': {'type': 'text'},
                            'org_name': {'type': 'text'},
                            'report_id': {'type': 'text'}
                        }
                    },
                    'version': {'type': 'float'}
                }
            }
        }
    }

    def __init__(self, schema_file, domain_file=None, report_directory=None,
                 es_url=None, es_index=None, es_region=None, api_token=None):
        """Construct a Parser instance.

        Parameters
        ----------
        schema_file : str
            The name of the file containing the XML schema defining a DMARC
            aggregate report.

        domain_file : str
            The name of the file to which a list of the domains
            encountered while parsing DMARC aggregate reports should
            be saved, or None if no such file is to be saved.  The
            file is opened for writing immediately and stays open
            until close() is called.

        report_directory : str
            The name of the directory to which XML files containing
            the DMARC aggregate reports encountered while parsing
            DMARC aggregate reports should be saved, or None if no
            such files are to be saved.

        es_url : str
            A URL corresponding to an AWS Elasticsearch instance where
            DMARC aggregate reports should be written.

        es_index : str
            The index to use when writing the DMARC aggregate reports
            to Elasticsearch.

        es_region : str
            The AWS region where the Elasticsearch instance is
            located.

        api_token : str
            The Dmarcian API token, or None if the Dmarcian API
            should not be queried.
        """
        self.schema = etree.XMLSchema(file=schema_file)

        if domain_file is not None:
            self.domains = open(domain_file, 'w')
        else:
            self.domains = None

        self.report_directory = report_directory

        self.es_url = es_url
        self.es_index = es_index
        self.es_region = es_region

        # We don't care about order of dictionary elements here, so we can use
        # a simple dict instead of the default OrderedDict
        self.parker = Parker(dict_type=dict)

        if api_token is not None:
            self.api_headers = {
                Parser.__DmarcianHeaderName: Parser.__DmarcianHeaderValue.format(api_token)
            }
        else:
            self.api_headers = None

    def close(self):
        """Close the domains file, if one is open.

        Safe to call more than once.  After the call self.domains is
        None, so no further domains are written.
        """
        if self.domains is not None:
            self.domains.close()
            self.domains = None

    def pp_validation_error(self, tree):
        """Pretty-print a validation error to the error log.

        Logs the schema's error log followed by the serialized XML,
        one numbered line per log record.

        Parameters
        ----------
        tree : etree.Element
            The XML element that caused the error.
        """
        logging.error(self.schema.error_log)
        line_num = 2  # Dunno, it lines up with error messages
        for line in etree.tostring(tree).decode().splitlines():
            logging.error('{}\t{}'.format(line_num, line))
            line_num += 1

    def process_message(self, message):
        """Process a (possibly multipart) email message containing one
        or more DMARC aggregate reports.

        Parameters
        ----------
        message : email.message.EmailMessage
            The email message to be processed.

        Returns
        -------
        bool: True if the message was parsed successfully and False
        otherwise.
        """
        # The binascii.Error and AssertionError that appear below are raised if
        # the payload contains a non-base64 digit.  We'll catch the exceptions
        # here since we want to process any other message parts, but we'll log
        # them and set success to False so that the message isn't deleted.
        success = True
        if message.is_multipart():
            # Loop through message parts
            for part in message.get_payload():
                try:
                    success &= self.process_payload(part.get_content_type(),
                                                    part.get_payload(decode=True))
                except (binascii.Error, AssertionError):
                    # logging.exception records the active exception; the
                    # message has no %-placeholder, so the exception must not
                    # be passed as a formatting argument.
                    logging.exception('Unable to process a multipart message payload')
                    success = False
                    continue
        else:
            # This isn't a multipart message
            try:
                success = self.process_payload(message.get_content_type(),
                                               message.get_payload(decode=True))
            except (binascii.Error, AssertionError):
                logging.exception('Unable to process a non-multipart message payload')
                success = False

        return success

    def process_payload(self, content_type, payload):
        """Process a (possibly compressed) payload containing an DMARC
        aggregate report.

        The payload is decoded, patched, parsed and validated against
        the schema.  A valid report is then optionally recorded in the
        domains file and the report directory, converted to JSON,
        annotated with the Dmarcian affiliation of each source IP, and
        optionally written to Elasticsearch.

        Parameters
        ----------
        content_type : str
            The content type of the payload.

        payload : bytes
            The (possibly compressed) payload, or None.  A None
            payload is skipped and reported as a success.

        Returns
        -------
        bool: True if the payload was parsed successfully and False
        otherwise.  A failed Dmarcian lookup or a failed Elasticsearch
        write also yields False.
        """
        success = True
        if payload is not None:
            decoded_payload = decode_payload(content_type, payload)
            if decoded_payload is not None:
                patched_payload = patch_xml(decoded_payload)
                tree = None
                try:
                    tree = parse_payload(patched_payload)
                except etree.XMLSyntaxError as e:
                    pp_parse_error(patched_payload, e)
                    success = False

                if tree is not None:
                    valid = self.schema.validate(tree)
                    if valid:
                        logging.debug('RUA payload passed schema validation')
                        logging.debug('Report XML is: {}'.format(pp(tree)))
                        domain = tree.find('policy_published').find('domain').text
                        logging.info('Received a report for {}'.format(domain))

                        # Write the domain to the domains file if necessary
                        if self.domains is not None:
                            self.domains.write('{}\n'.format(domain))

                        # Write the report to the report directory if necessary
                        if self.report_directory is not None:
                            report_id = tree.find('report_metadata').find('report_id').text
                            # NOTE(review): report_id comes from the report
                            # itself and is used unescaped in the file path;
                            # consider sanitizing it (e.g. path separators).
                            with open('{}/{}.xml'.format(self.report_directory, report_id), 'w') as report_file:
                                report_file.write(etree.tostring(tree, pretty_print=True).decode())

                        # Convert the XML to JSON
                        jsn = self.parker.data(tree)

                        # Find the bulk mail-sending organizations (if any)
                        # associated with the IPs in the report.
                        #
                        # jsn['record'] can be a list if there are multiple
                        # record tags in the XML, or a dict if there is only a
                        # single record tag.  Parser.listify() will make sure
                        # that we have a list here.
                        for record in Parser.listify(jsn['record']):
                            if self.api_headers is not None:
                                ip = record['row']['source_ip']
                                url = Parser.__DmarcianApiUrl.format(ip)
                                try:
                                    response = requests.get(url,
                                                            headers=self.api_headers,
                                                            timeout=Parser.__Timeout)
                                    # Raises an exception if we got back an
                                    # HTTP error status
                                    response.raise_for_status()
                                    record['row']['source_ip_affiliation'] = response.json()[ip]
                                except requests.exceptions.RequestException:
                                    logging.exception('Unable to use the Dmarcian API to determine the affiliation of source IP {}'.format(ip))
                                    # We can't query the Dmarcian API because
                                    # of an error, so just add an empty entry
                                    record['row']['source_ip_affiliation'] = None
                                    success = False
                            else:
                                # We can't query the Dmarcian API because we
                                # don't have a token, so just add an empty
                                # entry
                                logging.debug('json is: {}'.format(jsn))
                                logging.debug('record is: {}'.format(record))
                                record['row']['source_ip_affiliation'] = None

                        # Write the report to Elasticsearch if necessary
                        if (self.es_url is not None) and (self.es_region is not None) and (self.es_index is not None):
                            credentials = boto3.Session().get_credentials()
                            awsauth = AWS4Auth(credentials.access_key,
                                               credentials.secret_key,
                                               self.es_region,
                                               'es',
                                               session_token=credentials.token)

                            # Check if the index exists and create it
                            # if necessary
                            index_only_url = '{}/{}'.format(self.es_url,
                                                            self.es_index)
                            response = requests.head(index_only_url,
                                                     auth=awsauth,
                                                     timeout=Parser.__Timeout)
                            if response.status_code != 200:
                                logging.info('The index {} does not exist.  Creating it.'.format(self.es_index))
                                try:
                                    response = requests.put(index_only_url,
                                                            auth=awsauth,
                                                            json=Parser.__IndexPayload,
                                                            headers={'Content-Type': 'application/json'},
                                                            timeout=Parser.__Timeout)
                                    # Raises an exception if we got back an
                                    # HTTP error status
                                    response.raise_for_status()
                                except requests.exceptions.RequestException:
                                    logging.exception('Unable to create the index {}.'.format(self.es_index))
                                    # Without the index the report cannot be
                                    # saved, so give up on this payload.
                                    return False

                            # Now save the report
                            full_url = '{}/_doc'.format(index_only_url)
                            try:
                                response = requests.post(full_url,
                                                         auth=awsauth,
                                                         json=jsn,
                                                         headers={'Content-Type': 'application/json'},
                                                         timeout=Parser.__Timeout)
                                # Raises an exception if we got back an HTTP
                                # error status
                                response.raise_for_status()
                            except requests.exceptions.RequestException:
                                logging.exception('Unable to save the DMARC aggregate report to Elasticsearch')
                                success = False
                    else:
                        logging.error('RUA payload failed schema validation')
                        self.pp_validation_error(tree)
                        success = False
                else:
                    logging.error('RUA payload failed XML parsing')
                    success = False

        return success

    @staticmethod
    def listify(x):
        """If x is a list then just return it.  If x is a dict then
        return a list with x as the sole item.

        Parameters
        ----------
        x : list, dict
            The list or dict to be listified.

        Returns
        -------
        list: x if x is a list.  If x is a dict then returns a list
        with x as the sole item.  Any other value is returned
        unchanged.
        """
        if isinstance(x, dict):
            return [x]

        return x
def xml_to_json(data):
    """Serialize an XML document as a JSON string in Parker convention.

    Parameters
    ----------
    data
        The XML document, in whatever form ``fromstring`` accepts.

    Returns
    -------
    str: ``json.dumps`` of the ``OrderedDict``-based Parker conversion of
    the parsed document.
    """
    converter = Parker(dict_type=OrderedDict)
    parsed = converter.data(fromstring(data))
    return json.dumps(parsed)