def parse_file(self, filename):
    """Read an IPTC 7901 file and build an item dict from it.

    The first line is matched against the SOH header pattern; the lines
    up to the STX marker are collected as the slugline, the STX line
    supplies the headline, and what follows (until ETX) becomes the body
    and, after the "not for publication" marker, the editorial note.

    :param filename: path of the file to parse
    :return: dict describing the parsed story
    :raises: ParserError.IPTC7901ParserError wrapping any failure
    """
    try:
        item = {
            'type': 'preformatted',
            'guid': generate_guid(type=GUID_TAG),
            'versioncreated': utcnow(),
        }

        with open(filename, 'rb') as handle:
            lines = handle.readlines()

        # first header line: source, sequence, priority, category, word count
        header = re.match(
            b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)',
            lines[0],
            flags=re.I,
        )
        if header:
            def field(index):
                return header.group(index).decode()

            item.update({
                'original_source': field(1),
                'ingest_provider_sequence': field(2),
                'priority': self.map_priority(field(3)),
                'anpa-category': {'qcode': self.map_category(field(4))},
                'word_count': int(field(5)),
            })

        in_header, in_text, in_note = True, False, False
        for raw in lines[1:]:
            lead = raw[0:1]

            # STX starts the body; the remainder of that line is the headline
            if lead == b'\x02':
                item['headline'] = raw[1:].decode().rstrip('\r\n')
                item['body_html'] = ''
                in_text, in_header = True, False
                continue

            # ETX ends the story
            if lead == b'\x03':
                break

            text = raw.decode()

            if in_text:
                if 'The following information is not for publication' in text:
                    # everything from here on belongs to the editorial note
                    in_note, in_text = True, False
                    item['ednote'] = ''
                    continue
                item['body_html'] += text

            # deliberately not ``elif``: a further STX line seen after the
            # note marker leaves both flags set, and the line goes to both
            if in_note:
                item['ednote'] += text
                continue

            if in_header:
                item['slugline'] = item.get('slugline', '') + text.rstrip('/\r\n')

        return item
    except Exception as ex:
        raise ParserError.IPTC7901ParserError(filename, ex)
def parse_file(self, filename):
    """Read an IPTC 7901 file and build an item dict from it.

    The first line is matched against the SOH header pattern; the lines
    up to the STX marker are collected as the slugline, the STX line
    supplies the headline, and what follows (until ETX) becomes the body
    and, after the "not for publication" marker, the editorial note.

    :param filename: path of the file to parse
    :return: dict describing the parsed story
    :raises: ParserError.IPTC7901ParserError wrapping any failure
    """
    try:
        item = {
            "type": "preformatted",
            "guid": generate_guid(type=GUID_TAG),
            "versioncreated": utcnow(),
        }

        with open(filename, "rb") as handle:
            lines = handle.readlines()

        # first header line: source, sequence, priority, category, word count
        header = re.match(
            b"\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)",
            lines[0],
            flags=re.I,
        )
        if header:
            def field(index):
                return header.group(index).decode()

            item.update({
                "original_source": field(1),
                "ingest_provider_sequence": field(2),
                "priority": self.map_priority(field(3)),
                "anpa_category": [{"qcode": self.map_category(field(4))}],
                "word_count": int(field(5)),
            })

        in_header, in_text, in_note = True, False, False
        for raw in lines[1:]:
            lead = raw[0:1]

            # STX starts the body; the remainder of that line is the headline
            if lead == b"\x02":
                item["headline"] = raw[1:].decode().rstrip("\r\n")
                item["body_html"] = ""
                in_text, in_header = True, False
                continue

            # ETX ends the story
            if lead == b"\x03":
                break

            text = raw.decode()

            if in_text:
                if "The following information is not for publication" in text:
                    # everything from here on belongs to the editorial note
                    in_note, in_text = True, False
                    item["ednote"] = ""
                    continue
                item["body_html"] += text

            # deliberately not ``elif``: a further STX line seen after the
            # note marker leaves both flags set, and the line goes to both
            if in_note:
                item["ednote"] += text
                continue

            if in_header:
                item["slugline"] = item.get("slugline", "") + text.rstrip("/\r\n")

        return item
    except Exception as ex:
        raise ParserError.IPTC7901ParserError(filename, ex)
def parse_email(self, data, provider):
    """Convert fetched email data into a list of ingest items.

    Each tuple in ``data`` is parsed as a raw message (its second element
    is handed to ``email.message_from_bytes``).  The message's text or
    HTML part becomes the body item.  Every named image attachment other
    than GIF/PNG is stored through ``self.parser_app.media.put`` and
    becomes a picture item; when at least one picture is stored a
    composite item referencing the body and the pictures is added too.

    :param data: iterable of response parts; non-tuple entries are ignored
    :param provider: passed through to ``IngestEmailError.emailParseError``
    :return: list of item dicts - picture items first, then the composite
        item (if any), then the body item
    :raises: the error built by ``IngestEmailError.emailParseError`` for
        any failure during parsing
    """
    try:
        new_items = []
        # the item holding the body text of the email, either text or html
        item = {'type': 'text', 'versioncreated': utcnow()}
        comp_item = None
        # references to the body item and to each stored attachment
        refs = []
        html_body = None
        text_body = None

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            item['headline'] = self.parse_header(msg['subject'])
            item['original_creator'] = self.parse_header(msg['from'])
            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                item['firstcreated'] = dt.replace(tzinfo=timezone('utc'))

            # walk every part of the (possibly multipart) message
            for part in msg.walk():
                part_type = part.get_content_type()

                if part_type == "text/plain":
                    body = part.get_payload(decode=True)
                    try:
                        charset = part.get_content_charset()
                        # if we don't know the charset just have a go!
                        text_body = body.decode() if charset is None else body.decode(charset)
                    except Exception:
                        # logger.exception records the active traceback itself;
                        # the exception must not be passed as a format argument
                        logger.exception("Exception parsing text body for %s from %s",
                                         item['headline'], item['original_creator'])
                    continue

                if part_type == "text/html":
                    body = part.get_payload(decode=True)
                    try:
                        charset = part.get_content_charset()
                        decoded = body.decode() if charset is None else body.decode(charset)
                        # assign only the safe_html result so that a failure in
                        # safe_html never leaves the unprocessed markup behind
                        # NOTE(review): presumably safe_html sanitises - confirm
                        html_body = self.safe_html(decoded)
                    except Exception:
                        logger.exception("Exception parsing text html for %s from %s",
                                         item['headline'], item['original_creator'])
                    continue

                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                # we are only going to pull off image attachments at this stage
                if part.get_content_maintype() != 'image':
                    continue

                fileName = part.get_filename()
                if not fileName:
                    continue

                content = io.BytesIO(part.get_payload(decode=True))
                _, content_type, metadata = process_file_from_stream(content, part.get_content_type())
                if content_type in ('image/gif', 'image/png'):
                    continue
                # rewind before storing the stream
                content.seek(0)
                image_id = self.parser_app.media.put(content, filename=fileName,
                                                     content_type=content_type, metadata=metadata)
                renditions = {'baseImage': {'href': image_id}}

                # if we have not got a composite item then create one
                if not comp_item:
                    comp_item = {
                        'type': 'composite',
                        'guid': generate_guid(type=GUID_TAG),
                        'versioncreated': utcnow(),
                        'groups': [],
                        'headline': item['headline'],
                    }
                    # reference to the item that stores the body of the email
                    refs.append({
                        'guid': item['guid'],
                        'residRef': item['guid'],
                        'headline': item['headline'],
                        'location': 'ingest',
                        'itemClass': 'icls:text',
                    })

                media_item = {
                    'guid': generate_guid(type=GUID_TAG),
                    'versioncreated': utcnow(),
                    'type': 'picture',
                    'renditions': renditions,
                    'mimetype': content_type,
                    'filemeta': metadata,
                    'slugline': fileName,
                }
                if text_body is not None:
                    media_item['body_html'] = text_body
                media_item['headline'] = item['headline']
                new_items.append(media_item)

                # add a reference to this picture in the composite item
                refs.append({
                    'guid': media_item['guid'],
                    'residRef': media_item['guid'],
                    'headline': fileName,
                    'location': 'ingest',
                    'itemClass': 'icls:picture',
                })

        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = text_body
            item['type'] = 'preformatted'

        # if there is a composite item then add the root and main groups
        if comp_item:
            comp_item['groups'].append({'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'})
            comp_item['groups'].append({'refs': refs, 'id': 'main', 'role': 'grpRole:Main'})
            new_items.append(comp_item)

        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider) from ex
def parse_email(self, data, provider):
    """Convert fetched email data into a list of ingest items.

    Each tuple in ``data`` is parsed as a raw message (its second element
    is handed to ``email.message_from_bytes``).  The message's text or
    HTML part becomes the body item.  Every named image attachment other
    than GIF/PNG is stored through ``self.parser_app.media.put`` and
    becomes a picture item; when at least one picture is stored a
    composite item referencing the body and the pictures is added too.

    :param data: iterable of response parts; non-tuple entries are ignored
    :param provider: passed through to ``IngestEmailError.emailParseError``
    :return: list of item dicts - picture items first, then the composite
        item (if any), then the body item
    :raises: the error built by ``IngestEmailError.emailParseError`` for
        any failure during parsing
    """
    try:
        new_items = []
        # the item holding the body text of the email, either text or html
        item = {'type': 'text', 'versioncreated': utcnow()}
        comp_item = None
        # references to the body item and to each stored attachment
        refs = []
        html_body = None
        text_body = None

        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            item['headline'] = self.parse_header(msg['subject'])
            item['original_creator'] = self.parse_header(msg['from'])
            item['guid'] = msg['Message-ID']
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            if date_tuple:
                dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                item['firstcreated'] = dt.replace(tzinfo=timezone('utc'))

            # walk every part of the (possibly multipart) message
            for part in msg.walk():
                part_type = part.get_content_type()

                if part_type == "text/plain":
                    body = part.get_payload(decode=True)
                    try:
                        charset = part.get_content_charset()
                        # if we don't know the charset just have a go!
                        text_body = body.decode() if charset is None else body.decode(charset)
                    except Exception:
                        # logger.exception records the active traceback itself;
                        # the exception must not be passed as a format argument
                        logger.exception("Exception parsing text body for %s from %s",
                                         item['headline'], item['original_creator'])
                    continue

                if part_type == "text/html":
                    body = part.get_payload(decode=True)
                    try:
                        charset = part.get_content_charset()
                        decoded = body.decode() if charset is None else body.decode(charset)
                        # assign only the safe_html result so that a failure in
                        # safe_html never leaves the unprocessed markup behind
                        # NOTE(review): presumably safe_html sanitises - confirm
                        html_body = self.safe_html(decoded)
                    except Exception:
                        logger.exception("Exception parsing text html for %s from %s",
                                         item['headline'], item['original_creator'])
                    continue

                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                # we are only going to pull off image attachments at this stage
                if part.get_content_maintype() != 'image':
                    continue

                fileName = part.get_filename()
                if not fileName:
                    continue

                content = io.BytesIO(part.get_payload(decode=True))
                _, content_type, metadata = process_file_from_stream(content, part.get_content_type())
                if content_type in ('image/gif', 'image/png'):
                    continue
                # rewind before storing the stream
                content.seek(0)
                image_id = self.parser_app.media.put(content, filename=fileName,
                                                     content_type=content_type, metadata=metadata)
                renditions = {'baseImage': {'href': image_id}}

                # if we have not got a composite item then create one
                if not comp_item:
                    comp_item = {
                        'type': 'composite',
                        'guid': generate_guid(type=GUID_TAG),
                        'versioncreated': utcnow(),
                        'groups': [],
                        'headline': item['headline'],
                    }
                    # reference to the item that stores the body of the email
                    refs.append({
                        'guid': item['guid'],
                        'residRef': item['guid'],
                        'headline': item['headline'],
                        'location': 'ingest',
                        'itemClass': 'icls:text',
                    })

                media_item = {
                    'guid': generate_guid(type=GUID_TAG),
                    'versioncreated': utcnow(),
                    'type': 'picture',
                    'renditions': renditions,
                    'mimetype': content_type,
                    'filemeta': metadata,
                    'slugline': fileName,
                }
                if text_body is not None:
                    media_item['body_html'] = text_body
                media_item['headline'] = item['headline']
                new_items.append(media_item)

                # add a reference to this picture in the composite item
                refs.append({
                    'guid': media_item['guid'],
                    'residRef': media_item['guid'],
                    'headline': fileName,
                    'location': 'ingest',
                    'itemClass': 'icls:picture',
                })

        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = text_body
            item['type'] = 'preformatted'

        # if there is a composite item then add the root and main groups
        if comp_item:
            comp_item['groups'].append({'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'})
            comp_item['groups'].append({'refs': refs, 'id': 'main', 'role': 'grpRole:Main'})
            new_items.append(comp_item)

        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider) from ex
def parse_file(self, filename):
    """Read an IPTC 7901 file and build an item dict from it.

    The first line is matched against the SOH header pattern; the lines
    up to the STX marker are collected as the slugline, the STX line
    supplies the headline, and what follows (until ETX) becomes the body
    and, after the "not for publication" marker, the editorial note.

    :param filename: path of the file to parse
    :return: dict describing the parsed story
    :raises: ParserError.IPTC7901ParserError wrapping any failure
    """
    try:
        item = {
            'type': 'preformatted',
            'guid': generate_guid(type=GUID_TAG),
            'versioncreated': utcnow(),
        }

        with open(filename, 'rb') as handle:
            lines = handle.readlines()

        # first header line: source, sequence, priority, category, word count
        header = re.match(
            b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)',
            lines[0],
            flags=re.I,
        )
        if header:
            def field(index):
                return header.group(index).decode()

            item.update({
                'original_source': field(1),
                'ingest_provider_sequence': field(2),
                'priority': self.map_priority(field(3)),
                'anpa-category': {'qcode': self.map_category(field(4))},
                'word_count': int(field(5)),
            })

        in_header, in_text, in_note = True, False, False
        for raw in lines[1:]:
            lead = raw[0:1]

            # STX starts the body; the remainder of that line is the headline
            if lead == b'\x02':
                item['headline'] = raw[1:].decode().rstrip('\r\n')
                item['body_html'] = ''
                in_text, in_header = True, False
                continue

            # ETX ends the story
            if lead == b'\x03':
                break

            text = raw.decode()

            if in_text:
                if 'The following information is not for publication' in text:
                    # everything from here on belongs to the editorial note
                    in_note, in_text = True, False
                    item['ednote'] = ''
                    continue
                item['body_html'] += text

            # deliberately not ``elif``: a further STX line seen after the
            # note marker leaves both flags set, and the line goes to both
            if in_note:
                item['ednote'] += text
                continue

            if in_header:
                item['slugline'] = item.get('slugline', '') + text.rstrip('/\r\n')

        return item
    except Exception as ex:
        raise ParserError.IPTC7901ParserError(filename, ex)