def parse(self, file_path, provider=None):
    try:
        item = {
            'guid': '{}-{}'.format(file_path, uuid.uuid4()),
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, 'r', encoding='windows-1252') as f:
            data = f.read().replace('\r', '')

        # The file is header, dateline and body separated by blank lines
        header, dateline_data, body_data = data.split('\n\n', 2)
        self._process_header(item, header)

        # Everything from the 'MEDIA RELEASE ' marker onwards is the body
        start_of_body = 'MEDIA RELEASE '
        source, data = data.split(start_of_body, 1)
        data = start_of_body + data

        item['anpa_category'] = [{'qcode': 'j'}]
        item['original_source'] = 'AsiaNet'

        # Blank lines delimit paragraphs; remaining newlines collapse to spaces
        body_html = to_ascii(html.escape(data)).replace('\n\n', '</p><p>').replace('\n', ' ')
        item['body_html'] = '<p>' + body_html + '</p>'
        item['word_count'] = get_word_count(item['body_html'])

        return item
    except Exception as e:
        raise AAPParserError.AsiaNetParserError(file_path, e)
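# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): how the two splits above carve
# up a Medianet file. The payload below is fabricated for demonstration only.
def _sketch_medianet_body_split():
    import html

    data = ('Medianet header line one\nheader line two\n\n'
            'SYDNEY, June 6 AAP Medianet\n\n'
            'MEDIA RELEASE Example Corp\nFirst paragraph.\n\nSecond paragraph.')

    header, dateline_data, body_data = data.split('\n\n', 2)
    source, rest = data.split('MEDIA RELEASE ', 1)
    body = 'MEDIA RELEASE ' + rest

    # Blank lines become paragraph breaks; single newlines collapse to spaces
    return '<p>' + html.escape(body).replace('\n\n', '</p><p>').replace('\n', ' ') + '</p>'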
def parse(self, filename, provider=None):
    try:
        item = {}
        self.set_item_defaults(item, filename)

        with open(filename, 'r', encoding='windows-1252') as f:
            # read the whole file into a single string
            data = f.read()

        # Construct the regular expression pattern: a free leading line, then
        # one capture group per entry in field_list
        pattern = '(.*)\n'
        for field in self.field_list:
            pattern = pattern + field[0] + '(.*)\n'

        m = re.match(pattern, data, re.MULTILINE | re.DOTALL)
        if m:
            for field in self.field_list:
                if field[1] is not None:
                    item[field[1]] = m.group(field[2])

            # fix the formatting
            item[self.ITEM_VERSION_CREATED] = self.datetime(item[self.ITEM_VERSION_CREATED])
            item[self.ITEM_BODY_HTML] = '<p>' \
                + html.escape(item[self.ITEM_BODY_HTML].strip()).replace('\n', '</p><p>') \
                + '</p>'
            item.setdefault('word_count', get_word_count(item['body_html']))

        return item
    except Exception as ex:
        raise AAPParserError.NewsBitesParserError(exception=ex, provider=provider)
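# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): how the pattern assembly above
# works. The (label, item field, match group) shape of field_list and the
# sample payload are assumptions for demonstration only.
def _sketch_newsbites_pattern():
    import re

    field_list = [('DATE:', 'versioncreated', 2),
                  ('HEADLINE:', 'headline', 3),
                  ('TEXT:', 'body_html', 4)]

    pattern = '(.*)\n'
    for field in field_list:
        pattern = pattern + field[0] + '(.*)\n'

    sample = ('NEWS BITES\n'
              'DATE:06 June 2016 14:00:00\n'
              'HEADLINE:Example headline\n'
              'TEXT:Body line one.\nBody line two.\n')

    # DOTALL lets the final group span newlines, so multi-line bodies match
    m = re.match(pattern, sample, re.MULTILINE | re.DOTALL)
    return {field[1]: m.group(field[2]) for field in field_list if field[1]}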
def parse(self, file_path, provider=None):
    try:
        item = {
            'guid': '{}-{}'.format(file_path, uuid.uuid4()),
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.PRESERVED,
        }

        with open(file_path, 'r', encoding='windows-1252') as f:
            data = f.read().replace('\r', '')

        header, dateline_data, data = data.split('\n\n', 2)
        self._process_header(item, header)
        self._process_dateline(item, dateline_data)

        item['original_source'] = 'AsiaNet'
        item['word_count'] = get_text_word_count(data)
        item['body_html'] = '<pre>' + html.escape(data) + '</pre>'

        return item
    except Exception as e:
        raise AAPParserError.AsiaNetParserError(file_path, e)
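# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): preserved (tabular) content
# keeps its whitespace, so it is escaped and wrapped in <pre> rather than
# being converted to paragraphs. The sample data is fabricated.
def _sketch_preserved_body():
    import html

    data = 'Race 1  Field & Form\n 1. Example Runner   $2.50'
    return '<pre>' + html.escape(data) + '</pre>'
    # -> '<pre>Race 1  Field &amp; Form\n 1. Example Runner   $2.50</pre>'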
def parse(self, filename, provider=None):
    try:
        item = {}
        self.set_item_defaults(item, provider)

        with open(filename, 'r', encoding='latin-1') as f:
            lines = f.readlines()

        header = False
        body = False
        for line in lines:
            if self.START_OF_MESSAGE in line and not header:
                item['guid'] = filename + str(uuid.uuid4())
                header = True
                continue
            if header:
                if line == '\n':
                    continue
                # A single control character at the start of a line selects
                # the destination field for the rest of the line
                if line[0] in self.header_map:
                    if self.header_map[line[0]]:
                        item[self.header_map[line[0]]] = line[1:-1]
                    continue
                if line[0] == self.CATEGORY:
                    item[self.ITEM_ANPA_CATEGORY] = [{'qcode': line[1]}]
                    continue
                if line[0] == self.FORMAT:
                    if line[1] == self.TEXT:
                        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                        continue
                    if line[1] == self.TABULAR:
                        item[FORMAT] = FORMATS.PRESERVED
                        continue
                    continue
                if line[0] == self.GENRE:
                    genre = line[1:-1]
                    if genre:
                        genre_map = get_resource_service('vocabularies').find_one(req=None, _id='genre')
                        item['genre'] = [x for x in genre_map.get('items', [])
                                         if x['qcode'] == genre and x['is_active']]
                    continue
                if line[0] == self.IPTC:
                    iptc_code = line[1:-1]
                    if iptc_code.isdigit():
                        item[self.ITEM_SUBJECT] = [{'qcode': iptc_code, 'name': subject_codes[iptc_code]}]
                    continue
                # First line that is not a recognised header starts the body
                header = False
                body = True
                item['body_html'] = line
            else:
                if self.END_OF_MESSAGE in line:
                    break
                if body:
                    item['body_html'] = item.get('body_html', '') + line

        if item.get(FORMAT) == FORMATS.PRESERVED:
            item['body_html'] = '<pre>' + html.escape(item['body_html']) + '</pre>'

        return self.post_process_item(item, provider)
    except Exception as ex:
        raise AAPParserError.ZCZCParserError(exception=ex, provider=provider)
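# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): the single-character dispatch
# the header loop above relies on. The control characters and mapping below
# are fabricated; the real parser defines its own header_map and constants.
def _sketch_zczc_header_dispatch():
    header_map = {'%': None,           # recognised but ignored
                  '^': 'headline',
                  '=': 'anpa_take_key'}

    item = {}
    for line in ['^Racing fields\n', '=Take key\n', '%ignored\n']:
        target = header_map.get(line[0])
        if target:
            item[target] = line[1:-1]  # strip control char and trailing newline
    return item  # {'headline': 'Racing fields', 'anpa_take_key': 'Take key'}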
        item['body_html'] = '<pre>' + '\n'.join(lines[lines_to_remove:])

        # if the concatenation of the slugline and take key contain the phrase
        # 'Brief Form' change the category to h
        if (item.get(self.ITEM_SLUGLINE, '') + item.get(self.ITEM_TAKE_KEY, '')).lower().find('brief form') >= 0:
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]

        # Another exception
        if 'NZ/AUST FIELDS' in item.get('body_html', ''):
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]

        # if the item has been marked as convert to HTML then we need to use
        # the racing reformat macro to convert it.
        if lines[0] and lines[0].find('HH ') != -1:
            racing_reformat_macro(item)

        genre_map = get_resource_service('vocabularies').find_one(req=None, _id='genre')
        if genre_map:
            item['genre'] = [x for x in genre_map.get('items', [])
                             if x['qcode'] == 'Racing Data' and x['is_active']]

        return item
    except Exception as ex:
        logger.exception(ex)


try:
    register_feed_parser(ZCZCRacingParser.NAME, ZCZCRacingParser())
except AlreadyExistsError:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
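# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): the two category overrides
# above reduce to one predicate. Plain string arguments stand in for the
# item fields here, for demonstration only.
def _sketch_is_brief_form(slugline, take_key, body_html=''):
    combined = (slugline + take_key).lower()
    # e.g. _sketch_is_brief_form('Racing', ' Brief Form results') -> True
    return 'brief form' in combined or 'NZ/AUST FIELDS' in body_html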
        'qcode': '04000000',
        'name': subject_codes['04000000']
    }]
    item[FORMAT] = FORMATS.HTML

def datetime(self, string):
    """
    Convert the date string parsed from the source file to a datetime,
    assumes that the time is local to Sydney Australia
    :param string:
    :return:
    """
    # 06 June 2016 14:00:00
    try:
        local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
    except ValueError:
        local_dt = datetime.datetime.strptime(string, '%d %b %Y %H:%M:%S')
    local_tz = pytz.timezone('Australia/Sydney')
    aus_dt = local_tz.localize(local_dt, is_dst=None)
    return aus_dt.astimezone(pytz.utc)

try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass
register_feeding_service_error('file', AAPParserError.NewsBitesParserError().get_error_description())
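# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): the Sydney-to-UTC conversion
# above, on a fabricated timestamp. June is outside AEDT, so the offset is +10.
def _sketch_sydney_to_utc():
    import datetime
    import pytz

    local_dt = datetime.datetime.strptime('06 June 2016 14:00:00', '%d %B %Y %H:%M:%S')
    aus_dt = pytz.timezone('Australia/Sydney').localize(local_dt, is_dst=None)
    return aus_dt.astimezone(pytz.utc)  # 2016-06-06 04:00:00+00:00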
    :param dict item: The item where the data will be stored
    :param str header: The header of the file
    """
    source = 'anpa_take_key'
    for line in header.split('\n'):
        if line.lower().startswith('media release'):
            break
        if source not in item:
            item[source] = line
        else:
            item[source] += line

    # Clean up the header entries
    item['anpa_take_key'] = item['anpa_take_key'][8:].replace('\n', '').strip()
    item['headline'] = 'Media Release: ' + item.get('anpa_take_key', '')
    item['slugline'] = 'AAP Medianet'
    self._truncate_headers(item)

try:
    register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser())
except AlreadyExistsError:
    pass
register_feeding_service_error('file', AAPParserError.AsiaNetParserError().get_error_description())
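# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): header lines accumulate into
# anpa_take_key until a 'Media Release' line is reached, then an assumed
# 8-character label prefix is stripped. The header text is fabricated.
def _sketch_process_header():
    item = {}
    header = 'AsiaNet Example Corp announces results\nMedia Release from Example Corp'
    for line in header.split('\n'):
        if line.lower().startswith('media release'):
            break
        item['anpa_take_key'] = item.get('anpa_take_key', '') + line

    # 'AsiaNet ' is exactly 8 characters, so [8:] drops the label
    item['anpa_take_key'] = item['anpa_take_key'][8:].replace('\n', '').strip()
    item['headline'] = 'Media Release: ' + item['anpa_take_key']
    return item  # headline: 'Media Release: Example Corp announces results'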
def parse(self, filename, provider=None):
    """
    Attempt to parse the file and return the item
    :param filename:
    :param provider:
    :return:
    """
    try:
        with open(filename, 'rb') as f:
            lines = f.readlines()

        item = {
            'guid': filename + '-' + str(uuid.uuid4()),
            'urgency': 5,
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.PRESERVED
        }

        # The first line carries \x1f-separated fields: venue, race number(s),
        # correction flag, abandoned flag, state and day of the week
        m = re.match(
            b'\x01(.*)'
            b'\x1f(.*)'
            b'\x1f([Y|N])'
            b'\x1f([Y|N])'
            b'\x1f(.*)'
            b'\x1f(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)',
            lines[0], flags=re.I)
        if m:
            state = m.group(5).decode('ascii')
            item['slugline'] = titlecase(m.group(1).decode('ascii')) + ' Gallop'
            item['anpa_take_key'] = ('Result ' if '-' not in m.group(2).decode('ascii') else 'Results ') + \
                m.group(2).decode('ascii') + ' ' + self.CityMap.get(state, '')
            correction = m.group(3).decode('ascii')
            abandoned = m.group(4).decode('ascii')
            day_of_week = m.group(6).decode('ascii')
            item['headline'] = item.get('slugline', '') + ' ' + item.get('anpa_take_key', '') + ' ' + day_of_week
            # if abandoned then the city string might get shortened
            if abandoned == 'Y':
                city = self.CityMap.get(state, '')
                if state.upper() in {'NSW', 'TAS', 'NT', 'WA'}:
                    city = self.CityMap.get(state, '')[:4]
                if state.upper() in {'VIC', 'QLD', 'SA'}:
                    city = self.CityMap.get(state, '')[:5]
                # append the city to the take key
                item['anpa_take_key'] = ('Result ' if '-' not in m.group(2).decode('ascii') else 'Results ') + \
                    m.group(2).decode('ascii') + ' ' + city
                item['headline'] = item.get('slugline', '') + ' ' + item.get('anpa_take_key', '') + ' ' + day_of_week
            if correction == 'Y':
                item['headline'] = 'RPTG CRTG ' + item.get('headline', '')
            else:
                if abandoned == 'Y':
                    item['anpa_take_key'] = item.get('anpa_take_key', '') + ' ABANDONED'
                    item['headline'] = item.get('headline', '') + ' ABANDONED'
        else:
            raise AAPParserError.PDAResulstParserError()

        item['body_html'] = '<pre>' + b'\n'.join(lines[1:]).decode('ascii') + '</pre>'
        # remove the sign off as received, it will get put back on when published
        if item.get('body_html', '').find('AAP RESULTS') != -1:
            item['body_html'] = item.get('body_html', '').replace('AAP RESULTS', '')
            item['sign_off'] = 'RESULTS'
        item['subject'] = [{'qcode': '15030001'}]
        item['anpa_category'] = [{'qcode': 'r'}]

        genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
        if genre_map:
            item['genre'] = [x for x in genre_map.get('items', [])
                             if x['qcode'] == 'Results (sport)' and x['is_active']]

        self.truncate_fields(item)
        return item
    except Exception as ex:
        logging.exception(ex)
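# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): the \x01/\x1f framed first
# line the pattern above expects. The venue, races, flags, state and day
# below are fabricated.
def _sketch_pda_first_line():
    import re

    first = b'\x01Randwick\x1f1-5\x1fN\x1fN\x1fNSW\x1fSaturday'
    m = re.match(
        b'\x01(.*)\x1f(.*)\x1f([Y|N])\x1f([Y|N])\x1f(.*)'
        b'\x1f(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)',
        first, flags=re.I)
    return (m.group(1).decode('ascii'),   # 'Randwick' (venue)
            m.group(2).decode('ascii'),   # '1-5'      (races -> 'Results')
            m.group(5).decode('ascii'),   # 'NSW'      (state)
            m.group(6).decode('ascii'))   # 'Saturday' (day of week)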