def body_to_article(headline, byline, text, date, source, meta):
    """
    Create an Article-object based on given parameters.

    The medium is resolved with Medium.get_or_create(source).
    NOTE(review): older documentation stated that Medium.DoesNotExist is
    raised when the source has no entry in the database; with get_or_create
    that presumably no longer happens - confirm against Medium.

    @param headline: headline of new Article-object
    @type headline: str

    @param byline: byline for new Article
    @type byline: NoneType, str

    @param text: text for new Article
    @type text: str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: str

    @param meta: object containing all sorts of meta-information, most of it
                 suitable for metastring. However, some information (author,
                 section, length, url) will be extracted. The dictionary
                 passed in is copied and left untouched.
    @type meta: dictionary

    @return Article-object
    """
    log.debug(
        "Creating article object for {headline!r}".format(headline=headline))

    art = Article(headline=headline, byline=byline, text=text, date=date)
    art.medium = Medium.get_or_create(source)

    # Author / Section
    # Pop from a copy so the extracted keys stay present in the caller's dict.
    meta = meta.copy()
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)

    if 'length' in meta:
        # Only the first whitespace-separated token is parsed, so anything
        # following the number is ignored.
        art.length = int(meta.pop('length').split()[0])
    else:
        # No explicit length given: use the number of spaces in the text.
        art.length = art.text.count(" ")

    if 'url' in meta:
        # Remove every whitespace character from the URL. The pattern is a
        # raw string so "\s" reaches the regex engine as written.
        url = meta.pop('url')
        art.url = re.sub(r"\s+", "", url)

    # Whatever was not extracted above ends up in the metastring.
    art.metastring = str(meta)
    return art
def body_to_article(self, headline, byline, text, date, source, meta):
    """
    Build an Article-object from the given fields and assign it to the
    project configured in self.options['project'].

    The medium is resolved with get_or_create(Medium, name=source).
    NOTE(review): older documentation stated that Medium.DoesNotExist is
    raised when the source has no entry in the database - confirm whether
    get_or_create can still raise it.

    @param headline: headline of new Article-object
    @type headline: unicode / str

    @param byline: byline for new Article
    @type byline: NoneType, unicode, str

    @param text: text for new Article
    @type text: unicode / str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: unicode / str

    @param meta: dictionary with meta-information. 'author', 'section' and
                 'length' are extracted from a copy of it; the remainder is
                 stored as the metastring.
    @type meta: dictionary

    @return Article-object
    """
    message = "Creating article object for {headline!r}".format(
        headline=headline)
    log.debug(message)

    art = Article(headline=headline, byline=byline, text=text, date=date)
    art.medium = get_or_create(Medium, name=source)

    # Author / Section: taken out of a private copy of the meta dictionary.
    leftover = meta.copy()
    art.author = leftover.pop('author', None)
    art.section = leftover.pop('section', None)

    # Length: first token of the 'length' entry when present, otherwise the
    # number of spaces in the article text.
    art.length = (
        int(leftover.pop('length').split()[0])
        if 'length' in leftover
        else art.text.count(" ")
    )

    art.metastring = str(leftover)
    art.project = self.options['project']
    return art
def _scrape_unit(self, _file):
    """unit: a pdf document

    Load the document with the password from self.options['pdf_password'],
    collect the text lines of every page and yield one Article holding the
    combined text. Headline, medium, section and date are filled in from
    self.getheadline and self.options; when no date option is set, today's
    date is used.
    """
    parser = PDFParser()
    doc = parser.load_document(_file, self.options['pdf_password'])

    page_chunks = []
    for page in parser.process_document(doc):
        # Each text line is terminated by a newline ...
        textlines = [
            textline.get_text() + "\n"
            for textline in parser.get_textlines(page)
        ]
        # ... and each page is followed by two more newlines.
        page_chunks.append("".join(textlines) + "\n\n")

    article = Article(text="".join(page_chunks))
    article.headline = self.getheadline(_file)
    article.medium = self.options['medium']
    article.section = self.options['section']
    article.date = self.options['date'] or date.today()
    yield article
def _scrape_unit(self, _file):
    """unit: a pdf document

    Yields a single Article whose text is the concatenation of all text
    lines of all pages in the document. The remaining fields come from
    self.getheadline(_file) and from the 'medium', 'section' and 'date'
    options; an unset date option falls back to date.today().
    """
    pdf = PDFParser()
    document = pdf.load_document(_file, self.options['pdf_password'])

    def page_texts():
        # One string per page: every text line followed by a newline.
        for page in pdf.process_document(document):
            yield "".join(
                entry.get_text() + "\n" for entry in pdf.get_textlines(page)
            )

    # Pages are separated (and the last one followed) by two newlines.
    full_text = "".join(chunk + "\n\n" for chunk in page_texts())

    article = Article(text=full_text)
    article.headline = self.getheadline(_file)
    article.medium = self.options['medium']
    article.section = self.options['section']

    configured_date = self.options['date']
    if configured_date:
        article.date = configured_date
    else:
        article.date = date.today()

    yield article
def _scrape_unit(self, _file):
    """
    Parse one mail file into a single Article and yield it.

    Everything up to and including the first line that starts with "1red"
    is kept as the mail header and stored in the metastring. The lines after
    it are the content: a comma-separated metadata line (date, section,
    medium), an upper-case headline, and paragraphs separated by empty lines.

    @param _file: file-like object offering readlines()

    @raise IndexError: when no line starts with "Date:", or when the lines
                       run out before a headline / any content is found

    @return generator yielding one Article-object
    """
    readlines = _file.readlines()
    file_date_line = [ln for ln in readlines if ln.startswith("Date:")][0]
    file_date = readDate(file_date_line.split("Date:")[1])

    # Split the mail into header and content. `lines` stays empty until the
    # "1red" marker has been seen; the marker line itself is still header.
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")

    article = Article(metastring={'mail_header': "".join(mail_header)})

    while True:  # loop through lines up to and including headline
        line = lines.pop(0)
        if line.isupper():  # headline
            article.headline = line
            break
        elif line:  # non-empty line before the headline: contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Abbreviated year: put "20" where the apostrophe was.
                pieces = datestr.split("'")
                datestr = pieces[0] + "20" + pieces[1]
            if "=" in datestr:
                # if this is true, the year is not parsable
                # we take the year the mail was sent, might fail around december
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = readDate(datestr)
                if (article.date - file_date).days > 200:
                    # likely a misparse, with the mail being sent the next
                    # year. timedelta has no `years` argument, so move the
                    # year back directly.
                    previous_year = article.date.year - 1
                    try:
                        article.date = article.date.replace(
                            year=previous_year)
                    except ValueError:
                        # 29 February does not exist in the previous year.
                        article.date = article.date.replace(
                            year=previous_year, day=28)
            else:
                article.date = readDate(datestr)

            # The medium is the third field; translate it when an alias is
            # known. The lookup key must be the same field that was tested.
            medium_str = BZK_ALIASES.get(data[2], data[2])
            article.medium = Medium.get_or_create(medium_str)
            article.section = data[1]

    # Collect paragraphs. NOTE: a paragraph is only stored when an empty
    # line closes it; text after the last empty line is not kept.
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]

    text_parts = []
    for p in paragraphs:
        text_parts.append(p + "\n\n")
        if p.startswith("(") and len(p.split(",")) > 1:
            # last line of the regular content
            break
    text = "".join(text_parts)

    # Add non-ascii characters
    # Takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        if code == "92":
            return "'"
        elif code == "85":
            return "..."
        # Works on Python 2 and 3 alike (no 'string-escape' codec needed).
        return bytearray([int(code, 16)]).decode('latin-1')

    # Only real hexadecimal pairs are matched, so sequences such as "=GZ"
    # are left alone instead of breaking the conversion.
    article.text = re.sub(r"=[0-9A-F]{2}", character, text)
    yield article
def _scrape_unit(self, _file):
    """
    Parse one mail file into a single Article and yield it.

    Everything up to and including the first line that starts with "1red"
    is kept as the mail header and stored in the metastring. The lines after
    it are the content: a comma-separated metadata line (date, section,
    medium), an upper-case headline, and paragraphs separated by empty lines.

    @param _file: file-like object offering readlines()

    @raise IndexError: when no line starts with "Date:", or when the lines
                       run out before a headline / any content is found

    @return generator yielding one Article-object
    """
    readlines = _file.readlines()
    file_date_line = [ln for ln in readlines if ln.startswith("Date:")][0]
    file_date = readDate(file_date_line.split("Date:")[1])

    # Split the mail into header and content. `lines` stays empty until the
    # "1red" marker has been seen; the marker line itself is still header.
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")

    article = Article(metastring={'mail_header': "".join(mail_header)})

    while True:  # loop through lines up to and including headline
        line = lines.pop(0)
        if line.isupper():  # headline
            article.headline = line
            break
        elif line:  # non-empty line before the headline: contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Abbreviated year: put "20" where the apostrophe was.
                pieces = datestr.split("'")
                datestr = pieces[0] + "20" + pieces[1]
            if "=" in datestr:
                # if this is true, the year is not parsable
                # we take the year the mail was sent, might fail around december
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = readDate(datestr)
                if (article.date - file_date).days > 200:
                    # likely a misparse, with the mail being sent the next
                    # year. timedelta has no `years` argument, so move the
                    # year back directly.
                    previous_year = article.date.year - 1
                    try:
                        article.date = article.date.replace(
                            year=previous_year)
                    except ValueError:
                        # 29 February does not exist in the previous year.
                        article.date = article.date.replace(
                            year=previous_year, day=28)
            else:
                article.date = readDate(datestr)

            # The medium is the third field; translate it when an alias is
            # known. The lookup key must be the same field that was tested.
            medium_str = BZK_ALIASES.get(data[2], data[2])
            article.medium = Medium.get_or_create(medium_str)
            article.section = data[1]

    # Collect paragraphs. NOTE: a paragraph is only stored when an empty
    # line closes it; text after the last empty line is not kept.
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]

    text_parts = []
    for p in paragraphs:
        text_parts.append(p + "\n\n")
        if p.startswith("(") and len(p.split(",")) > 1:
            # last line of the regular content
            break
    text = "".join(text_parts)

    # Add non-ascii characters
    # Takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        if code == "92":
            return "'"
        elif code == "85":
            return "..."
        # Works on Python 2 and 3 alike (no 'string-escape' codec needed).
        return bytearray([int(code, 16)]).decode('latin-1')

    # Only real hexadecimal pairs are matched, so sequences such as "=GZ"
    # are left alone instead of breaking the conversion.
    article.text = re.sub(r"=[0-9A-F]{2}", character, text)
    yield article