def scrape_file(self, _html, t):
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')]
    for div in divs:
        article = HTMLDocument()
        article.props.html = div
        article.props.headline = div.cssselect("#articleTitle")[0].text_content()
        article.props.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.props.pagenr, article.props.section = self.get_pagenum(articlepage[0].text)
        if not div.cssselect("#sourceTitle")[0].text:
            article.props.medium = Medium.get_or_create("unknown medium")
        else:
            article.props.medium = Medium.get_or_create(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.props.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def get_medium(self, text):
    if not text:
        text = "unknown"
    if text in MEDIUM_ALIASES:
        return Medium.get_or_create(MEDIUM_ALIASES[text])
    else:
        return Medium.get_or_create(text)
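# A minimal sketch (not from the source) of the alias lookup that get_medium
# performs; MEDIUM_ALIASES_EXAMPLE is an invented stand-in for the real
# MEDIUM_ALIASES mapping.
MEDIUM_ALIASES_EXAMPLE = {"vk": "de Volkskrant", "nrc.nl": "NRC Handelsblad"}

def resolve_medium_name(text, aliases=MEDIUM_ALIASES_EXAMPLE):
    # Empty input falls back to "unknown"; known aliases map to their
    # canonical name; any other label is used verbatim.
    if not text:
        return "unknown"
    return aliases.get(text, text)

assert resolve_medium_name(None) == "unknown"
assert resolve_medium_name("vk") == "de Volkskrant"
assert resolve_medium_name("Trouw") == "Trouw"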
def test_medium(self):
    import functools
    header = ('kop', 'datum', 'tekst', 'med')
    data = [('kop1', '2001-01-01', '', 'Bla')]
    test = functools.partial(_run_test_csv, header, data,
                             text="tekst", headline="kop", date="datum")

    articles = test(medium_name=None, medium="med")
    self.assertEqual(len(articles), 1)
    self.assertEqual(articles[0].medium.name, "Bla")

    articles = test(medium_existing=Medium.get_or_create("1").id)
    self.assertEqual(len(articles), 1)
    self.assertEqual(articles[0].medium.name, "1")

    # a per-row "medium" column overrides an existing medium ...
    articles = test(medium_existing=Medium.get_or_create("1").id, medium="med")
    self.assertEqual(len(articles), 1)
    self.assertEqual(articles[0].medium.name, "Bla")

    # ... and overrides a medium_name as well
    articles = test(medium_name="bla2", medium="med")
    self.assertEqual(len(articles), 1)
    self.assertEqual(articles[0].medium.name, "Bla")

    # an existing medium takes precedence over a medium_name
    articles = test(medium_name="bla2", medium_existing=Medium.get_or_create("2").id)
    self.assertEqual(len(articles), 1)
    self.assertEqual(articles[0].medium.name, "2")
def create_medium(self, medium):
    if not medium:
        medium = "unknown"
    if medium in MEDIUM_ALIASES:
        return Medium.get_or_create(MEDIUM_ALIASES[medium])
    else:
        return Medium.get_or_create(medium)
def create_medium(self, html):
    if not html.text:
        medium = "unknown"
    else:
        medium = html.text
    if medium in MEDIUM_ALIASES:
        return Medium.get_or_create(MEDIUM_ALIASES[medium])
    else:
        return Medium.get_or_create(medium)
def _scrape_unit(self, unit):
    (scraper, unit) = unit
    for article in scraper._scrape_unit(unit):
        if not article.is_comment:
            article.props.medium = Medium.get_or_create(scraper.medium_name)
        else:
            article.props.medium = Medium.get_or_create(scraper.medium_name + " - Comments")
        if not hasattr(article.props, 'text'):
            article.props.text = pformat(article.props.results)
        yield article
def _scrape_unit(self, unit):
    (scraper, unit) = unit
    self.medium_name = scraper.medium
    for article in scraper._scrape_unit(unit):
        if not article.is_comment:
            article.props.medium = Medium.get_or_create(scraper.medium)
        else:
            article.props.medium = Medium.get_or_create(scraper.medium + " - Comments")
        if not hasattr(article.props, 'text'):
            article.props.text = pformat(article.props.results)
        yield article
def _get_units(self):
    self.medium = Medium.get_or_create(self.medium_name)
    # The crawling code below was disabled in favour of reading cached
    # units from UNIT_FILE; kept here (as a string) for reference.
    """
    doc = self.getdoc(self.index_url)
    skip = True
    for li in doc.cssselect("ol.childforum li.forumbit_post"):
        forum_url = urljoin(doc.base_url, li.cssselect("h2.forumtitle a")[0].get('href'))
        if START_AT[0] in forum_url:
            skip = False
        if skip:
            continue
        for page in self.__get_pages(forum_url):
            for li in page.cssselect("#threads li.threadbit"):
                try:
                    unit = li.cssselect("h3.threadtitle a")[0].get('href')
                except IndexError as e:
                    print(e)
                else:
                    print(unit, file=UNIT_FILE)
                    yield unit
    """
    # NB: sets are unordered, so the skip_until resume point depends on
    # iteration order.
    units = set(map(str.strip, UNIT_FILE.readlines()))
    skip_until = "wie-schrijft-blijft/359215"
    for unit in units:
        if skip_until in unit:
            skip_until = ""
            yield unit
def _scrape_unit(self, fn):
    dest = StringIO()
    with self.ftp() as ftp:
        ftp.retrbinary(b'RETR %s' % (fn.encode('latin-1')), dest.write)
    body = STLtoText(dest.getvalue())
    body = body.decode('latin-1', 'ignore').strip().lstrip('888').strip()
    title = fn.split('/')[-1]
    medium = title.split('-')[-1].split('.stl')[0].strip().lower()
    date = getDate(title)
    if medium == 'nos journaal' and int(format(date, '%H')) == 20 and int(format(date, '%M')) == 0:
        medium = 'nos journaal 20:00'
    med = Medium.get_or_create(medium)
    if med.id in mediadict:
        print("saving %s as %s" % (med.id, mediadict[med.id]))
        med = Medium.objects.get(id=mediadict[med.id])
    headline = "%s (%s)" % (medium, fn.replace('.stl', '').strip())
    art = Article(headline=headline, text=body, medium=med, date=date, url=fn)
    yield art
def _create_medium(self, source):
    try:
        Medium.objects.get(name__iexact=source)
    except Medium.DoesNotExist:
        lang = Language.objects.get(id=1)
        Medium(name=source, abbrev=source[0:5], circulation=1, language=lang).save()
def _get_units(self):
    self.medium = Medium.get_or_create(self.medium_name)
    d = self.options['date']
    data = json.loads(self.open("http://www.telegraaf.nl/telegraaf-i/newspapers").read())
    self.paperdata = [i for i in data
                      if i['date'] == "{}-{:02d}-{:02d}".format(d.year, d.month, d.day)][0]
    articles = []
    for page in self.paperdata['pages']:
        articles += page['articles']
    for article_id in articles:
        yield article_id
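# Sketch of the JSON shape that _get_units above appears to assume from the
# telegraaf-i endpoint: a list of issues with a "date" string and "pages"
# carrying article ids. The concrete values here are invented.
example_issue = {"date": "2014-01-15",
                 "pages": [{"articles": [11, 12]}, {"articles": [13]}]}
ids = []
for page in example_issue["pages"]:
    ids += page["articles"]
assert ids == [11, 12, 13]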
def _scrape_unit(self, ftuple):
    title, url, body = ftuple
    date = getDate(url)
    medium = title.lower()
    med = Medium.get_or_create(medium)
    art = Article(headline=medium, text=body, medium=med, date=date, url=url)
    yield art
def _medium(self):
    if self.options["medium"]:
        # a per-article medium column is used, so there is no single medium
        return
    if self.options['medium_existing']:
        return self.options['medium_existing']
    if self.options['medium_name']:
        med = Medium.get_or_create(self.options['medium_name'])
        self.options['medium_existing'] = med
        return med
    raise ValueError("No medium specified!")
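# A toy restatement (an assumption, not source code) of the precedence that
# _medium implements: a per-row "medium" column means there is no single
# medium, then an already existing medium wins, then a new name is created.
def medium_precedence(options):
    if options.get("medium"):
        return "per-row column"
    if options.get("medium_existing"):
        return "existing medium"
    if options.get("medium_name"):
        return "create by name"
    raise ValueError("No medium specified!")

assert medium_precedence({"medium": "med", "medium_name": "x"}) == "per-row column"
assert medium_precedence({"medium_existing": 3, "medium_name": "x"}) == "existing medium"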
def __init__(self, *args, **kargs):
    super(Scraper, self).__init__(*args, **kargs)
    self.medium = Medium.get_or_create(self.medium_name)
    self.project = self.options['project']
    for k, v in self.options.items():
        if type(v) == str:
            self.options[k] = v.decode('utf-8')
    # avoid django problem/bug with repr(File(open(unicode-string)))
    # https://code.djangoproject.com/ticket/8156
    o2 = {k: v for k, v in self.options.iteritems() if k != 'file'}
    log.debug(u"Articleset: {self.articleset!r}, options: {o2}".format(**locals()))
def _scrape_unit(self, unit):
    (scraper, unit, rank) = unit
    for article in scraper._scrape_unit(unit):
        article.props.medium = Medium.get_or_create(scraper.source)
        article.props.rank = rank
        for attr in ['headline', 'author', 'text']:
            if hasattr(article.props, attr):
                value = getattr(article.props, attr)
                if isinstance(value, (str, unicode)):
                    value = value.strip()
                setattr(article.props, attr, value)
        yield article
def body_to_article(headline, byline, text, date, source, meta):
    """
    Create an Article-object based on given parameters. It raises an error
    (Medium.DoesNotExist) when the given source does not have an entry in
    the database.

    @param headline: headline of new Article-object
    @type headline: str

    @param byline: byline for new Article
    @type byline: NoneType, str

    @param text: text for new Article
    @type text: str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: str

    @param meta: object containing all sorts of meta-information, most of
                 it suitable for metastring. However, some information
                 (author, length) will be extracted.
    @type meta: dictionary

    @return Article-object
    """
    log.debug("Creating article object for {headline!r}".format(**locals()))

    art = Article(headline=headline, byline=byline, text=text, date=date)
    art.medium = Medium.get_or_create(source)

    # Author / Section
    meta = meta.copy()
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)

    if 'length' in meta:
        art.length = int(meta.pop('length').split()[0])
    else:
        art.length = art.text.count(" ")

    if 'url' in meta:
        art.url = meta.pop('url')
        art.url = re.sub(r"\s+", "", art.url)

    art.metastring = str(meta)
    return art
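# Small runnable check (invented values) of the "length" handling in
# body_to_article: the field is parsed from a string like "312 words",
# mirroring int(meta.pop('length').split()[0]) above.
meta = {"length": "312 words", "author": "J. Doe"}
assert int(meta.pop("length").split()[0]) == 312
assert meta == {"author": "J. Doe"}  # whatever remains ends up in metastring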
def _scrape_unit(self, fn):
    dest = StringIO()
    self._ftp.retrbinary(b'RETR %s' % (fn.encode('latin-1')), dest.write)
    body = STLtoText(dest.getvalue())
    body = body.decode('latin-1', 'ignore').strip().lstrip('888').strip()
    title = fn.split('/')[-1]
    medium = title.split('-')[-1].split('.stl')[0].strip().lower()
    date = getDate(title)
    if medium == 'nos journaal' and int(format(date, '%H')) == 20 and int(format(date, '%M')) == 0:
        medium = 'nos journaal 20:00'
    if medium in mediadict:
        medium = mediadict[medium]
    med = Medium.get_or_create(medium)
    art = Article(headline=medium, text=body, medium=med, date=date, url=fn)
    yield art
def body_to_article(headline, byline, text, date, source, meta):
    """
    Create an Article-object based on given parameters. It raises an error
    (Medium.DoesNotExist) when the given source does not have an entry in
    the database.

    @param headline: headline of new Article-object
    @type headline: unicode / str

    @param byline: byline for new Article
    @type byline: NoneType, unicode, str

    @param text: text for new Article
    @type text: unicode / str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: unicode / str

    @param meta: object containing all sorts of meta-information, most of
                 it suitable for metastring. However, some information
                 (author, length) will be extracted.
    @type meta: dictionary

    @return Article-object
    """
    log.debug("Creating article object for {headline!r}".format(**locals()))

    art = Article(headline=headline, byline=byline, text=text, date=date)
    art.medium = Medium.get_or_create(source)

    # Author / Section
    meta = meta.copy()
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)

    if 'length' in meta:
        art.length = int(meta.pop('length').split()[0])
    else:
        art.length = art.text.count(" ")

    art.metastring = str(meta)
    return art
def _scrape_unit(self, ftuple):
    title, url, body = ftuple
    date = getDate(url)
    medium = title.lower()
    med = Medium.get_or_create(medium)
    headline = "%s (%s)" % (medium, url.split('/')[-1].replace('.stl', '').strip())
    art = Article(headline=headline, text=body, medium=med, date=date, url=url)
    yield art
def parse_document(self, file):
    if file:
        dirname, filename = os.path.split(file.name)
        filename, ext = os.path.splitext(filename)
    else:
        dirname, filename, ext = None, None, None

    metadata = dict((k, v) for (k, v) in self.options.items()
                    if k in ["headline", "project", "date", "section"])
    metadata["medium"] = Medium.get_or_create(self.options['medium'])

    if not metadata["date"]:
        datestring, filename = filename.split("_", 1)
        metadata["date"] = toolkit.read_date(datestring)
    if not metadata["headline"].strip():
        metadata["headline"] = filename
    if not metadata["section"].strip():
        metadata["section"] = dirname

    if file:
        convertors = None
        if ext.lower() == ".docx":
            convertors = [_convert_docx, _convert_doc]
        elif ext.lower() == ".doc":
            convertors = [_convert_doc, _convert_docx]
        elif ext.lower() == ".pdf":
            convertors = [_convert_pdf]
        if convertors:
            text = _convert_multiple(file, convertors)
        else:
            text = file.text
    else:
        text = self.options['text']
    return Article(text=text, **metadata)
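# _convert_multiple is referenced above but not shown in this section; a
# plausible first-success fallback over the convertor list might look like
# the sketch below (an assumption, not the actual implementation; "file" is
# taken to be a Django File-like object with a .name attribute).
def _convert_multiple_sketch(file, convertors):
    for convert in convertors:
        try:
            return convert(file)
        except Exception:
            continue
    raise ValueError("Could not convert {}".format(file.name))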
def _postprocess_article(self, article):
    """
    Finalize an article. This should convert the output of _scrape_unit
    to the required output for scrape_unit, e.g. convert to Article,
    add project and/or medium
    """
    comment = False
    if isinstance(article, Document):
        if hasattr(article, 'is_comment') and article.is_comment:
            if not hasattr(self, 'comment_medium'):
                self.comment_medium = Medium.get_or_create(self.medium_name + " - Comments")
            comment = True
        article = article.create_article()
    if comment:
        _set_default(article, "medium", self.comment_medium)
    else:
        _set_default(article, "medium", self.medium)
    _set_default(article, "project", self.project)
    article.scraper = self
    return article
def parse_document(self, tupleText):
    meta, body = tupleText
    meta = meta.strip().split('\n')
    kargs = {}
    kargs['externalid'] = int(meta[0].split('.')[0].lstrip('?'))
    kargs['headline'] = meta[0].partition('. ')[2]
    medium_name, date, pagenr, length = meta[2].split(', ')
    kargs['medium'] = Medium.get_or_create(medium_name)
    kargs['date'] = readDate(date)
    kargs['pagenr'] = int(pagenr.strip('p.'))
    kargs['length'] = int(length.strip('w.'))
    body = body.split('\n')
    kargs['section'] = body[2]
    kargs['text'] = '\n'.join(body[5:])
    kargs['project'] = self.options['project']
    return Article(**kargs)
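# Runnable illustration (the example line itself is invented) of the
# metadata format parse_document expects on meta[2]:
# "medium, date, page number, length".
meta_line = "De Telegraaf, 2012-03-05, p.7, 341 w."
medium_name, date, pagenr, length = meta_line.split(", ")
assert medium_name == "De Telegraaf"
assert int(pagenr.strip("p.")) == 7
assert int(length.strip("w.")) == 341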
from amcat.models.medium import Medium
from amcat.models.article import Article
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES as aliases

for alias, medium in aliases.items():
    if alias != medium:
        print(alias, " > ", medium)
        # change all articles in project 29 from the alias to the real medium
        alias = Medium.get_or_create(alias)
        articles = Article.objects.filter(medium=alias.id, project_id=29)
        print("{} articles".format(articles.count()))
        articles.update(medium=Medium.get_or_create(medium).id)
        # if the alias medium is now empty, delete it
        if Article.objects.filter(medium=alias.id).count() == 0:
            print('deleting...')
            alias.delete()
    else:
        print('alias is no alias')
def media(request):
    canadd = Medium.can_create(request.user)
    media = Datatable(MediumResource)
    return render(request, 'navigator/report/media.html', locals())
def _scrape_unit(self, _file):
    readlines = _file.readlines()
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = readDate(file_date_line.split("Date:")[1])
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")
    article = Article(metastring={'mail_header': "".join(mail_header)})
    while True:  # loop through lines up to and including the headline
        line = lines.pop(0)
        if line.isupper():  # headline
            article.headline = line
            break
        elif line:  # first non-empty line, contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]
            if "=" in datestr:
                # if this is true, the year is not parsable; we take the year
                # the mail was sent, which might fail around december
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = readDate(datestr)
                if (article.date - file_date).days > 200:
                    # likely a misparse, with the mail being sent the next
                    # year; subtract a year (timedelta cannot express years)
                    article.date = article.date.replace(year=article.date.year - 1)
            else:
                article.date = readDate(datestr)
            if data[2] in BZK_ALIASES:
                # map a known alias of the medium label to its canonical name
                medium_str = BZK_ALIASES[data[2]]
            else:
                medium_str = data[2]
            article.medium = Medium.get_or_create(medium_str)
            article.section = data[1]
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]
    article.text = ""
    for p in paragraphs:
        article.text += p + "\n\n"
        if p.startswith("(") and len(p.split(",")) > 1:
            # last line of regular content
            break

    # Add non-ascii characters:
    # takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        char = r"\x{}".format(code).decode('string-escape').decode('latin-1')
        if code == "92":
            return "'"
        elif code == "85":
            return "..."
        return char

    article.text = re.sub("=[A-Z0-9]{2}", character, article.text)
    yield article
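# The "=AB" escapes that character() rewrites above are quoted-printable
# encoding; a sketch of a stdlib alternative that decodes a whole body at
# once (an option, not what the scraper does):
import quopri
assert quopri.decodestring("caf=E9").decode("latin-1") == u"caf\xe9"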