def _scrape_unit(self, fn):
    """Download STL subtitle file *fn* over FTP and yield it as an Article.

    The medium name is derived from the file name; the 20:00 broadcast of
    'nos journaal' is mapped to its own medium.
    """
    # BUG FIX: retrbinary's callback receives bytes in Python 3, so the
    # buffer must be a BytesIO — writing bytes to a StringIO raises TypeError.
    from io import BytesIO
    dest = BytesIO()
    with self.ftp() as ftp:
        ftp.retrbinary(b'RETR %s' % (fn.encode('latin-1')), dest.write)
    body = STLtoText(dest.getvalue())
    # Strip the leading teletext '888' subtitle-page marker.
    body = body.decode('latin-1', 'ignore').strip().lstrip('888').strip()
    title = fn.split('/')[-1]
    medium = title.split('-')[-1].split('.stl')[0].strip().lower()
    date = getDate(title)
    # The 20:00 broadcast gets its own medium name.
    if medium == 'nos journaal' and int(format(date, '%H')) == 20 and int(format(date, '%M')) == 0:
        medium = 'nos journaal 20:00'
    med = Medium.get_or_create(medium)
    if med.id in mediadict:
        # Remap aliased media ids to their canonical medium.
        print("saving %s as %s" % (med.id, mediadict[med.id]))
        med = Medium.objects.get(id=mediadict[med.id])
    headline = "%s (%s)" % (medium, fn.replace('.stl', '').strip())
    art = Article(headline=headline, text=body, medium=med, date=date, url=fn)
    yield art
def create_test_article(create=True, articleset=None, deduplicate=True, properties=None, **kargs):
    """Create a test article"""
    from amcat.models.article import Article

    # Static properties, with generated defaults where the caller gave none.
    title = kargs.pop("title", "test title {}: {}".format(_get_next_id(), uuid4()))
    date = kargs.pop("date", datetime.datetime.now())
    url = kargs.pop("url", "http://example.com")
    text = kargs.pop("text", "Lorum Ipsum: {}".format(_get_next_id()))
    default_project = articleset.project if articleset is not None else create_test_project()
    project = kargs.pop("project", default_project)
    parent_hash = kargs.pop("parent_hash", None)
    hash = kargs.pop("hash", None)

    # Caller is allowed to pass date as a string
    if isinstance(date, str):
        date = _parse_date(date)

    article = Article(title=title, date=date, url=url, text=text, project=project,
                      parent_hash=parent_hash, hash=hash)

    if properties:
        for propname, value in properties.items():
            # Date-typed properties may also be given as strings.
            if get_property_primitive_type(propname) == datetime.datetime and isinstance(value, str):
                properties[propname] = _parse_date(value)
        article.properties.update(properties)

    if create:
        Article.create_articles([article], articleset, deduplicate=deduplicate)
    return article
def parse_document(self, row):
    """Build an Article from one CSV row according to the configured field mapping."""
    fields = dict(medium=self._medium)
    for fieldname in FIELDS:
        csvfield = self.options[fieldname]
        if not csvfield:
            continue
        val = row[csvfield]
        if val.strip():
            if fieldname in PARSERS:
                val = PARSERS[fieldname](val)
        elif is_nullable(fieldname):
            # Empty value for a nullable field becomes NULL.
            val = None
        else:
            val = val.strip()
        fields[fieldname] = val

    # In case medium wasn't defined in csv
    if self._medium is not None:
        fields["medium"] = self._medium

    if self.parent_field:
        doc_id = fields.get(self.id_field)
        parent_id = fields.pop(self.parent_field)
        if parent_id:
            self.parents[doc_id] = parent_id

    article = Article(**fields)
    if self.parent_field:
        self.articles[doc_id] = article
    return article
def getarticle(self, headline, lines):
    """Assemble an Article from a headline and the raw text lines of one item."""
    article = Article(headline=headline)

    # Body: every sufficiently long line from the third line on,
    # de-hyphenated and unwrapped into a single paragraph.
    body = "".join("\n" + line for line in lines[2:] if len(line) > 2)
    body = body.replace("-\n", "").replace(" ", " ").replace("\n", " ")
    article.text = body

    # Second line holds the dd-mm-yyyy date and optionally "(p.N)".
    date_match = re.compile("([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})").search(lines[1])
    article.date = date(int(date_match.group(3)),
                        int(date_match.group(2)),
                        int(date_match.group(1)))
    page_match = re.compile("\(p.([0-9]+)([0-9\-]+)?\)").search(lines[1])
    if page_match:
        article.pagenr = int(page_match.group(1))

    # Look up the medium via the index of (headline, medium) pairs.
    for indexed_headline, medium in self.index:
        if article.headline.lower().strip() in indexed_headline.lower().strip():
            article.set_property("medium", self.get_medium(medium))
    return article
def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        # div.get('id') can be None; guard with '' so membership test is safe.
        divs = [div for div in _html.cssselect("#sort div")
                if "sort_" in (div.get('id') or "")]
    else:
        # BUG FIX: previously fell through with 'divs' undefined, causing a
        # confusing NameError below; fail with an explicit message instead.
        raise ValueError("Unrecognized page type: {t!r}".format(t=t))
    for div in divs:
        article = Article(metastring={})
        article.metastring['html'] = div
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(articlepage[0].text)
        article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def parse_document(self, file):
    """Create an Article from a .doc/.docx file.

    Metadata comes from the options; date, headline and section fall back to
    values derived from the file name and directory.
    """
    dirname, filename = os.path.split(file.name)
    filename, ext = os.path.splitext(filename)
    metadata = dict((k, v) for (k, v) in self.options.items()
                    if k in ["medium", "headline", "project", "date", "section"])
    if not metadata["date"]:
        # File names are expected to start with '<datestring>_'.
        datestring, filename = filename.split("_", 1)
        metadata["date"] = toolkit.read_date(datestring)
    # BUG FIX: this fallback was duplicated verbatim; once is enough.
    if not metadata["headline"].strip():
        metadata["headline"] = filename
    if not metadata["section"].strip():
        metadata["section"] = dirname
    # Try the convertor matching the extension first, the other as fallback.
    convertors = None
    if ext.lower() == ".docx":
        convertors = [_convert_docx, _convert_doc]
    elif ext.lower() == ".doc":
        convertors = [_convert_doc, _convert_docx]
    if convertors:
        text = _convert_multiple(file, convertors)
    else:
        text = file.text
    return Article(text=text, **metadata)
def _scrape_unit(self, url):
    """Yield an Article for one officiele-bekendmakingen document URL.

    Skips (returns without yielding) when the XML or its metadata cannot
    be retrieved yet.
    """
    try:
        xml = self.getdoc(url)
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt etc. escape.
        log.warn("COULD NOT FIND XML FOR %s" % url)
        return
    url = url.replace('.xml', '.html')
    metadict = self.getMetaDict(xml, printit=False)
    if len(metadict) == 0:
        log.warn(
            "NO METADATA FOR %s. SKIPPING ARTICLE (to be retrieved after officiele bekendmakingen finalizes it)"
            % url)
        return
    section = self.safeMetaGet(metadict, 'OVERHEID.category')
    document_id = metadict['DC.identifier']
    if document_id.count('-') == 1:
        # NOTE(review): if neither chamber matches, `kamer` is undefined
        # (NameError); the 'NA' default was deliberately commented out —
        # confirm whether that can happen in practice.
        #kamer = 'NA'
        if 'tweede' in metadict['DC.creator'].lower():
            kamer = 'tk'
        if 'eerste' in metadict['DC.creator'].lower():
            kamer = 'ek'
        document_id = document_id.replace('-', '-%s-' % kamer)
    author = self.safeMetaGet(metadict, 'OVERHEIDop.ontvanger')
    # Bare excepts narrowed to KeyError: these are metadata-key fallbacks.
    try:
        archieftype = metadict['OVERHEIDop.ArchiefType']
    except KeyError:
        archieftype = metadict['DC.type']
    aanleiding = metadict['DC.title']
    try:
        vraagnummer = metadict['OVERHEIDop.vraagnummer'].strip()
    except KeyError:
        vraagnummer = self.safeMetaGet(metadict, 'OVERHEIDop.vraagNummer').strip()
    headline = ("%s | %s - %s" % (document_id, archieftype, vraagnummer)).strip()
    try:
        datestring = adhocDateFix(metadict['OVERHEIDop.datumOntvangst'])
    except KeyError:
        # Fall back to the publication date and mark it in the headline.
        datestring = adhocDateFix(metadict['OVERHEIDop.datumIndiening'])
        headline += " (publicatiedatum)"
    # Dates occur in both ISO and Dutch day-first order.
    try:
        date = datetime.datetime.strptime(datestring, '%Y-%m-%d')
    except ValueError:
        date = datetime.datetime.strptime(datestring, '%d-%m-%Y')
    body = "%s\n\n%s" % (aanleiding, self.getBody(xml))
    print("SAVING: %s" % url)
    yield Article(headline=headline, byline=vraagnummer, text=body,
                  date=date, section=section, url=url)
def parse_file(self, file, encoding, data):
    """Yield Articles from pre-parsed data, mapped through the field_map options."""
    self.ln_query, arts = data
    field_map = self.options['field_map'].items()
    for record in arts:
        fields = {}
        for field, setting in field_map:
            # 'field' settings read from the record; anything else is literal.
            if setting['type'] == 'field':
                value = record.get(setting['value'])
            else:
                value = setting['value']
            if value:
                fields[field] = value
        yield Article(**fields)
def parse_document(self, row):
    """Map one CSV row onto an Article using the configured field options."""
    fields = {"medium": self.options["medium"]}
    for fieldname in FIELDS:
        column = self.options[fieldname]
        if not column:
            continue
        value = row[column]
        # Some fields (e.g. dates) need parsing before use.
        if fieldname in PARSERS:
            value = PARSERS[fieldname](value)
        fields[fieldname] = value
    return Article(**fields)
def _scrape_unit(self, url):
    """Yield an Article for one kamervraag document URL.

    Skips (returns without yielding) when the XML or its metadata cannot
    be retrieved yet.
    """
    try:
        xml = self.getdoc(url)
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt etc. escape.
        log.warn("COULD NOT FIND XML FOR %s" % url)
        return
    url = url.replace('.xml', '.html')
    metadict = self.getMetaDict(xml, printit=False)
    if len(metadict) == 0:
        log.warn(
            "NO METADATA FOR %s. SKIPPING ARTICLE (to be retrieved after officiele bekendmakingen finalizes it)"
            % url)
        return
    section = self.safeMetaGet(metadict, 'OVERHEID.category')
    document_id = metadict['DC.identifier'].strip()
    if document_id.count('-') == 1:
        #kamer = 'NA'
        if 'tweede' in metadict['DC.creator'].lower():
            kamer = 'tk'
        if 'eerste' in metadict['DC.creator'].lower():
            kamer = 'ek'
        document_id = document_id.replace('-', '-%s-' % kamer)
    print('document id:', document_id)
    author = self.safeMetaGet(metadict, 'OVERHEIDop.indiener')
    typevraag = metadict['DC.type']
    body = self.getBody(xml)
    # BUG FIX: the literal string "document_id" was formatted instead of the
    # variable. NOTE(review): this headline is still unused below — the
    # Article is created with headline=document_id; confirm intent.
    headline = "%s (%s)" % (document_id, author)
    try:
        datestring = adhocDatefix(metadict['OVERHEIDop.datumOntvangst'])
    except KeyError:
        # Fall back to the publication date and mark it in the headline.
        datestring = adhocDatefix(metadict['OVERHEIDop.datumIndiening'])
        headline += " (publicatiedatum)"
    # Dates occur in both ISO and Dutch day-first order.
    try:
        date = datetime.datetime.strptime(datestring, '%Y-%m-%d')
    except ValueError:
        date = datetime.datetime.strptime(datestring, '%d-%m-%Y')
    print("SAVING: %s" % url)
    yield Article(headline=document_id, byline=typevraag, text=body,
                  date=date, section=section, url=url)
def get_article(e):
    """Build an Article from element *e*, resolving the medium to a Medium object."""
    headline = get_headline(e)
    body = get_body(e)
    medium_name, date, page = get_meta(e)
    section = get_section(e)
    medium = get_or_create(Medium, name=medium_name)
    return Article(
        headline=headline,
        text=body,
        date=date,
        pagenr=page,
        section=section,
        medium=medium,
    )
def parse_file(self, file, encoding, data):
    """Yield Articles from pre-parsed data, coercing date-typed fields.

    Each field_map entry either reads a key from the record ('field') or
    supplies a literal value; string values for date-typed properties are
    parsed into datetimes.
    """
    self.ln_query, arts = data
    for record in arts:
        art = {}
        for field, setting in self.options['field_map'].items():
            datatype = get_property_primitive_type(field)
            value, typ = setting['value'], setting['type']
            val = record.get(value) if typ == 'field' else value
            if val:
                # IDIOM FIX: isinstance() instead of `type(val) is str`
                # (also covers str subclasses).
                if datatype is datetime.datetime and isinstance(val, str):
                    val = toolkit.read_date(val)
                art[field] = val
        yield Article(**art)
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs):
    """Create a test article"""
    from amcat.models.article import Article

    # Fill in defaults; factories are only invoked when the key is missing.
    if "project" not in kargs:
        kargs["project"] = create_test_project()
    kargs.setdefault("date", "2000-01-01")
    if "medium" not in kargs:
        kargs["medium"] = create_test_medium()
    if "id" not in kargs:
        kargs["id"] = _get_next_id()
    kargs.setdefault("headline", "test headline")

    article = Article(**kargs)
    if create:
        Article.create_articles([article], articleset, check_duplicate=check_duplicate)
    return article
def body_to_article(self, headline, byline, text, date, source, meta):
    """
    Build an Article from the given parts.

    Raises Medium.DoesNotExist when *source* has no entry in the database.

    @param headline: headline of new Article-object
    @type headline: unicode / str
    @param byline: byline for new Article
    @type byline: NoneType, unicode, str
    @param text: text for new Article
    @type text: unicode / str
    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime
    @param source: medium-label for new Article
    @type source: unicode / str
    @param meta: dictionary of meta-information; 'author', 'section' and
        'length' are extracted onto the Article, the remainder is stored
        as the metastring
    @type meta: dictionary
    @return Article-object
    """
    log.debug("Creating article object for {headline!r}".format(**locals()))

    art = Article(headline=headline, byline=byline, text=text, date=date)
    art.medium = get_or_create(Medium, name=source)

    # Extract author/section/length; don't mutate the caller's dict.
    meta = meta.copy()
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)
    if 'length' in meta:
        # e.g. "309 words" -> 309
        art.length = int(meta.pop('length').split()[0])
    else:
        # No explicit length: approximate with a whitespace count.
        art.length = art.text.count(" ")

    art.metastring = str(meta)
    art.project = self.options['project']
    return art
def parse_item(self, item):
    """Parse one article, given as a list of html tags."""
    article = Article(metastring={})
    article.text = self._parse_text(item)
    for tag in item:
        if tag.tag == "h2":
            # The headline is the tag text, or lives in a nested <span>.
            article.headline = tag.text or tag.cssselect("span")[0].text_content()
        elif tag.tag == "i" or (tag.tag == "p" and tag.cssselect("i")):
            article = self.parse_dateline(tag.text_content(), article)
    if not article.headline:
        raise Exception("Article has no headline")
    return article
def _scrape_unit(self, ftuple):
    """Yield an Article built from a (title, url, body) tuple."""
    title, url, body = ftuple[:3]
    date = getDate(url)
    medium = title.lower()
    med = Medium.get_or_create(medium)
    filename = url.split('/')[-1].replace('.stl', '').strip()
    headline = "%s (%s)" % (medium, filename)
    yield Article(headline=headline, text=body, medium=med, date=date, url=url)
def create_article(self):
    """Convert the document object into an article"""
    art = Article()
    # Properties listed in _ARTICLE_PROPS are set directly on the Article;
    # everything else is collected into Article.metastring.
    extra = dict()
    for prop, value in self.getprops().items():
        converted = self._convert(value)
        if prop in _ARTICLE_PROPS:
            setattr(art, prop, converted)
        else:
            extra[prop] = converted
    art.metastring = str(extra)
    self.article = art
    return art
def get_article(e):
    """Build an Article from element *e*; missing meta fields are simply omitted."""
    title = get_title(e)
    body = get_body(e)
    medium, date, page = get_meta(e)
    section = get_section(e)
    article = Article(title=title, text=body, date=date)
    # Only set properties that are actually present.
    for name, value in (("page_num", page), ("section", section), ("medium", medium)):
        if value is not None:
            article.set_property(name, value)
    return article
def parse_item(self, item):
    """Parse one article, given as a list of html tags.

    <p>/<div> tags are collected as the body, <h2> becomes the headline and
    <i> tags are fed to parse_dateline.
    """
    article = Article(metastring={})
    for tag in item:
        if tag.tag in ("p", "div"):
            # BUG FIX: the old condition `not (hasattr(article, 'text') or
            # article.text)` could never append (it either raised or always
            # reset the list), so only the last body tag survived.
            if getattr(article, 'text', None):
                article.text.append(tag)
            else:
                article.text = [tag]
        elif tag.tag == "h2":
            article.headline = tag.text
        elif tag.tag == "i":
            article = self.parse_dateline(tag.text_content(), article)
    # process html
    article.text = "\n".join(
        [html2text(html.tostring(bit)) for bit in article.text])
    return article
def _scrape_unit(self, _file):
    """unit: a pdf document"""
    parser = PDFParser()
    doc = parser.load_document(_file, self.options['pdf_password'])
    # Join each page's text lines; pages are separated by a blank line.
    page_texts = []
    for page in parser.process_document(doc):
        lines = (line.get_text() + "\n" for line in parser.get_textlines(page))
        page_texts.append("".join(lines) + "\n\n")
    article = Article(text="".join(page_texts))
    article.headline = self.getheadline(_file)
    article.medium = self.options['medium']
    article.section = self.options['section']
    # Fall back to today when no date option was given.
    article.date = self.options['date'] or date.today()
    yield article
def parse_document(self, row):
    """Build an Article from a CSV row; unmapped columns go into the metastring."""
    kargs = dict(medium=self._medium, metastring={})
    mapped = [(fieldname, self.options[fieldname])
              for fieldname in FIELDS if self.options[fieldname]]
    for fieldname, csvfield in mapped:
        val = row[csvfield]
        if fieldname == 'date' and isinstance(val, datetime.datetime):
            pass  # no need to parse
        elif val.strip():
            if fieldname in PARSERS:
                val = PARSERS[fieldname](val)
        elif is_nullable(fieldname):
            # Empty value for a nullable field becomes NULL.
            val = None
        else:
            val = val.strip()
        kargs[fieldname] = val

    # Any column not mapped to an article field is kept as metadata.
    mapped_columns = [column for _, column in mapped]
    for key, value in row.items():
        if key not in mapped_columns:
            kargs["metastring"][key] = value
    kargs["metastring"] = json.dumps(kargs["metastring"])

    # In case medium wasn't defined in csv
    if self._medium is not None:
        kargs["medium"] = self._medium

    if self.parent_field:
        doc_id = kargs.get(self.id_field)
        parent_id = kargs.pop(self.parent_field)
        if parent_id:
            self.parents[doc_id] = parent_id

    article = Article(**kargs)
    if self.parent_field:
        self.articles[doc_id] = article
    return article
def scrape_3(self, _html):
    """Some ugly MS Word format, as of 2014-03-03"""
    # Partition the page into articles; an <hr> marks each boundary.
    articles = []
    part = []
    for tag in _html.cssselect("body > div > *"):
        if tag.cssselect("hr"):
            articles.append(part)
            part = []
        else:
            part.append(tag)
    # The first partition is page furniture, not an article.
    for tags in articles[1:]:
        article = Article()
        # NOTE(review): both the dateline and the headline read tags[1];
        # preserved as-is, but confirm against the actual document format.
        dateline = tags[1].text_content().strip()
        article = self.parse_dateline(dateline, article)
        article.headline = tags[1].text_content().strip()
        html_str = "".join(html.tostring(t) for t in tags[2:])
        article.text = html2text(html_str)
        article.metastring = {'html': html_str}
        yield article
def parse_file(self, file):
    """Build one Article from *file*, mapping each field per the field_map options."""
    dirname, filename = os.path.split(file.name)
    filename, ext = os.path.splitext(filename)

    def parse_field(file, type, value):
        # 'literal' fields are taken verbatim; others derive from the file.
        if type == 'literal':
            return value
        if value == 'filename':
            return filename
        if value == 'text':
            return file.read()
        if value.startswith('filename-'):
            # filename-n is 1 based index into the '_'-separated name parts
            index = int(value.rsplit("-", 1)[-1])
            return filename.split("_")[index - 1]
        raise ValueError("Can't parse field {value}".format(**locals()))

    fields = {field: parse_field(file, **setting)
              for (field, setting) in self.options['field_map'].items()}
    return [Article(**fields)]
def parse_document(self, tupleText):
    """Parse one (meta, body) text tuple into an Article."""
    meta, body = tupleText
    meta_lines = meta.strip().split('\n')

    kargs = {}
    # First meta line: "<externalid>. <headline>"
    kargs['externalid'] = int(meta_lines[0].split('.')[0].lstrip('?'))
    kargs['headline'] = meta_lines[0].partition('. ')[2]

    # Third meta line: "<medium>, <date>, p.<pagenr>, <length>w."
    medium_name, date, pagenr, length = meta_lines[2].split(', ')
    kargs['medium'] = get_or_create_medium(medium_name)
    kargs['date'] = readDate(date)
    kargs['pagenr'] = int(pagenr.strip('p.'))
    kargs['length'] = int(length.strip('w.'))

    body_lines = body.split('\n')
    kargs['section'] = body_lines[2]
    kargs['text'] = '\n'.join(body_lines[5:])

    kargs['project'] = self.options['project']
    return Article(**kargs)
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs):
    """Create a test article"""
    from amcat.models.article import Article

    # BUG FIX: `basestring` is Python 2 only (NameError on Python 3,
    # which the rest of this file targets); use str.
    if "date" in kargs and isinstance(kargs["date"], str):
        kargs["date"] = read_date(kargs["date"])
    # Fill in defaults; factories are only invoked when the key is missing.
    if "project" not in kargs:
        kargs["project"] = create_test_project()
    if "date" not in kargs:
        kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs:
        kargs["medium"] = create_test_medium()
    if "id" not in kargs:
        kargs["id"] = _get_next_id()
    if 'headline' not in kargs:
        kargs['headline'] = 'test headline'
    if 'text' not in kargs:
        kargs["text"] = "\n\n".join(map(str, range(5)))
    a = Article(**kargs)
    if create:
        Article.create_articles([a], articleset,
                                check_duplicate=check_duplicate, create_id=True)
    return a
def parse_file(self, file, encoding, _data):
    """Build one Article from *file*, mapping each field per the field_map options."""
    path, filename = os.path.split(file)
    filename, ext = os.path.splitext(filename)

    def parse_field(file, type, value):
        # 'literal' fields are taken verbatim; others derive from the file.
        if type == 'literal':
            return value
        if value == 'Filename':
            return filename
        if value == 'Text':
            return _read(file, encoding)
        if value == 'Path':
            return path
        if value.startswith('Filename part '):
            # 'Filename part n' is a 1-based index into '_'-separated parts
            n = int(value.replace("Filename part ", ""))
            return filename.split("_")[n - 1]
        raise ValueError("Can't parse field {value}".format(**locals()))

    fields = {field: parse_field(file, **setting)
              for (field, setting) in self.options['field_map'].items()}
    return [Article(**fields)]
def _scrape_unit(self, _file):
    """Parse one forwarded-mail press digest file and yield its Article.

    The file has a mail header, a metadata line ("date, section, medium"),
    an all-caps headline, body paragraphs, and quoted-printable-style
    '=XX' escapes for non-ASCII characters.
    """
    readlines = _file.readlines()
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = read_date(file_date_line.split("Date:")[1])

    # Split the file into the mail header and the content (after "1red").
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):
            # actual content starts
            lines.append("")

    article = Article(metastring={'mail_header': "".join(mail_header)})

    while True:
        # loop through lines up to and including the headline
        line = lines.pop(0)
        if line.isupper():
            article.title = line
            break
        elif line:
            # first non-empty line, contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Two-digit year written as 'YY; expand to 20YY.
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]
                if "=" in datestr:
                    # year is not parsable: take the year the mail was
                    # sent (might fail around december)
                    datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = read_date(datestr)
                if (article.date - file_date).days > 200:
                    # likely a misparse, with the mail being sent the next
                    # year. BUG FIX: timedelta has no `years` argument
                    # (TypeError); roll the year back explicitly.
                    article.date = article.date.replace(year=article.date.year - 1)
            else:
                article.date = read_date(datestr)
            # BUG FIX: the alias table was probed with data[2] but indexed
            # with data[1], raising KeyError for any unaliased data[1].
            if data[2] in BZK_ALIASES:
                medium_str = BZK_ALIASES[data[2]]
            else:
                medium_str = data[2]
            article.set_property("medium", medium_str)
            article.set_property("section", data[1])

    # Collect paragraphs; blank lines separate them, all-caps lines are
    # subheaders kept inside the paragraph.
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():
            # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]

    article.text = ""
    for p in paragraphs:
        article.text += p + "\n\n"
        if p.startswith("(") and len(p.split(",")) > 1:
            # last line of normal content
            break

    # Add non-ascii characters: turn '=AB' escapes into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        # BUG FIX: str.decode('string-escape') is Python 2 only; decode the
        # hex code as a latin-1 byte instead.
        char = bytes.fromhex(code).decode('latin-1')
        if code == "92":
            return "'"
        elif code == "85":
            return "..."
        return char

    article.text = re.sub("=[A-Z0-9]{2}", character, article.text)
    yield article
def _scrape_unit(self, unit):
    """Yield a test article for *unit*; unit == 1 yields an error article."""
    if unit == 1:
        yield _ErrorArticle()
        return
    yield Article(headline=str(unit), date=date.today())
def _scrape_unit(self, unit):
    """Yield a single test article whose headline is *unit* as a string."""
    headline = str(unit)
    yield Article(headline=headline, date=date.today())
def parse_document(self, text):
    """Wrap raw *text* in an Article, copying the known metadata options."""
    wanted = ("medium", "headline", "project", "date")
    metadata = {k: v for (k, v) in self.options.items() if k in wanted}
    return Article(text=text, **metadata)