def test_nqueries(self):
    """Check that exporting coding results needs at most 8 queries per run.

    Builds a codebook, a schema with a string, an int and a code field, and
    a job with seven articles, of which the first five are coded with the
    same values. The results script is then run three times (without the
    code field, with code ids, with code labels), each inside
    checkMaxQueries(8).
    """
    from amcat.tools import amcatlogging
    amcatlogging.setup()

    codebook, codes = amcattest.create_test_codebook_with_codes()
    schema, codebook, strf, intf, codef = amcattest.create_test_schema_with_fields(
        codebook=codebook)
    job = amcattest.create_test_job(unitschema=schema, articleschema=schema, narticles=7)
    articles = list(job.articleset.articles.all())
    log.info(codes)

    # Code the first five articles; the remaining two stay uncoded.
    for index in range(5):
        coding = amcattest.create_test_coding(codingjob=job, article=articles[index])
        coding.update_values({strf: "bla", intf: 1, codef: codes["A1b"].id})

    codingjobs = list(CodingJob.objects.filter(pk__in=[job.id]))
    # The value itself is not used afterwards; the lookup is kept because it
    # runs before the measured sections.
    # NOTE(review): confirm whether this is still needed.
    first_coding = list(codingjobs[0].codings)[0]

    amcatlogging.debug_module('django.db.backends')

    field_selections = [
        {strf: {}, intf: {}},
        {strf: {}, intf: {}, codef: dict(ids=True)},
        {strf: {}, intf: {}, codef: dict(labels=True)},
    ]
    for selection in field_selections:
        script = self._get_results_script([job], selection)
        # Only running the script and parsing its CSV output is measured.
        with self.checkMaxQueries(8):
            list(csv.reader(StringIO(script.run())))
def run_cli(cls=None, handle_output=None, get_script_depth=2):
    """Handle command line interface invocation of this script.

    @param cls: the script class to run; if None it is looked up with
                get_script(get_script_depth)
    @param handle_output: whether to pass the result through handleOutput;
                if None, output is handled exactly when cls.output_type is
                not None
    @param get_script_depth: passed to get_script when cls is None
    @return: the result of instance.run(...), after handleOutput if enabled

    Standard input is passed to the script according to cls.input_type:
    the stream itself for `file`, its contents for `str`, and its decoded
    contents for `unicode`.
    """
    amcatlogging.setup()

    # Keep this call directly in run_cli: get_script receives a depth
    # argument, so it must not be moved into a helper.
    if cls is None:
        cls = get_script(get_script_depth)
    if handle_output is None:
        handle_output = cls.output_type is not None

    parser = argument_parser_from_script(cls)
    options = vars(parser.parse_args())

    # "verbose" is removed from the options before they reach the script.
    if options.pop("verbose", None):
        amcatlogging.debug_module(cls.__module__)

    instance = cls(options)

    script_input = None
    if cls.input_type in (file, str, unicode):
        script_input = sys.stdin
    if cls.input_type in (str, unicode):
        script_input = script_input.read()
    if cls.input_type == unicode:
        encoding = chardet.detect(script_input)["encoding"]
        if encoding is None:
            # chardet reports None when it cannot guess (e.g. empty input);
            # decoding with None would raise a TypeError.
            encoding = "utf-8"
        log.info("Using encoding {encoding}".format(encoding=encoding))
        script_input = script_input.decode(encoding)

    out = instance.run(script_input)
    if handle_output:
        out = handleOutput(out, instance.output_type)
    return out
def test_nqueries(self):
    """Check that exporting coding results needs at most 5 queries per run.

    Builds a codebook, a schema with a string, an int and a code field, and
    a job with seven articles, of which the first five are coded with the
    same values. The results script is then run three times (without the
    code field, with code ids, with code labels), each inside
    checkMaxQueries(5).
    """
    from amcat.tools import amcatlogging
    amcatlogging.setup()

    codebook, codes = amcattest.create_test_codebook_with_codes()
    schema, codebook, strf, intf, codef = amcattest.create_test_schema_with_fields(
        codebook=codebook)
    job = amcattest.create_test_job(unitschema=schema, articleschema=schema, narticles=7)
    articles = list(job.articleset.articles.all())

    # Code the first five articles; the remaining two stay uncoded.
    for index in range(5):
        coding = amcattest.create_test_coding(codingjob=job, article=articles[index])
        coding.update_values({strf: "bla", intf: 1, codef: codes["A1b"]})

    codingjobs = list(CodingJob.objects.filter(pk__in=[job.id]))
    # The value itself is not used afterwards; the lookup is kept because it
    # runs before the measured sections.
    # NOTE(review): confirm whether this is still needed.
    first_coding = codingjobs[0].codings.all()[0]

    amcatlogging.debug_module('django.db.backends')

    field_selections = [
        {strf: {}, intf: {}},
        {strf: {}, intf: {}, codef: dict(ids=True)},
        {strf: {}, intf: {}, codef: dict(labels=True)},
    ]
    for selection in field_selections:
        script = self._get_results_script([job], selection)
        # Only running the script and parsing its CSV output is measured.
        with self.checkMaxQueries(5):
            list(csv.reader(StringIO(script.run())))
def run_cli(cls=None, handle_output=None, get_script_depth=2):
    """Handle command line interface invocation of this script.

    @param cls: the script class to run; if None it is looked up with
                get_script(get_script_depth)
    @param handle_output: accepted for backwards compatibility; this
                function returns the raw result of run() and does not use it
    @param get_script_depth: passed to get_script when cls is None
    @return: the result of instance.run(...)

    Standard input is passed to the script according to cls.input_type:
    the stream itself for `file`, its contents for `str`, and its decoded
    contents for `unicode`. The parsed "verbose" flag is removed from the
    options and stored on the instance as `instance.verbose`.
    """
    # NOTE: logging is not set up here; the call below is disabled.
    #amcatlogging.setup()

    # Keep this call directly in run_cli: get_script receives a depth
    # argument, so it must not be moved into a helper.
    if cls is None:
        cls = get_script(get_script_depth)

    parser = argument_parser_from_script(cls)
    options = vars(parser.parse_args())

    verbose = options.pop("verbose", None)
    if verbose:
        amcatlogging.debug_module(cls.__module__)

    instance = cls(options)
    instance.verbose = verbose

    script_input = None
    if cls.input_type in (file, str, unicode):
        script_input = sys.stdin
    if cls.input_type in (str, unicode):
        script_input = script_input.read()
    if cls.input_type == unicode:
        encoding = chardet.detect(script_input)["encoding"]
        if encoding is None:
            # chardet reports None when it cannot guess (e.g. empty input);
            # decoding with None would raise a TypeError.
            encoding = "utf-8"
        log.info("Using encoding {encoding}".format(encoding=encoding))
        script_input = script_input.decode(encoding)

    return instance.run(script_input)
def _setup_logging(self):
    """
    Configure logging for this object.

    Installs a file handler on the path given by _get_filename(".log"),
    calls amcatlogging.info_module(), and enables debug logging for
    'amcat.tools.amcatsolr'.
    """
    amcatlogging.setFileHandler(self._get_filename(".log"))
    amcatlogging.info_module()
    amcatlogging.debug_module('amcat.tools.amcatsolr')
def run(self, _input):
    """Serialize the project's codebooks into a zip archive.

    Reads the 'project' and 'outputfile' options. If no output file is
    given, the archive is written to an in-memory StringIO buffer.

    @param _input: not used
    @return: outfile.getvalue() if the output object has that method
             (as the in-memory buffer does), otherwise the output file
             option itself
    """
    self.project = self.options['project']
    outfile = self.options['outputfile']
    if not outfile:
        outfile = StringIO()
    self.zipfile = ZipFile(outfile, 'w')

    from amcat.tools import amcatlogging
    amcatlogging.debug_module("django.db.backends")

    try:
        # Only codebooks are exported for now; the other steps are disabled.
        #self.serialize_project_meta()
        #self.serialize_articles()
        #self.serialize_coding_schemas()
        self.serialize_codebooks()
    finally:
        # The archive's central directory is only written on close(), so the
        # zip must be closed before its contents are read or returned.
        # close() is a no-op if the archive was already closed.
        self.zipfile.close()

    try:
        return outfile.getvalue()
    except AttributeError:
        # Not an in-memory buffer (e.g. a file name): hand it back as is.
        return outfile
yield urljoin(INDEX_URL, href) def _get_units(self): for url in self.get_categories(): doc = self.getdoc(url) for item in doc.cssselect("item"): date = toolkit.readDate(item.cssselect("pubdate")[0].text) if date.date() != self.options['date']: continue link = item.cssselect("link")[0] doc = HTMLDocument( url=urljoin(INDEX_URL, html.tostring(link).lstrip("<link>")), date = date, headline = item.cssselect("title")[0].text ) yield doc def _scrape_unit(self, doc): doc.prepare(self) doc.props.text = doc.doc.cssselect("div.article-body") doc.props.html = html.tostring(doc.doc) yield doc if __name__ == '__main__': from amcat.scripts.tools import cli from amcat.tools import amcatlogging amcatlogging.debug_module("amcat.scraping") cli.run_cli(MetroScraper)
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

# The tubantia scraper module is tried under three package names in turn:
# "scrapers", then "scraping", then "amcatscraping".
try:
    from scrapers.newspapers import tubantia
except ImportError:
    try:
        from scraping.newspapers import tubantia
    except ImportError:
        from amcatscraping.newspapers import tubantia


# Scraper for De Gelderlander: the Tubantia scraper with a different
# medium name and paper code.
class GelderlanderScraper(tubantia.TubantiaScraper):
    medium_name = "De Gelderlander"
    paper = "dg"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    # Enable debug logging for the scraping modules before running.
    for module_name in ("amcat.scraping.scraper", "amcat.scraping.document"):
        amcatlogging.debug_module(module_name)
    cli.run_cli(GelderlanderScraper)
)) if not day_url.startswith(INDEX_URL): continue doc = self.getdoc(day_url) for article in doc.cssselect("div.lbox500 h2 a"): url = urljoin(day_url, article.get("href")) if '/video/' in url: continue yield HTMLDocument( url = urljoin(day_url, article.get("href")), headline = article.text, date = self.options['date'] ) def _scrape_unit(self, doc): doc.prepare(self) if doc.doc.cssselect("div.lbox440"): doc.props.text = doc.doc.cssselect("div.lbox440")[0].cssselect('p') else: doc.props.text = "" yield doc if __name__ == '__main__': from amcat.scripts.tools import cli from amcat.tools import amcatlogging amcatlogging.debug_module("amcat.scraping.scraper") amcatlogging.debug_module("amcat.scraping.document") cli.run_cli(DePersScraper)
export_sql = ("SELECT {self.dest_project.id} AS projectid, {fields} FROM articles a" " WHERE article_id IN ({article_ids})").format(**locals()) export_sql = "COPY ({export_sql}) TO STDOUT WITH BINARY".format(**locals()) import_sql = "COPY articles (project_id, {fields}) FROM STDIN WITH BINARY".format(**locals()) dest_host = "-h {self.dest_host}".format(**locals()) if self.dest_host else "" source_host = "-h {self.source_host}".format(**locals()) if self.source_host else "" cmd = ('psql {source_host} {self.source_db} -c "{export_sql}" ' '| psql {dest_host} {self.dest_db} -c "{import_sql}"').format(**locals()) log.debug("Copying {n} articles...".format(n=len(aids))) #log.debug(cmd) subprocess.check_output(cmd, shell=True) log.debug("... Done!") def _add_to_set(self, uuids): log.debug("Adding {n} articles to set using uuids...".format(n=len(uuids))) aids = [aid for (aid,) in Article.objects.filter(uuid__in=uuids).values_list("id")] if len(aids) != len(uuids): raise Exception("|aids| != |uuids|, something went wrong importing...") self.dest_set.add_articles(aids) log.debug("... Done!") if __name__ == '__main__': from amcat.scripts.tools import cli from amcat.tools import amcatlogging amcatlogging.debug_module() cli.run_cli()
""" Split the file into one or more fragments representing individual documents. Default implementation returns a single fragment containing the unicode text. @type file: file like object @return: a sequence of objects (e.g. strings) to pass to parse_documents """ return [file] ########################################################################### # U N I T T E S T S # ########################################################################### from amcat.tools import amcattest from amcat.tools import amcatlogging amcatlogging.debug_module("amcat.scripts.article_upload.upload") class TestUpload(amcattest.AmCATTestCase): def todo_test_zip_file(self): from tempfile import NamedTemporaryFile from django.core.files import File # does _get_units perform normally with NamedTemporaryFile(prefix=u"upload_test", suffix=".txt") as f: f.write("Test") f.flush() s = UploadScript(project=amcattest.create_test_project().id, file=File(f)) self.assertEqual({u.name for u in s._get_units()}, {f.name}) # does a zip file work?
from amcat.scripts.script import Script
from amcat.models.scraper import Scraper
from amcat.scraping.scraper import DBScraperForm
from amcat.scraping.controller import RobustController


class RunScraperForm(forms.Form):
    """Options for running a scraper: which scraper, and for which date."""
    scraper = forms.ModelChoiceField(queryset=Scraper.objects.all())
    date = forms.CharField()


class AddProject(Script):
    """Run a scraper from the database for the given date.

    NOTE(review): the class name (and the previous docstring, "Add a
    project to the database.") look like leftovers from a copied script;
    the name is kept so that existing references keep working.
    """
    options_form = RunScraperForm
    output_type = None

    def run(self, _input=None):
        """Instantiate the selected scraper for the date and scrape with it.

        @param _input: not used
        """
        scraper = self.options["scraper"].get_scraper(
            date=self.options["date"])
        controller = RobustController()
        controller.scrape(scraper)


if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.controller")
    from amcat.scripts.tools import cli
    # Name the script class explicitly instead of relying on run_cli's lookup.
    cli.run_cli(AddProject)
pagenum = int(a.get('href').split("/")[1]) sections[pagenum] = a.text return sections def _scrape_unit(self, url): article = HTMLDocument(url=url, section=self.section) article.prepare(self) article.props.date = date(*[int(n) for n in url.split("/")[5:8]]) article.props.pagenr = self.pagenum article.props.headline = article.doc.cssselect( "#article h1")[0].text_content() article.props.text = article.doc.cssselect("div.body")[0] dateline_pattern = re.compile("^([A-Z]+( [A-Z]+)?)$") b = article.props.text.cssselect("b") if b and dateline_pattern.search(b[0].text_content()): article.props.dateline = dateline_pattern.search( b[0].text_content()).group(1) if article.doc.cssselect("#article address"): article.props.author = article.doc.cssselect( "#article address")[0].text_content().lstrip("dor").strip() yield article if __name__ == '__main__': from amcat.scripts.tools import cli from amcat.tools import amcatlogging amcatlogging.debug_module("amcat.scraping") cli.run_cli(SpitsKrantScraper)
if fields is None: return try: a = body_to_article(*fields) a.project = self.options['project'] yield a except: log.error( "Error on processing fields: {fields}".format(**locals())) raise from amcat.tools import amcatlogging amcatlogging.debug_module() if __name__ == '__main__': from amcat.scripts.tools import cli cli.run_cli(handle_output=False) ########################################################################### # U N I T T E S T S # ########################################################################### from amcat.tools import amcattest import datetime class TestLexisNexis(amcattest.AmCATTestCase): def setUp(self):
convertors = [_convert_doc, _convert_docx] if convertors: text = _convert_multiple(file, convertors) else: text = file.text return Article(text=text, **metadata) def explain_error(self, error): """Explain the error in the context of unit for the end user""" name = getattr(error.unit, "name", error.unit) return "Error in file {name} : {error.error!r}".format(**locals()) if __name__ == '__main__': from amcat.tools import amcatlogging amcatlogging.debug_module("amcat.scripts.article_upload.upload") #amcatlogging.debug_module("amcat.scraping.scraper") from amcat.scripts.tools.cli import run_cli run_cli(handle_output=False) ########################################################################### # U N I T T E S T S # ########################################################################### from amcat.tools import amcattest from amcat.tools import amcatlogging amcatlogging.debug_module("amcat.scripts.article_upload.upload") class TestUploadText(amcattest.AmCATTestCase): def test_article(self): from django.core.files import File
def parse_document(self, text): fields = parse_article(text) if fields is None: return try: a = body_to_article(*fields) a.project = self.options['project'] yield a except: log.error("Error on processing fields: {fields}".format(**locals())) raise from amcat.tools import amcatlogging; amcatlogging.debug_module() if __name__ == '__main__': from amcat.scripts.tools import cli cli.run_cli(handle_output=False) ########################################################################### # U N I T T E S T S # ########################################################################### from amcat.tools import amcattest import datetime class TestLexisNexis(amcattest.AmCATTestCase):
if __name__ == '__main__': from amcat.tools import amcatlogging amcatlogging.setup() amcatlogging.info_module("amcat.contrib.corenlp") #from amcat.models import ArticleSet nlp = StanfordCoreNLP(corenlp_path="/home/amcat/resources/stanford-corenlp", models_version="2012-07-06") import sys if len(sys.argv) > 1: aids = map(int, sys.argv[1:]) delete_existing = True amcatlogging.debug_module("amcat.contrib.corenlp") else: aids = [int(aid) for aid in sys.stdin] #s = ArticleSet.objects.get(pk=22947) #aids = [aid for (aid,) in s.articles.values_list("id")] delete_existing = True log.info("Parsing %i articles, delete_existing=%s" % (len(aids), delete_existing)) for aid in aids: try: log.info("Parsing article %i" % aid) if AnalysisArticle.objects.filter(article_id=aid, analysis_id=STANFORD_ANALYSIS_ID).count(): if delete_existing: log.info("Deleting existing analysed article") aa = AnalysisArticle.objects.get(article_id=aid, analysis_id=STANFORD_ANALYSIS_ID) super(AnalysisArticle, aa).delete()
triples=json.dumps(triples)) def get_sentences(self, analysis_article_id): for s in self.api.get_objects("analysissentence", analysis_article=analysis_article_id, limit=9999): sent = self.api.get_object("sentence", s.sentence) yield (s.id, sent.sentence) #return [(int(s["id"]), s["sentence"]["sentence"]) for s in if __name__ == '__main__': from amcat.tools import amcatlogging amcatlogging.setup() amcatlogging.debug_module("amcat.tools.rest") #amcatlogging.debug_module() import argparse parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('host', action='store', help="Host to get articles from" " (e.g. localhost:8000 or https://amcat.vu.nl)") parser.add_argument('analysis', action='store', help="Analysis ID to parse") parser.add_argument("narticles", action='store', help="Number of articles to parse")
import logging; log = logging.getLogger(__name__)

from django import forms

from amcat.scripts.script import Script
from amcat.models.scraper import Scraper
from amcat.scraping.scraper import DBScraperForm
from amcat.scraping.controller import RobustController


class RunScraperForm(forms.Form):
    """Options for running a scraper: which scraper, and for which date."""
    scraper = forms.ModelChoiceField(queryset=Scraper.objects.all())
    date = forms.CharField()


class AddProject(Script):
    """Run a scraper from the database for the given date.

    NOTE(review): the class name (and the previous docstring, "Add a
    project to the database.") look like leftovers from a copied script;
    the name is kept so that existing references keep working.
    """
    options_form = RunScraperForm
    output_type = None

    def run(self, _input=None):
        """Instantiate the selected scraper for the date and scrape with it.

        @param _input: not used
        """
        scraper = self.options["scraper"].get_scraper(date=self.options["date"])
        controller = RobustController()
        controller.scrape(scraper)


if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.controller")
    from amcat.scripts.tools import cli
    # Name the script class explicitly instead of relying on run_cli's lookup.
    cli.run_cli(AddProject)