def test_nqueries(self):
        # Runs the results export for one coding job with three different field
        # selections, each inside self.checkMaxQueries(8).
        from amcat.tools import amcatlogging
        amcatlogging.setup()

        codebook, codes = amcattest.create_test_codebook_with_codes()
        schema, codebook, strf, intf, codef = amcattest.create_test_schema_with_fields(codebook=codebook)
        job = amcattest.create_test_job(unitschema=schema, articleschema=schema, narticles=7)
        articles = list(job.articleset.articles.all())

        log.info(codes)
        # Give the first five articles a coding with identical values
        for index in range(5):
            coding = amcattest.create_test_coding(codingjob=job, article=articles[index])
            coding.update_values({strf: "bla", intf: 1, codef: codes["A1b"].id})

        codingjobs = list(CodingJob.objects.filter(pk__in=[job.id]))
        # Value is not used afterwards; evaluated before the query counting starts
        first_coding = list(codingjobs[0].codings)[0]
        amcatlogging.debug_module('django.db.backends')

        field_selections = (
            {strf: {}, intf: {}},
            {strf: {}, intf: {}, codef: dict(ids=True)},
            {strf: {}, intf: {}, codef: dict(labels=True)},
        )
        for selection in field_selections:
            script = self._get_results_script([job], selection)
            with self.checkMaxQueries(8):
                list(csv.reader(StringIO(script.run())))
Example #2
0
File: cli.py Project: edisona/amcat
def run_cli(cls=None, handle_output=None, get_script_depth=2):
    """
    Handle command line interface invocation of this script.

    Builds an argument parser from the script class, instantiates the script
    with the parsed options, reads standard input according to
    cls.input_type, runs the script and optionally post-processes the output.

    @param cls: the script class to run; if None, it is looked up with
                get_script(get_script_depth)
    @param handle_output: whether to pass the result through handleOutput;
                if None, output is handled iff cls.output_type is not None
    @param get_script_depth: passed to get_script when cls is None
    @return: the value returned by the script's run method, passed through
             handleOutput if handle_output is true
    """
    amcatlogging.setup()

    if cls is None:
        cls = get_script(get_script_depth)

    if handle_output is None:
        handle_output = cls.output_type is not None

    parser = argument_parser_from_script(cls)
    args = parser.parse_args()
    options = args.__dict__

    # 'verbose' is consumed here and is not passed on to the script
    if options.pop("verbose", None):
        amcatlogging.debug_module(cls.__module__)

    instance = cls(options)

    # Named script_input rather than input to avoid shadowing the builtin
    script_input = None
    if cls.input_type in (file, str, unicode):
        script_input = sys.stdin
    if cls.input_type in (str, unicode):
        script_input = script_input.read()
    if cls.input_type == unicode:
        # chardet reports an encoding of None when it cannot guess (e.g. on
        # empty input); decode(None) would raise a TypeError, so fall back
        # to UTF-8 in that case
        encoding = chardet.detect(script_input)["encoding"] or "utf-8"
        log.info("Using encoding {encoding}".format(encoding=encoding))
        script_input = script_input.decode(encoding)

    out = instance.run(script_input)

    if handle_output:
        out = handleOutput(out, instance.output_type)
    return out
    def test_nqueries(self):
        # Runs the results export for one coding job with three different field
        # selections, each inside self.checkMaxQueries(5).
        from amcat.tools import amcatlogging
        amcatlogging.setup()

        codebook, codes = amcattest.create_test_codebook_with_codes()
        schema, codebook, strf, intf, codef = amcattest.create_test_schema_with_fields(codebook=codebook)
        job = amcattest.create_test_job(unitschema=schema, articleschema=schema, narticles=7)
        articles = list(job.articleset.articles.all())

        # Give the first five articles a coding with identical values
        for index in range(5):
            coding = amcattest.create_test_coding(codingjob=job, article=articles[index])
            coding.update_values({strf: "bla", intf: 1, codef: codes["A1b"]})

        codingjobs = list(CodingJob.objects.filter(pk__in=[job.id]))
        # Value is not used afterwards; evaluated before the query counting starts
        first_coding = codingjobs[0].codings.all()[0]
        amcatlogging.debug_module('django.db.backends')

        field_selections = (
            {strf: {}, intf: {}},
            {strf: {}, intf: {}, codef: dict(ids=True)},
            {strf: {}, intf: {}, codef: dict(labels=True)},
        )
        for selection in field_selections:
            script = self._get_results_script([job], selection)
            with self.checkMaxQueries(5):
                list(csv.reader(StringIO(script.run())))
def run_cli(cls=None, handle_output=None, get_script_depth=2):
    """
    Handle command line interface invocation of this script.

    Builds an argument parser from the script class, instantiates the script
    with the parsed options, reads standard input according to
    cls.input_type and runs the script.

    @param cls: the script class to run; if None, it is looked up with
                get_script(get_script_depth)
    @param handle_output: accepted for backward compatibility; its value is
                computed below but not used by this function
    @param get_script_depth: passed to get_script when cls is None
    @return: the value returned by the script's run method
    """
    #amcatlogging.setup()

    if cls is None:
        cls = get_script(get_script_depth)

    if handle_output is None:
        # Not used further in this function
        handle_output = cls.output_type is not None

    parser = argument_parser_from_script(cls)
    args = parser.parse_args()
    options = args.__dict__

    # 'verbose' is removed from the options and set as an attribute instead
    verbose = options.pop("verbose", None)
    if verbose:
        amcatlogging.debug_module(cls.__module__)

    instance = cls(options)
    instance.verbose = verbose

    # Named script_input rather than input to avoid shadowing the builtin
    script_input = None
    if cls.input_type in (file, str, unicode):
        script_input = sys.stdin
    if cls.input_type in (str, unicode):
        script_input = script_input.read()
    if cls.input_type == unicode:
        # chardet reports an encoding of None when it cannot guess (e.g. on
        # empty input); decode(None) would raise a TypeError, so fall back
        # to UTF-8 in that case
        encoding = chardet.detect(script_input)["encoding"] or "utf-8"
        log.info("Using encoding {encoding}".format(encoding=encoding))
        script_input = script_input.decode(encoding)

    return instance.run(script_input)
Example #5
0
 def _setup_logging(self):
     """
     Configure the logging facility for this object.

     Installs a file handler on the file named by _get_filename(".log"),
     then calls amcatlogging.info_module() and
     amcatlogging.debug_module for 'amcat.tools.amcatsolr'.
     """
     log_filename = self._get_filename(".log")
     amcatlogging.setFileHandler(log_filename)
     amcatlogging.info_module()
     amcatlogging.debug_module('amcat.tools.amcatsolr')
Example #6
0
    def run(self, _input):
        """
        Serialize the codebooks into a zip archive.

        The archive is written to the 'outputfile' option, or to an in-memory
        buffer if that option is empty.

        @param _input: not used
        @return: the archive contents if the output target has a getvalue
                 method (as the in-memory buffer does), otherwise the
                 'outputfile' option itself
        """
        self.project = self.options['project']

        outfile = self.options['outputfile']
        if not outfile:
            outfile = StringIO()
        self.zipfile = ZipFile(outfile, 'w')

        from amcat.tools import amcatlogging
        amcatlogging.debug_module("django.db.backends")

        try:
            # The other serialization steps are currently disabled
            #self.serialize_project_meta()
            #self.serialize_articles()
            #self.serialize_coding_schemas()
            self.serialize_codebooks()
        finally:
            # A zip archive is only complete once close() has written its
            # central directory, so close before handing out the contents.
            # This does not close a file object that was passed in.
            self.zipfile.close()

        try:
            return outfile.getvalue()
        except AttributeError:
            # outfile is not an in-memory buffer (e.g. a file name)
            return outfile
Example #7
0
                    yield urljoin(INDEX_URL, href)


    def _get_units(self):
        """
        Yield an HTMLDocument for every feed item dated on the requested day.

        For each url returned by get_categories(), the document is fetched
        and its <item> elements are inspected; items whose pubdate does not
        fall on self.options['date'] are skipped.
        """
        link_tag = "<link>"
        for url in self.get_categories():
            feed = self.getdoc(url)
            for item in feed.cssselect("item"):
                date = toolkit.readDate(item.cssselect("pubdate")[0].text)
                if date.date() != self.options['date']:
                    continue
                link = item.cssselect("link")[0]
                # Presumably the serialized element is the opening tag
                # followed by the href. Remove exactly that tag:
                # lstrip("<link>") strips a *set of characters* and would
                # also eat a leading 'l', 'i', 'n' or 'k' of the href.
                href = html.tostring(link)
                if href.startswith(link_tag):
                    href = href[len(link_tag):]
                yield HTMLDocument(
                    url=urljoin(INDEX_URL, href),
                    date=date,
                    headline=item.cssselect("title")[0].text
                    )

    def _scrape_unit(self, doc):
        """Prepare doc, set its text and html properties and yield it."""
        doc.prepare(self)
        page = doc.doc
        props = doc.props
        # text holds the elements matching div.article-body
        props.text = page.cssselect("div.article-body")
        props.html = html.tostring(page)
        yield doc

if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    # NOTE(review): presumably enables debug-level logging for amcat.scraping -- confirm
    amcatlogging.debug_module("amcat.scraping")
    # Run MetroScraper through the generic command line runner
    cli.run_cli(MetroScraper)

Example #8
0
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

try:
    from scrapers.newspapers import tubantia
except ImportError:
    try:
        from scraping.newspapers import tubantia
    except ImportError:
        from amcatscraping.newspapers import tubantia


class GelderlanderScraper(tubantia.TubantiaScraper):
    # Defines no behaviour of its own: only these two class attributes differ
    # from what is inherited from tubantia.TubantiaScraper.
    paper = "dg"
    medium_name = "De Gelderlander"


if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script

    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    # NOTE(review): presumably enables debug-level logging for these two modules -- confirm
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    # Run GelderlanderScraper through the generic command line runner
    cli.run_cli(GelderlanderScraper)
Example #9
0
            ))

            if not day_url.startswith(INDEX_URL): continue

            doc = self.getdoc(day_url)
            for article in doc.cssselect("div.lbox500 h2 a"):
                url = urljoin(day_url, article.get("href"))

                if '/video/' in url: continue

                yield HTMLDocument(
                    url = urljoin(day_url, article.get("href")),
                    headline = article.text,
                    date = self.options['date']
                )

    def _scrape_unit(self, doc):
        doc.prepare(self)
        if doc.doc.cssselect("div.lbox440"):
            doc.props.text = doc.doc.cssselect("div.lbox440")[0].cssselect('p')
        else:
            doc.props.text = ""
        yield doc

if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    # NOTE(review): presumably enables debug-level logging for these two modules -- confirm
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    # Run DePersScraper through the generic command line runner
    cli.run_cli(DePersScraper)
Example #10
0
        export_sql = ("SELECT {self.dest_project.id} AS projectid, {fields} FROM articles a"
                      " WHERE article_id IN ({article_ids})").format(**locals())
        export_sql = "COPY ({export_sql}) TO STDOUT WITH BINARY".format(**locals())

        import_sql = "COPY articles (project_id, {fields}) FROM STDIN WITH BINARY".format(**locals())

        dest_host = "-h {self.dest_host}".format(**locals()) if self.dest_host else ""
        source_host = "-h {self.source_host}".format(**locals()) if self.source_host else ""
        
        cmd = ('psql {source_host} {self.source_db} -c "{export_sql}" '
               '| psql {dest_host} {self.dest_db} -c "{import_sql}"').format(**locals())

        log.debug("Copying {n} articles...".format(n=len(aids)))
        #log.debug(cmd)
        subprocess.check_output(cmd, shell=True)
        log.debug("... Done!")

    def _add_to_set(self, uuids):
        """
        Add the articles with the given uuids to the destination set.

        @param uuids: the uuids of the articles to add
        @raise Exception: if the number of articles found differs from the
                          number of uuids given
        """
        log.debug("Adding {n} articles to set using uuids...".format(n=len(uuids)))
        matching = Article.objects.filter(uuid__in=uuids)
        aids = list(matching.values_list("id", flat=True))
        if len(aids) != len(uuids):
            raise Exception("|aids| != |uuids|, something went wrong importing...")
        self.dest_set.add_articles(aids)
        log.debug("... Done!")
        
if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    # NOTE(review): called without a module name; presumably this targets the
    # calling module -- confirm against amcatlogging.debug_module
    amcatlogging.debug_module()
    # No class is given, so run_cli looks the script up itself (see get_script in cli.run_cli)
    cli.run_cli()
Example #11
0
        """
        Split the file into one or more fragments representing individual documents.
        Default implementation returns a single fragment containing the unicode text.

        @type file: file like object
        @return: a sequence of objects (e.g. strings) to pass to parse_documents
        """
        return [file]

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
from amcat.tools import amcatlogging
# Module-level call: executed whenever this module is imported, not only when
# the tests below are run.
# NOTE(review): presumably enables debug-level logging for the upload module -- confirm
amcatlogging.debug_module("amcat.scripts.article_upload.upload")

class TestUpload(amcattest.AmCATTestCase):
    def todo_test_zip_file(self):
        from tempfile import NamedTemporaryFile
        from django.core.files import File
        # does _get_units perform normally
        with NamedTemporaryFile(prefix=u"upload_test", suffix=".txt") as f:
            f.write("Test")
            f.flush()
            s = UploadScript(project=amcattest.create_test_project().id,
                             file=File(f))
            self.assertEqual({u.name for u in s._get_units()}, {f.name})

        # does a zip file work?
Example #12
0
from amcat.scripts.script import Script
from amcat.models.scraper import Scraper
from amcat.scraping.scraper import DBScraperForm
from amcat.scraping.controller import RobustController


class RunScraperForm(forms.Form):
    """Options form selecting a scraper and the date to run it for."""
    # Choice among all Scraper objects
    scraper = forms.ModelChoiceField(queryset=Scraper.objects.all())
    # Entered as text; AddProject.run passes it on as the date= argument of get_scraper
    date = forms.CharField()


class AddProject(Script):
    """Run the selected scraper for the given date."""
    # NOTE: despite its name this script does not add a project; the class
    # name is kept unchanged so existing references keep working.

    options_form = RunScraperForm
    output_type = None

    def run(self, _input=None):
        """
        Create the scraper for the requested date and scrape it with a RobustController.

        @param _input: not used
        """
        options = self.options
        scraper = options["scraper"].get_scraper(date=options["date"])
        RobustController().scrape(scraper)


if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script
    from amcat.tools import amcatlogging
    # NOTE(review): presumably enables debug-level logging for the scraping controller -- confirm
    amcatlogging.debug_module("amcat.scraping.controller")
    from amcat.scripts.tools import cli
    # No class is given, so run_cli looks the script up itself
    cli.run_cli()
Example #13
0
            pagenum = int(a.get('href').split("/")[1])
            sections[pagenum] = a.text
        return sections

    def _scrape_unit(self, url):
        """
        Scrape the article page at url and yield it as an HTMLDocument.

        The date is built from the integer values of components 5-7 of the
        url split on "/"; the page number and section come from this scraper.
        A dateline and an author are set only if the page provides them.

        @param url: the url of the article page
        """
        article = HTMLDocument(url=url, section=self.section)
        article.prepare(self)
        article.props.date = date(*[int(n) for n in url.split("/")[5:8]])
        article.props.pagenr = self.pagenum
        article.props.headline = article.doc.cssselect(
            "#article h1")[0].text_content()
        article.props.text = article.doc.cssselect("div.body")[0]

        # The first <b> element is a dateline if it consists solely of one
        # or two upper case words
        bold = article.props.text.cssselect("b")
        if bold:
            dateline = re.search("^([A-Z]+( [A-Z]+)?)$", bold[0].text_content())
            if dateline:
                article.props.dateline = dateline.group(1)

        address = article.doc.cssselect("#article address")
        if address:
            # Remove a leading "door" (presumably the byline prefix) as a
            # whole word only. lstrip("dor") strips a *set of characters*,
            # which also truncates names starting with 'd', 'o' or 'r' and
            # does nothing when the text starts with whitespace.
            author = address[0].text_content().strip()
            article.props.author = re.sub(r"^door\b\s*", "", author)

        yield article


if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    # NOTE(review): presumably enables debug-level logging for amcat.scraping -- confirm
    amcatlogging.debug_module("amcat.scraping")
    # Run SpitsKrantScraper through the generic command line runner
    cli.run_cli(SpitsKrantScraper)
Example #14
0
        if fields is None:
            return

        try:
            a = body_to_article(*fields)
            a.project = self.options['project']
            yield a
        except:
            log.error(
                "Error on processing fields: {fields}".format(**locals()))
            raise


from amcat.tools import amcatlogging

# Module-level call: executed whenever this module is imported.
# NOTE(review): called without a module name; presumably this targets the
# calling module -- confirm against amcatlogging.debug_module
amcatlogging.debug_module()

if __name__ == '__main__':
    # Command line entry point
    from amcat.scripts.tools import cli
    # No class is given, so run_cli looks the script up itself; output
    # handling is explicitly switched off
    cli.run_cli(handle_output=False)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
import datetime


class TestLexisNexis(amcattest.AmCATTestCase):
    def setUp(self):
Example #15
0
            convertors = [_convert_doc, _convert_docx]

        if convertors:
            text = _convert_multiple(file, convertors)
        else:
            text = file.text
        return Article(text=text, **metadata)

    def explain_error(self, error):
        """Return a message for the end user describing error and the unit it occurred in"""
        unit = error.unit
        # Units without a name attribute are shown as they are
        name = getattr(unit, "name", unit)
        return "Error in file {name} : {error.error!r}".format(name=name, error=error)
    
if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script
    from amcat.tools import amcatlogging
    # NOTE(review): presumably enables debug-level logging for the upload module -- confirm
    amcatlogging.debug_module("amcat.scripts.article_upload.upload")
    #amcatlogging.debug_module("amcat.scraping.scraper")
    from amcat.scripts.tools.cli import run_cli
    # No class is given, so run_cli looks the script up itself; output
    # handling is explicitly switched off
    run_cli(handle_output=False)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
from amcat.tools import amcatlogging
# Module-level call: executed whenever this module is imported, not only when
# the tests below are run.
# NOTE(review): presumably enables debug-level logging for the upload module -- confirm
amcatlogging.debug_module("amcat.scripts.article_upload.upload")

class TestUploadText(amcattest.AmCATTestCase):
    def test_article(self):
        from django.core.files import File
Example #16
0
    
    def parse_document(self, text):
        """
        Parse the text of one document and yield the resulting article.

        Nothing is yielded if parse_article returns None for the text.

        @param text: the text of a single document
        @return: a generator yielding at most one article, whose project is
                 set from the 'project' option
        """
        fields = parse_article(text)

        if fields is None:
            return

        try:
            a = body_to_article(*fields)
            a.project = self.options['project']
        except Exception:
            log.error("Error on processing fields: {fields}".format(fields=fields))
            raise

        # Yield outside the try block: with a bare except around the yield,
        # closing the generator (GeneratorExit) or an exception thrown in by
        # the consumer was logged as a processing error.
        yield a

# Module-level call: executed whenever this module is imported.
# NOTE(review): debug_module is called without a module name; presumably this
# targets the calling module -- confirm against amcatlogging.debug_module
from amcat.tools import amcatlogging; amcatlogging.debug_module()
        
if __name__ == '__main__':
    # Command line entry point
    from amcat.scripts.tools import cli
    # No class is given, so run_cli looks the script up itself; output
    # handling is explicitly switched off
    cli.run_cli(handle_output=False)



###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
import datetime

class TestLexisNexis(amcattest.AmCATTestCase):
Example #17
0
    
    def parse_document(self, text):
        """
        Parse the text of one document and yield the resulting article.

        Nothing is yielded if parse_article returns None for the text.

        @param text: the text of a single document
        @return: a generator yielding at most one article, whose project is
                 set from the 'project' option
        """
        fields = parse_article(text)

        if fields is None:
            return

        try:
            a = body_to_article(*fields)
            a.project = self.options['project']
        except Exception:
            log.error("Error on processing fields: {fields}".format(fields=fields))
            raise

        # Yield outside the try block: with a bare except around the yield,
        # closing the generator (GeneratorExit) or an exception thrown in by
        # the consumer was logged as a processing error.
        yield a

# Module-level call: executed whenever this module is imported.
# NOTE(review): debug_module is called without a module name; presumably this
# targets the calling module -- confirm against amcatlogging.debug_module
from amcat.tools import amcatlogging; amcatlogging.debug_module()
        
if __name__ == '__main__':
    # Command line entry point
    from amcat.scripts.tools import cli
    # No class is given, so run_cli looks the script up itself; output
    # handling is explicitly switched off
    cli.run_cli(handle_output=False)



###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
import datetime

class TestLexisNexis(amcattest.AmCATTestCase):
Example #18
0

if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.setup()
    amcatlogging.info_module("amcat.contrib.corenlp")

    #from amcat.models import ArticleSet

    nlp = StanfordCoreNLP(corenlp_path="/home/amcat/resources/stanford-corenlp", models_version="2012-07-06")

    import sys
    if len(sys.argv) > 1:
	aids = map(int, sys.argv[1:])
	delete_existing = True
	amcatlogging.debug_module("amcat.contrib.corenlp")
    else:
        aids = [int(aid) for aid in sys.stdin]
	#s = ArticleSet.objects.get(pk=22947)
	#aids = [aid for (aid,) in s.articles.values_list("id")]
	delete_existing = True

    log.info("Parsing %i articles, delete_existing=%s" % (len(aids), delete_existing))
    for aid in aids:
	try:
	    log.info("Parsing article %i" % aid)
	    if AnalysisArticle.objects.filter(article_id=aid, analysis_id=STANFORD_ANALYSIS_ID).count():
		if delete_existing:
		    log.info("Deleting existing analysed article")
		    aa = AnalysisArticle.objects.get(article_id=aid, analysis_id=STANFORD_ANALYSIS_ID)
		    super(AnalysisArticle, aa).delete()
Example #19
0
                             triples=json.dumps(triples))

    def get_sentences(self, analysis_article_id):
        """
        Yield an (analysis sentence id, sentence text) pair for every
        analysis sentence of the given analysis article.

        The sentence text is retrieved with a separate get_object call per
        analysis sentence; at most 9999 analysis sentences are requested.
        """
        analysis_sentences = self.api.get_objects(
            "analysissentence",
            analysis_article=analysis_article_id,
            limit=9999)
        for analysis_sentence in analysis_sentences:
            sentence = self.api.get_object("sentence", analysis_sentence.sentence)
            yield (analysis_sentence.id, sentence.sentence)
        #return [(int(s["id"]), s["sentence"]["sentence"]) for s in


if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.setup()
    amcatlogging.debug_module("amcat.tools.rest")
    #amcatlogging.debug_module()

    import argparse
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('host',
                        action='store',
                        help="Host to get articles from"
                        " (e.g. localhost:8000 or https://amcat.vu.nl)")
    parser.add_argument('analysis',
                        action='store',
                        help="Analysis ID to parse")
    parser.add_argument("narticles",
                        action='store',
                        help="Number of articles to parse")
Example #20
0
import logging; log = logging.getLogger(__name__)

from django import forms

from amcat.scripts.script import Script
from amcat.models.scraper import Scraper
from amcat.scraping.scraper import DBScraperForm
from amcat.scraping.controller import RobustController

class RunScraperForm(forms.Form):
    """Options form selecting a scraper and the date to run it for."""
    # Choice among all Scraper objects
    scraper = forms.ModelChoiceField(queryset=Scraper.objects.all())
    # Entered as text; AddProject.run passes it on as the date= argument of get_scraper
    date = forms.CharField()

class AddProject(Script):
    """Run the selected scraper for the given date."""
    # NOTE: despite its name this script does not add a project; the class
    # name is kept unchanged so existing references keep working.

    options_form = RunScraperForm
    output_type = None

    def run(self, _input=None):
        """
        Create the scraper for the requested date and scrape it with a RobustController.

        @param _input: not used
        """
        options = self.options
        scraper = options["scraper"].get_scraper(date=options["date"])
        RobustController().scrape(scraper)

if __name__ == '__main__':
    # Command line entry point; these imports are only needed when run as a script
    from amcat.tools import amcatlogging
    # NOTE(review): presumably enables debug-level logging for the scraping controller -- confirm
    amcatlogging.debug_module("amcat.scraping.controller")
    from amcat.scripts.tools import cli
    # No class is given, so run_cli looks the script up itself
    cli.run_cli()