Code Example #1
File: oe24.py  Project: ToonAlfrink/amcatscraping
            datestr = article.doc.cssselect("#page,#Page article span.date")[0].text
            article.props.date = self.extract_date(datestr)
            article.props.headline = article.doc.cssselect("#page,#Page article h1.title")[0].text
            article.props.kicker = article.doc.cssselect("#page,#Page article h2.preTitle")[0].text
            article.props.byline = article.doc.cssselect("#page,#Page article p.leadText")[0].text
            article.props.text = [p for p in article.doc.cssselect("#page,#Page article div.bodyText > p")
                                  if p.text_content().strip()]
        if article.props.date.date() == self.options['date']:
            yield article

    german_months = ["Januar","Februar","März","April","Mai","Juni","Juli","August","September","Oktober","November","Dezember"]
        
    def extract_date(self, datestr):
        for m in self.german_months:
            if m.lower() in datestr.lower():
                month = self.german_months.index(m)
                break
        day = int(datestr.split(".")[0])
        year,time = datestr.lower().split(m.lower())[1].strip().split()
        return datetime(int(year), month+1, day,
                    int(time.split(":")[0]), int(time.split(":")[1]))


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(Oe24Scraper)
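
A standalone sketch of the German-date parsing done by extract_date above, assuming date strings of the form "24. März 2013 09:15" (the exact format used by the site is not shown in this excerpt):

from datetime import datetime

GERMAN_MONTHS = ["Januar", "Februar", "März", "April", "Mai", "Juni",
                 "Juli", "August", "September", "Oktober", "November", "Dezember"]

def extract_german_date(datestr):
    lowered = datestr.lower()
    for i, name in enumerate(GERMAN_MONTHS):
        if name.lower() in lowered:
            month = i + 1  # datetime months are 1-based
            break
    else:
        raise ValueError("no German month name in {!r}".format(datestr))
    day = int(datestr.split(".")[0])
    year, time = lowered.split(GERMAN_MONTHS[month - 1].lower())[1].strip().split()
    hour, minute = (int(part) for part in time.split(":"))
    return datetime(int(year), month, day, hour, minute)

# extract_german_date("24. März 2013 09:15") == datetime(2013, 3, 24, 9, 15)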


Code Example #2
def get_parents_from_columns(cols):
    """Assuming cols is an indented list of columns, yield (code, parent) pairs in the same order as cols"""
    parents = []
    for i in range(len(cols[0])):
        j = get_index(cols, i)
        code = cols[j][i]
        parents = parents[:j]
        parent = parents[-1] if parents else None
        parents.append(code)
        yield code, parent
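
# Usage sketch (not part of the original module). get_index is not shown in this
# excerpt; it is assumed here to return the column (indent level) that holds the
# value for the given row.
def get_index(cols, row):
    for j, col in enumerate(cols):
        if col[row]:
            return j
    raise ValueError("row {} is empty".format(row))

# For the indented list  animal > mammal > dog,  animal > bird:
#   cols = [["animal", "", "", ""], ["", "mammal", "", "bird"], ["", "", "dog", ""]]
#   list(get_parents_from_columns(cols)) ==
#       [("animal", None), ("mammal", "animal"), ("dog", "mammal"), ("bird", "animal")]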


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    result = cli.run_cli()
    #print result.output()

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest


def _run_test(bytes, **options):
    if 'project' not in options:
        options['project'] = amcattest.create_test_project().id
    if 'codebook_name' not in options: options['codebook_name'] = 'test'

    from tempfile import NamedTemporaryFile
Code Example #3
                print
                for a, _date in self.scrape_page(page_doc):
                    if _date.date() in self.dates:
                        yield a, _date
                    elif _date.date() < self.dates[0]:
                        br = True
                        break
                if br:
                    break
                try:
                    data_after = page_doc.cssselect("#nextPage")[0].get(
                        'data-after')
                except IndexError:
                    break
                for x in range(3):
                    try:
                        page_doc = json.loads(
                            self.open(self.page_url.format(**locals())).read())
                    except Exception as e:
                        print(self.page_url.format(**locals()))
                        print(e)
                    else:
                        break

                page_doc = html.fromstring(page_doc['content']['div#nextPage'])


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(Nieuws_nlArchiveScraper)
Code Example #4
                elif date.date() < self.options['date']:
                    return
            pagenr += 1

    def _scrape_unit(self, bits):
        date, url = bits
        article = HTMLDocument(date = date, url = url)
        article.prepare(self)
        content = article.doc.cssselect("#content")[0]
        article.props.section = content.cssselect("div.info-block p.meta a.label")[0].text
        article.props.headline = content.cssselect("div.title h1")[0].text
        article.props.externalid = url.split("-")[-1].strip("W/")
        article.props.text = content.cssselect("div.article")
        article.props.author = content.cssselect("p.meta span.user a.label")[0].text.strip()
        article.props.tags = set([a.text for a in content.cssselect("ul.taglist li a")])
        article.props.view_count = int(content.cssselect("div.info-block span.view-count")[0].text)
        yield article
        self.clearcookies()

    def clearcookies(self):
        """Clear cookies so the site won't interrupt us after 3 articles"""
        self.opener.cookiejar._cookies = {}

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(BoerderijScraper)


Code Example #5
File: RTL.py  Project: edisona/amcat.scraping
                article.props.author = tag.cssselect("span.author")[0].text.strip()
            elif tag.cssselect("div.videoContainer") or 'promo' in tag.get('class'):
                continue
            elif tag.cssselect("div.tagline h4"):
                self.stories.add(urljoin(url, tag.cssselect("h4 a")[0].get('href')))
                continue
            else:
                h = tag.cssselect("div.body h3")[0]
                article.props.type = "article"
                article.props.headline = h.text_content().strip()
                if h.cssselect("a"):
                    article.props.url = urljoin(url, h.cssselect("a")[0].get('href'))
                else:
                    article.props.url = url
            yield article

    def _scrape_unit(self, article):
        if article.props.type == "article":
            article.prepare(self)
            for div in article.doc.cssselect("div.rtldart"):
                div.drop_tree()
            article.props.text = article.doc.cssselect("article.news div.body div.paragraph")
        print(article)
        yield article

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(RTLScraper)

Code Example #6
                tweet.props.date = datetime.fromtimestamp(
                    float(div.cssselect("span.js-short-timestamp")[0].get("data-time"))
                )
                tweet.props.text = div.cssselect("p.ProfileTweet-text")[0]
                tweet.props.is_retweet = bool(div.get("data-retweeter"))
                if tweet.props.is_retweet:
                    tweet.props.original_author = div.get("data-name")
                tweet.props.url = url
                if "maxid" in locals() and div.get("data-tweet-id") == maxid:
                    # infinite loop
                    done = True
                    break
                maxid = div.get("data-tweet-id")
                if tweet.props.date.date() < self.options["date"]:
                    done = True
                    break
                elif tweet.props.date.date() == self.options["date"]:
                    yield tweet
            if not done:
                nexturl = url + "&max_id={}".format(maxid)
                data = json.loads(self.open(str(nexturl)).read())


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging

    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TwitterPoliticiScraper)
Code Example #7
File: raw_pdf.py  Project: BBie/amcat
            for line in parser.get_textlines(page):
                page_txt += line.get_text() + "\n"
            res += page_txt + "\n\n"
        article = Article(text = res)
        article.headline = self.getheadline(_file)
        article.medium = self.options['medium']
        article.section = self.options['section']
        if self.options['date']:
            article.date = self.options['date']
        else:
            article.date = date.today()
        yield article

    def getheadline(self, _file):
        hl = _file.name
        if hl.endswith(".pdf"): hl = hl[:-len(".pdf")]
        windows = hl.split("\\")
        other = hl.split("/")
        if len(windows) > len(other):
            #probably a windows path
            hl = windows[-1]
        else:
            hl = other[-1]
        return hl

if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(RawPDFScraper)
        
            
Code Example #8
File: daily.py  Project: kasperwelbers/amcat
            return False #not enough articles
        else:
            return True

from amcat.tools.amcatlogging import AmcatFormatter
import sys

def setup_logging():
    loggers = (logging.getLogger("amcat"), logging.getLogger("scrapers"),logging.getLogger(__name__))
    d = datetime.date.today()
    filename = "/home/amcat/log/daily_{d.year:04d}-{d.month:02d}-{d.day:02d}.txt".format(**locals())
    sys.stderr = open(filename, 'a')
    handlers = (logging.StreamHandler(sys.stdout),logging.FileHandler(filename))
    formatter = AmcatFormatter(date = True)

    for handler in handlers:
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)

    for logger in loggers:
        logger.propagate = False
        logger.setLevel(logging.INFO)
        for handler in handlers:        
            logger.addHandler(handler)
    logging.getLogger().handlers = []

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    setup_logging()
    cli.run_cli(DailyScript)
Code Example #9
File: mediargus.py  Project: christianbaden/amcat
        return zip(metas, bodies)

    def parse_document(self, tupleText):
        meta, body = tupleText
        meta = meta.strip()
        meta = meta.split('\n')
        kargs = {
            'externalid': int(meta[0].split('.')[0].lstrip('?')),
            'headline': meta[0].partition('. ')[2]
        }

        medium_name, date, pagenr, length = meta[2].split(', ')
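        # meta[2] is assumed to look like "<medium name>, <date>, p.3, 250 w."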
        kargs['medium'] = Medium.get_or_create(medium_name)
        kargs['date'] = readDate(date)
        kargs['pagenr'] = int(pagenr.strip('p.'))
        kargs['length'] = int(length.strip('w.'))

        body = body.split('\n')
        kargs['section'] = body[2]

        kargs['text'] = '\n'.join(body[5:])

        kargs['project'] = self.options['project']

        return Article(**kargs)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli(Mediargus, handle_output=False)
Code Example #10
            yield comment



        yield page

    def scrape_comments(self,page):
        p = page.props.url+"?page={}"
        if not page.doc.cssselect("ul.pager"):
            return
        total = int(page.doc.cssselect("ul.pager li.pager-last a")[0].get('href').split("page=")[-1].split("&")[0]) + 1
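        # the last pager link carries the highest page index; +1 because the ?page= parameter appears to start at 0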
        docs = [self.getdoc(p.format(x)) for x in range(total)]
        for doc in docs:
            for div in doc.cssselect("#comments div.comment"):
                comment = HTMLDocument()
                comment.props.text = div.cssselect("div.content")[0]
                comment.props.author = div.cssselect("span.submitted-username")[0].text_content()
                comment.props.date = readDate(div.cssselect("div.submitted div.floatr")[0].text_content())
                comment.parent = page
                yield comment




if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(WebTelegraafArchiveScraper)
Code Example #11
            kargs["medium"] = medium

        if self.parent_field:
            doc_id = kargs.get(self.id_field)
            parent_id = kargs.pop(self.parent_field)
            if parent_id:
                self.parents[doc_id] = parent_id

        article = Article(**kargs)
        if self.parent_field:
            self.articles[doc_id] = article

        return article

    def postprocess(self, articles):
        if self.parent_field:
            for doc_id, parent_id in self.parents.iteritems():
                doc = self.articles[doc_id]
                doc.parent = self.articles[parent_id]
                if not doc.addressee and self.options['addressee_from_parent']:
                    doc.addressee = doc.parent.author

                doc.save()
        super(CSV, self).postprocess(articles)


if __name__ == '__main__':
    from amcat.scripts.tools import cli

    cli.run_cli(CSV)
Code Example #12
File: fd.py  Project: ToonAlfrink/amcatscraping
            return
        for i, table in enumerate(article.doc.cssselect("table")):
            if table.get('class') == "body":
                table_after_body = article.doc.cssselect("table")[i + 1]
        page_date = re.search(
            "Pagina ([0-9]+), ([0-9]{2}\-[0-9]{2}\-[0-9]{4})",
            table_after_body.text_content())
        article.props.pagenr = page_date.group(1)
        article.props.date = readDate(page_date.group(2))
        article.props.section = self.current_section
        article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
        if article.doc.cssselect(".artsubheader"):
            article.props.byline = article.doc.cssselect(".artsubheader")[0]
        if article.doc.cssselect("td.artauthor"):
            article.props.author = article.doc.cssselect("td.artauthor")[0].text.split(":")[1].strip()
        dateline_match = re.search(
            "^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
            "\n".join([n.text_content() for n in article.props.text]).strip())
        if dateline_match:
            article.props.dateline = dateline_match.group(1)
                                          
        yield article

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(FDScraper)


Code Example #13
File: spits.py  Project: ToonAlfrink/amcatscraping
                break
        for a in html.cssselect("a"):
            pagenum = int(a.get("href").split("/")[1])
            sections[pagenum] = a.text
        return sections

    def _scrape_unit(self, url):
        article = HTMLDocument(url=url, section=self.section)
        article.prepare(self)
        article.props.date = date(*[int(n) for n in url.split("/")[5:8]])
        article.props.pagenr = self.pagenum
        article.props.headline = article.doc.cssselect("#article h1")[0].text_content()
        article.props.text = article.doc.cssselect("div.body")[0]
        dateline_pattern = re.compile("^([A-Z]+( [A-Z]+)?)$")
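        # matches all-caps datelines of one or two words, e.g. "AMSTERDAM" or "DEN HAAG"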
        b = article.props.text.cssselect("b")
        if b and dateline_pattern.search(b[0].text_content()):
            article.props.dateline = dateline_pattern.search(b[0].text_content()).group(1)

        if article.doc.cssselect("#article address"):
            article.props.author = article.doc.cssselect("#article address")[0].text_content().lstrip("dor").strip()

        yield article


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging

    amcatlogging.debug_module("amcat.scraping")
    cli.run_cli(SpitsKrantScraper)
Code Example #14
    def get_article(self, page):
        page.props.author = page.doc.cssselect(
            "#artikel-footer .author-date")[0].text.split("|")[0].strip()
        page.props.headline = page.doc.cssselect("div.acarhead h1")[0].text
        page.props.text = [
            page.doc.cssselect("div.artikel-intro")[0],
            page.doc.cssselect("div.artikel-main")[0]
        ]
        page.props.section = page.props.url.split("/")[4]
        return page

    def get_comments(self, page):
        for div in page.doc.cssselect("#comments div.comment"):
            comment = HTMLDocument(parent=page)
            comment.props.section = page.props.section
            comment.props.url = page.props.url
            comment.props.text = div.cssselect("p")[0]
            footer = div.cssselect("p.footer")[0].text_content().split(" | ")
            comment.props.author = footer[0].strip()
            comment.props.date = readDate(footer[1].strip())
            yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(PownewsScraper)
Code Example #15
File: noen.py  Project: ToonAlfrink/amcatscraping
                article = HTMLDocument(
                    url = urljoin(index_url, a.get('href')),
                    section = category,
                    headline = a.text_content().strip(),
                    )
                yield article
                    

    def _scrape_unit(self, article):
        article.prepare(self)
        article.props.date = readDate(article.doc.cssselect("#artikelbox div.dateandmore")[0].text_content())
        if article.props.date.date() != self.options['date']:
            print('Faulty date')
            return
        article.doc.cssselect("#story div")[-1].drop_tree()
        article.props.text = article.doc.cssselect("#story")
        firstline = article.props.text[0].text_content().strip().split("\n")[0]
        if len(firstline.split()) <= 8 and "Von " in firstline: #at most 8 words
            article.props.author = firstline.split("Von ")[1]
        kurztext = article.doc.cssselect("#kurztext")
        article.props.byline = kurztext and kurztext[0].text_content().strip() or None
        yield article

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(NoenScraper)


Code Example #16
File: deduplicate.py  Project: edisona/amcat
        Runs on all daily scraper articlesets
        """


if __name__ == '__main__':
    from sys import argv
    from getopt import getopt
    opts, args = getopt(argv[1:], "s")
    for opt, arg in opts:
        if opt == '-s':
            dedu = DeduplicateScript()
            dedu.run_scrapers()

    amcatlogging.info_module("amcat.scripts.maintenance.deduplicate")
    from amcat.scripts.tools import cli
    cli.run_cli(DeduplicateScript)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest


class TestDeduplicateScript(amcattest.PolicyTestCase):
    def test_deduplicate(self):
        """One article should be deleted from artset and added to project 2"""
        p = amcattest.create_test_project()
        art1 = amcattest.create_test_article(url='blaat1', project=p)
        art2 = amcattest.create_test_article(url='blaat2', project=p)
        art3 = amcattest.create_test_article(url='blaat1', project=p)
Code Example #17
File: apa_prof.py  Project: isususi/amcat
        metadata, text = parse_page(paragraphs)
        metadata["medium"] = Medium.get_or_create(metadata["medium"])

        return Article(text=text, **metadata)

    def split_file(self, file):
        original_rtf, fixed_rtf = file.bytes, fix_rtf(file.bytes)
        doc = parse_html(to_html(original_rtf, fixed_rtf))

        for i, page in enumerate(get_pages(doc)):
            yield doc, page


"""if __name__ == '__main__':
    original_rtf = open(sys.argv[1], 'rb').read()
    fixed_rtf = fix_rtf(original_rtf)
    html = to_html(original_rtf, fixed_rtf)
    #html = open("blaat.html").read()
    doc = parse_html(html)
    pages = list(get_pages(doc))

    for page in pages:
        metadata, text = parse_page((doc, page))
        print(text)
        print("-----")"""

if __name__ == '__main__':
    from amcat.scripts.tools.cli import run_cli

    run_cli(handle_output=False)
Code Example #18
class DerStandardScraper(HTTPScraper, DatedScraper):
    medium_name = "derstandard.at"
    index_url = "http://derstandard.at/Archiv/{self.options[date].year}/{self.options[date].month}/{self.options[date].day}"

    def _get_units(self):
        index_url = self.index_url.format(**locals())
        doc = self.getdoc(index_url)
        for li in doc.cssselect("#content ul.chronologie li"):
            article = HTMLDocument(
                date = readDate(li.cssselect("div.date")[0].text_content()),
                headline = li.cssselect("h3")[0].text_content().strip(),
                url = urljoin(index_url, li.cssselect("h3 a")[0].get('href'))
                )
            kicker = li.cssselect("div.text h6 a")
            article.props.kicker = kicker and kicker[0].text or None
            yield article
        
    def _scrape_unit(self, article):
        article.prepare(self)
        article.props.section = " > ".join([span.text_content() for span in article.doc.cssselect("#breadcrumb span.item")[1:]])
        article.props.text = article.doc.cssselect("#artikelBody div.copytext")[0]
        yield article

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(DerStandardScraper)
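
The index_url template above relies on str.format's attribute and item lookups ("{self.options[date].year}"); it works because _get_units calls format(**locals()) and self is among those locals. A minimal illustration with a hypothetical stand-in object:

from datetime import date

class Dummy:
    options = {"date": date(2013, 3, 24)}

template = ("http://derstandard.at/Archiv/{self.options[date].year}"
            "/{self.options[date].month}/{self.options[date].day}")
print(template.format(self=Dummy()))
# http://derstandard.at/Archiv/2013/3/24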


Code Example #19
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

try:
    from scrapers.newssites import nrc_weblogs
except ImportError:
    try:
        from scraping.newssites import nrc_weblogs
    except ImportError:
        from amcatscraping.newssites import nrc_weblogs
    

class ColumnNRCScraper(nrc_weblogs.WeblogNRCScraper):
    medium_name = "NRC website - blogs"
    t = "columns"

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(ColumnNRCScraper)
Code Example #20
File: vienna.py  Project: ToonAlfrink/amcatscraping
                break

    def _scrape_unit(self, article):
        article.prepare(self)
        breadcrumbs = article.doc.cssselect("div.BreadCrumbs span:not(.Separator)")[1:-1]
        article.props.section = " > ".join([span.text_content().strip() for span in breadcrumbs])
        if "Bezirk" in article.props.section:
            article.props.bezirk = article.props.section.split(".")[0].strip()
            article.props.section = "Bezirk"
        article.props.text = article.doc.cssselect("div.Article #article_lead")
        bodytext = article.doc.cssselect("#BodyText")[0]
        wrapper = bodytext.cssselect("div.SingleContentWrapper-450px")[0]
        for tag in wrapper.iter():
            #removing html comments and other clutter
            if callable(tag.tag):
                tag.drop_tree()
        lastdiv = wrapper.cssselect(".SingleContentWrapper-450px > div")
        if lastdiv and "zum thema" in lastdiv[0].text_content():
            lastdiv[0].drop_tree()

        article.props.text += wrapper
        yield article

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(ViennaScraper)


Code Example #21
File: google_at.py  Project: edisona/amcat.scraping
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

try:
    from scrapers.newssites.google import GoogleNewsScraper
except ImportError:
    try:
        from scraping.newssites.google import GoogleNewsScraper
    except ImportError:
        from amcatscraping.newssites.google import GoogleNewsScraper


class GoogleAustriaScraper(GoogleNewsScraper):
    url_gtld = "at"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(GoogleAustriaScraper)
Code Example #22
        for page in self.get_pages(topic.doc):
            if first:
                comments = page.cssselect("div.post")[1:]
                first = False
            else:
                comments = page.cssselect("div.post")
            for div in comments:
                comment = HTMLDocument()
                comment.parent = topic
                comment.props.author = div.cssselect("div.postholder_top a.username")[0]
                comment.props.date = readDate(div.cssselect("div.postholder_top span.post_time")[0].text_content())
                comment.props.text = div.cssselect("div.postholder_bot div.contents")[0]
                yield comment
                

    def get_pages(self,doc):
        yield doc
        for a in doc.cssselect("nav div.pagesholder a"):
            url = urljoin("http://forum.fok.nl",a.get('href'))
            yield self.getdoc(url)

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(FokForumScraper) 



Code Example #23
File: nujij.py  Project: edisona/amcat.scraping
                        except IndexError:
                            pass
                        else:
                            if comment.props.date.date() == self.options['date']:
                                yield comment
        else:
            for li in nxt.cssselect("ol.reacties li.hidenum"):
                comment = HTMLDocument(parent=page)
                if "<b>Reageer als eerste op dit bericht</b>" not in etree.tostring(li):
                    try:
                        comment.props.text = li.cssselect(
                            "div.reactie-body")[0]
                        comment.props.author = li.cssselect("strong")[0].text
                        comment.props.date = readDate(
                            li.cssselect("span.tijdsverschil")[0].get(
                                'publicationdate'))
                        if comment.props.date.date() == self.options['date']:
                            yield comment
                    except IndexError:
                        pass


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(NuJijScraper)
Code Example #24
File: add_articles_set.py  Project: pombredanne/amcat
class MoveArticlesForm(forms.Form):
    from_set = forms.ModelChoiceField(queryset = ArticleSet.objects.all())
    to_set = forms.ModelChoiceField(queryset = ArticleSet.objects.all())

class MoveArticles(Script):
    options_form = MoveArticlesForm

    def run(self, _input):
        fr = self.options['from_set']
        to = self.options['to_set']


        log.debug("getting articles...")
        articles = list(Article.objects.filter(
            articlesetarticle__articleset = fr.id))
        n = len(articles)

        log.debug("...done. {n} articles found".format(**locals()))


        log.debug("adding articles to new set...")
        to.add_articles(articles)
        to.save()
        
        log.info("moved {n} articles from {fr} to {to}".format(**locals()))


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(MoveArticles)
Code Example #25
File: metro.py  Project: ToonAlfrink/amcatscraping
                    yield urljoin(INDEX_URL, href)


    def _get_units(self):
        for url in self.get_categories():
            doc = self.getdoc(url)
            for item in doc.cssselect("item"):
                date = toolkit.readDate(item.cssselect("pubdate")[0].text)
                if date.date() != self.options['date']:
                    continue
                link = item.cssselect("link")[0]
                doc = HTMLDocument(
                    url=urljoin(INDEX_URL, html.tostring(link).lstrip("<link>")),
                    date = date,
                    headline = item.cssselect("title")[0].text
                    )
                yield doc

    def _scrape_unit(self, doc):
        doc.prepare(self)
        doc.props.text = doc.doc.cssselect("div.article-body")
        doc.props.html = html.tostring(doc.doc)
        yield doc

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping")
    cli.run_cli(MetroScraper)

Code Example #26
File: ad.py  Project: ToonAlfrink/amcatscraping
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

import pcm

class AlgemeenDagbladScraper(pcm.PCMScraper):
    medium_name = "Algemeen Dagblad"
    domain = "ad.nl"
    paper_id = 8001
    context_id = "AD"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(AlgemeenDagbladScraper)
Code Example #27
            return self.opener.getdoc(uri, encoding)

    def open(self, url,  encoding=None):
        if isinstance(url, (str, unicode)):
            if isinstance(url, unicode):
                url = url.encode('utf-8')
            log.info('Retrieving "{url}"'.format(**locals()))
            try:
                return self.opener.opener.open(url, encoding)
            except UnicodeEncodeError:
                uri = iri2uri(url)
                return self.opener.opener.open(uri, encoding)
        else:
            req = url
            log.info('Retrieving "{url}"'.format(url = req.get_full_url()))
            return self.opener.opener.open(req, encoding)
     


def _set_default(obj, attr, val):
    try:
        if getattr(obj, attr, None) is not None: return
    except ObjectDoesNotExist:
        pass # django throws DNE on x.y if y is not set and not nullable
    setattr(obj, attr, val)
   

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli(Scraper)
Code Example #28
File: ad.py  Project: edisona/amcat.scraping
    def get_comment_pages(self, page):
        if not page.doc.cssselect("#reaction"):
            return
        n_id, c_id = page.props.url.split("/")[4::4]  #5 and #9
        doc = self.getdoc(self.comment_url.format(page=0, cid=c_id, nid=n_id))
        try:
            total = int(doc.cssselect("div.pagenav")[0].text.split(" van ")[1])
        except IndexError:
            yield doc
            return
        except AttributeError:
            return
        for x in range(total - 1):
            for a in doc.cssselect("div.pagenav a"):
                if "volgende" in a.text:
                    onclick = a.get('onclick')
            start = onclick.find("getReactions(") + 13
            end = onclick.find(")", start)
            href = [
                arg.strip("\"';() ") for arg in onclick[start:end].split(",")
            ][0]
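            # "volgende" is the Dutch "next" link; the first getReactions(...) argument is taken as the URL of the next comment page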
            yield self.getdoc(urljoin(doc.base_url, href))


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(WebADScraper)
Code Example #29
        if text in MEDIUM_ALIASES:
            return Medium.get_or_create(MEDIUM_ALIASES[text])
        else:
            return Medium.get_or_create(text)

    def get_pagenum(self, text):
        p = re.compile("pagina ([0-9]+)([,\-][0-9]+)?([a-zA-Z0-9 ]+)?")
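        # e.g. "pagina 12-13 economie" -> groups ("12", "-13", " economie")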
        m = p.search(text.strip())
        pagenum, otherpage, section = m.groups()
        if section:
            section = section.strip()
        return int(pagenum), section

if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(BZK)
        
        
###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest

class TestBZK(amcattest.AmCATTestCase):
    def setUp(self):
        from django.core.files import File
        import os.path, json
        self.dir = os.path.join(os.path.dirname(__file__), 'test_files', 'bzk')
        self.bzk = BZK(project = amcattest.create_test_project().id,
                  file = File(open(os.path.join(self.dir, 'test.html'))),
Code Example #30

    def _scrape_unit(self, doc):
        art = HTMLDocument()
        try:
            datestring = doc.cssselect("div.dateplace-data")[0].text_content().split("\n")[2]
        except IndexError:  
            datestring = doc.cssselect("div.dateplace span")[0].text_content()

        art.props.date = readDate(datestring)
        art.props.headline = doc.cssselect("div.header h1")[0].text_content()
        if doc.cssselect("div.content center"):
            doc.cssselect("div.content center")[0].drop_tree()
        art.props.text = doc.cssselect("div.content")[0]
        try:
            art.props.author = doc.cssselect("span.smallprint")[0].text_content().strip()
        except IndexError as e:
            print(e)
        yield art
        print("\n")
        
if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TestScraper)


### works very well!
Code Example #31
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

try:
    from scrapers.newssites.google import GoogleNewsScraper
except ImportError:
    try:
        from scraping.newssites.google import GoogleNewsScraper
    except ImportError:
        from amcatscraping.newssites.google import GoogleNewsScraper

        
class GoogleAustriaScraper(GoogleNewsScraper):
    url_gtld = "at"

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(GoogleAustriaScraper)


Code Example #32
File: gmx.py  Project: ToonAlfrink/amcatscraping
from amcat.scraping.document import HTMLDocument
from amcat.scraping.scraper import HTTPScraper, DatedScraper
from amcat.tools.toolkit import readDate

class GMXScraper(HTTPScraper, DatedScraper):
    medium_name = "gmx.at"
    index_url = "http://www.gmx.at/themen/all/{d.year}/{d.month}/{d.day}/"

    def _get_units(self):
        d = self.options['date']
        index = self.getdoc(self.index_url.format(**locals()))
        for div in index.cssselect("#main div.unit"):
            yield HTMLDocument(url = div.cssselect("a")[0].get('href'))

    def _scrape_unit(self, article):
        article.prepare(self)
        article.props.date = readDate(article.doc.cssselect("#datetime")[0].text_content())
        article.props.section = " > ".join(article.props.url.split("/")[4:-1])
        article.props.headline = article.doc.cssselect("#headline")[0].text_content().strip()
        article.props.text = article.doc.cssselect("#teaser") + article.doc.cssselect("#main > p")
        yield article

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(GMXScraper)



Code Example #33
File: deduplicate_set.py  Project: amcat/amcat
        Hashes for two articles are equal if and only if, for each field that is not in ignore_fields, the
        values of those fields are equal in both articles.

        @param articleset       The articleset that is to be searched
        @param ignore_fields    A set of fields that should not be included in the calculated hashes

        @return                 An iterable of (<article_id>, <hash>) tuples.
        """
        all_fields = STATIC_FIELDS + list(articleset.get_used_properties())

        if not ignore_fields:
            fields = ["hash"]
        else:
            fields = sorted(f for f in all_fields if f not in ignore_fields)

        for x in amcates.ES().scan(query={"query": {"constant_score": {"filter": {"term": {"sets": articleset.id}}}}},
                                   _source=fields):
            if not ignore_fields:
                yield int(x['_id']), x['_source']['hash']
                continue
            art_tuple = tuple(str(x['_source'].get(k, [None])) for k in fields)
            hash = hash_class(repr(art_tuple).encode()).hexdigest()
            yield int(x['_id']), hash


if __name__ == '__main__':
    from amcat.scripts.tools.cli import run_cli
    run_cli()
Code Example #34
File: fok.py  Project: edisona/amcat.scraping
        byline = page.doc.cssselect("span.postedbyline")[0].text_content()
        page.props.author = byline[byline.find("Geschreven door") +
                                   16:byline.find(" op ")]
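        # 16 == len("Geschreven door "); the author name sits between that prefix and " op "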
        page.props.date = readDate(
            page.doc.cssselect("span.postedbyline")[0].text_content().split(
                " op ")[1])
        for comment in self.get_comments(page):
            comment.is_comment = True
            yield comment
        yield page

    def get_comments(self, page):

        for div in page.doc.cssselect("div.reactieHolder"):
            comment = HTMLDocument()
            comment.props.author = div.cssselect("span.left a")[0].text
            comment.props.date = readDate(div.cssselect("a.timelink")[0].text)
            comment.props.text = div.cssselect("div.reactieBody")[0]
            comment.props.parent = page
            comment.props.section = page.props.section
            comment.props.url = page.props.url
            yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(FokScraper)
Code Example #35
File: spits.py  Project: edisona/amcat.scraping
            pagenum = int(a.get('href').split("/")[1])
            sections[pagenum] = a.text
        return sections

    def _scrape_unit(self, url):
        article = HTMLDocument(url=url, section=self.section)
        article.prepare(self)
        article.props.date = date(*[int(n) for n in url.split("/")[5:8]])
        article.props.pagenr = self.pagenum
        article.props.headline = article.doc.cssselect(
            "#article h1")[0].text_content()
        article.props.text = article.doc.cssselect("div.body")[0]
        dateline_pattern = re.compile("^([A-Z]+( [A-Z]+)?)$")
        b = article.props.text.cssselect("b")
        if b and dateline_pattern.search(b[0].text_content()):
            article.props.dateline = dateline_pattern.search(
                b[0].text_content()).group(1)

        if article.doc.cssselect("#article address"):
            article.props.author = article.doc.cssselect(
                "#article address")[0].text_content().lstrip("dor").strip()

        yield article


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping")
    cli.run_cli(SpitsKrantScraper)
Code Example #36
        info = page.doc.cssselect("div.nieuws_box p")
        for p in info:
            if "Plaatsingsdatum" in p.cssselect("b")[0].text:
                page.props.date = readDate(p.text_content().split(":")[1])
                break

            
        for comment in self.scrape_comments(page):
            comment.is_comment = True
            yield comment

        yield page

    def scrape_comments(self,page):
        for li in page.doc.cssselect("ul.uiList li.fbFeedbackPost"):
            comment = HTMLDocument(parent=page,url=page.url)
            comment.props.text = li.cssselect("div.postText")[0].text
            comment.props.author = li.cssselect("a.profileName")[0].text
            comment.props.date = readDate(li.cssselect("abbr.timestamp")[0].get('title'))
            yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(Zorgportaal_nlNieuwsScraper)


Code Example #37
        text = text.replace("-\n", "")
        text = text.replace("  ", " ")
        text = text.replace("\n", " ")

        article.text = text
        date_pattern = re.compile("([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})")
        result = date_pattern.search(lines[1])
        article.date = date(int(result.group(3)), int(result.group(2)),
                            int(result.group(1)))
        pagenum_pattern = re.compile("\(p.([0-9]+)([0-9\-]+)?\)")
        result = pagenum_pattern.search(lines[1])
        if result:

            article.pagenr = int(result.group(1))

        for h, medium in self.index:
            if article.headline.lower().strip() in h.lower().strip():
                article.set_property("medium", self.get_medium(medium))

        return article

    def get_medium(self, medium):
        if not medium or len(medium) < 1:
            medium = "unknown"
        return MEDIUM_ALIASES.get(medium, medium)


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(BZKPDFScraper)
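
A quick check of the two patterns used above on a hypothetical index line (the real layout comes from the source PDF and is not shown in this excerpt):

import re
from datetime import date

line = "De Krant, 24-03-2013 (p.3)"  # hypothetical
date_pattern = re.compile("([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})")
pagenum_pattern = re.compile("\(p.([0-9]+)([0-9\-]+)?\)")

m = date_pattern.search(line)
print(date(int(m.group(3)), int(m.group(2)), int(m.group(1))))  # 2013-03-24
print(int(pagenum_pattern.search(line).group(1)))                # 3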
Code Example #38
File: salzburg.py  Project: edisona/amcat.scraping
        return offset

    def getresponse(self, offset):
        _json = self.open(self.solr_url.format(**locals())).read()
        return json.loads(_json)["response"]

    article_url = "http://www.salzburg.com/nachrichten/id=112&tx_ttnews%5Btt_news%5D={urlid}&cHash=abc"

    def _scrape_unit(self, data):
        urlid = data['uri'].split('-')[-1]
        article_url = self.article_url.format(**locals())
        yield HTMLDocument(
            date=readDate(data['date']),
            section=", ".join('ressort' in data.keys() and data['ressort']
                              or []),
            headline=data['title'],
            url=article_url,
            externalid=data['id'],
            text=data['text'],
            author=data['author'],
            tags='tag' in data.keys() and data['tag'],
            teaser='teaser' in data.keys() and data['teaser'],
            all_data=data)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(SalzburgScraper)
Code Example #39
File: tubantia.py  Project: ToonAlfrink/amcatscraping
                if stop:
                    break
                else:
                    
                    p = re.compile("[\\\]udc[\w\w]")
                    artpage.props.text = literal_eval(p.sub("",repr(body)))
                    artpage.props.edition = page['edition']
                    artpage.props.byline = byline
                    artpage.props.section = page['section']
                    if re.match("[A-Z][0-9]+", page['page_str']):
                        artpage.props.section += " - section " + page['page_str'][0]
                        artpage.props.pagenr = int(page['page_str'][1:])
                    else:
                        artpage.props.pagenr = int(page['page_str'])

                    dateline_pattern = re.compile("(^[^\n]+\n\n([A-Z]+( [A-Z]+)?) -\n)|(([A-Z]+( [A-Z]+)?)\n\n)")
                    match = dateline_pattern.search(artpage.props.text)
                    if match:
                        # dateline and theme have the same syntax and are therefore indistinguishable
                        artpage.props.dateline_or_theme = match.group(2) or match.group(5)
                    artpage.props.url = page['url']
                    yield artpage

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TubantiaScraper)

Code Example #40
File: geenstijl.py  Project: edisona/amcat.scraping
                    comment.parent = page
                    yield comment
                yield page
                

    def get_article(self, page):
        page.props.author = page.doc.cssselect("article footer")[0].text_content().split("|")[0].strip()
        page.props.headline = page.doc.cssselect("article h1")[0].text.strip()
        if page.props.headline[0] == '#': page.props.headline = page.props.headline[1:].strip()
        datestring = page.doc.cssselect("footer time")[0].text_content()
        page.props.date = datetime.datetime.strptime(datestring, '%d-%m-%y | %H:%M')
        page.doc.cssselect("footer")[0].drop_tree()
        page.props.text = page.doc.cssselect("article")[0]
        page.coords = ""
        return page

    def get_comments(self,page):
        for article in page.doc.cssselect("#comments article"):
            comment = HTMLDocument(parent = page)
            footer = article.cssselect("footer")[0].text_content().split(" | ")
            comment.props.date = readDate(footer[1])
            comment.props.author = footer[0]
            comment.props.text = article.cssselect("p")
            yield comment

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(GeenstijlScraper)
Code Example #41
        paragraphs = [p for p in paragraphs if p]

        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(p.split(",")) > 1:  # last line of the normal content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            char = r"\x{}".format(code).decode('string-escape').decode(
                'latin-1')
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            return char

        article.text = re.sub("=[A-Z0-9]{2}", character, article.text)

        yield article


if __name__ == "__main__":
    from amcat.scripts.tools import cli

    cli.run_cli(BZKEML)
Code Example #42
                logging.info("{n} duplicate articles found, run without dry_run to remove".format(**locals()))
            else:
                logging.info("Removing {n} articles from set".format(**locals()))
                articleset.remove_articles(to_remove)
            if save_duplicates_to:
                dupes_article_set = ArticleSet.create_set(articleset.project, save_duplicates_to, to_remove)
        return n, dry_run

    def get_hashes(self):
        fields =  [f for f in FIELDS if not self.options.get("skip_{}".format(f))]
        if fields == FIELDS:
            fields = ["hash"]
        setid = self.options['articleset'].id
        for x in amcates.ES().scan(query={"query" : {"constant_score" : {"filter": {"term": {"sets": setid}}}}},
                                   fields=fields):
            if fields == ["hash"]:
                hash = x['fields']['hash'][0]
            else:
                def get(flds, f):
                    val = flds.get(f)
                    return val[0] if val is not None else val
                    
                d = {f: get(x['fields'], f) for f in fields}
                hash = hash_class(json.dumps(d)).hexdigest()
            yield int(x['_id']), hash


if __name__ == '__main__':
    from amcat.scripts.tools.cli import run_cli
    run_cli()
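
get_hashes above fingerprints each article by hashing a JSON dump of the selected field values; articles with equal fingerprints count as duplicates. A minimal sketch of the same idea on plain dicts, with hashlib.sha1 standing in for hash_class (which is defined elsewhere):

import hashlib
import json

def article_hash(article, fields):
    d = {f: article.get(f) for f in fields}
    return hashlib.sha1(json.dumps(d, sort_keys=True).encode()).hexdigest()

a = {"headline": "Example", "date": "2013-03-24", "medium": "demo"}
b = {"headline": "Example", "date": "2013-03-24", "medium": "other"}
assert article_hash(a, ["headline", "date"]) == article_hash(b, ["headline", "date"])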
Code Example #43
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

import nrc

class NRCHandelsbladScraper(nrc.NRCScraper):
    medium_name = "NRC Handelsblad"
    nrc_version = "NH"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(NRCHandelsbladScraper)
Code Example #44
    def parent_table(codebook):
        result = table3.ObjectTable(rows=codebook.codebookcodes)
        result.addColumn(lambda row : row.code.uuid, label="uuid")
        result.addColumn(lambda row : row.code.id, label="code_id")
        result.addColumn(lambda row : row.code, label="code")
        result.addColumn(lambda row : row.parent, label="parent")
        self.add_label_columns(result)
        return result

        
def _get_tree(codebook):
    parents = {cc.code : cc.parent for cc in codebook.codebookcodes}
    for root in (code for (code, parent) in parents.iteritems() if parent is None):
        for row in _get_tree_rows(parents, 0, root):
            yield row

def _get_tree_rows(parents, indent, parent):
    yield TreeRow(indent, parent)
    for child in (c for (c, p) in parents.iteritems() if p == parent):
        for row in _get_tree_rows(parents, indent+1, child):
            yield row

    
if __name__ == '__main__':
    from amcat.scripts.tools import cli
    import sys
    #cli.run_cli().to_csv(stream=sys.stdout)
    print(cli.run_cli().to_csv())
    #print result.output()

Code Example #45
        export_sql = ("SELECT {self.dest_project.id} AS projectid, {fields} FROM articles a"
                      " WHERE article_id IN ({article_ids})").format(**locals())
        export_sql = "COPY ({export_sql}) TO STDOUT WITH BINARY".format(**locals())

        import_sql = "COPY articles (project_id, {fields}) FROM STDIN WITH BINARY".format(**locals())

        dest_host = "-h {self.dest_host}".format(**locals()) if self.dest_host else ""
        source_host = "-h {self.source_host}".format(**locals()) if self.source_host else ""
        
        cmd = ('psql {source_host} {self.source_db} -c "{export_sql}" '
               '| psql {dest_host} {self.dest_db} -c "{import_sql}"').format(**locals())

        log.debug("Copying {n} articles...".format(n=len(aids)))
        #log.debug(cmd)
        subprocess.check_output(cmd, shell=True)
        log.debug("... Done!")

    def _add_to_set(self, uuids):
        log.debug("Adding {n} articles to set using uuids...".format(n=len(uuids)))
        aids = [aid for (aid,) in Article.objects.filter(uuid__in=uuids).values_list("id")]
        if len(aids) != len(uuids):
            raise Exception("|aids| != |uuids|, something went wrong importing...")
        self.dest_set.add_articles(aids)
        log.debug("... Done!")
        
if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module()
    cli.run_cli()
Code Example #46
File: raw_pdf.py  Project: PaulHuygen/amcat
            page_txt = ""
            for line in parser.get_textlines(page):
                page_txt += line.get_text() + "\n"
            res += page_txt + "\n\n"
        article = Article(text=res)
        article.headline = self.getheadline(_file)
        article.medium = self.options['medium']
        article.section = self.options['section']
        if self.options['date']:
            article.date = self.options['date']
        else:
            article.date = date.today()
        yield article

    def getheadline(self, _file):
        hl = _file.name
        if hl.endswith(".pdf"): hl = hl[:-len(".pdf")]
        windows = hl.split("\\")
        other = hl.split("/")
        if len(windows) > len(other):
            #probably a windows path
            hl = windows[-1]
        else:
            hl = other[-1]
        return hl


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(RawPDFScraper)
Code Example #47
File: depers.py  Project: ToonAlfrink/amcatscraping
            ))

            if not day_url.startswith(INDEX_URL): continue

            doc = self.getdoc(day_url)
            for article in doc.cssselect("div.lbox500 h2 a"):
                url = urljoin(day_url, article.get("href"))

                if '/video/' in url: continue

                yield HTMLDocument(
                    url = url,
                    headline = article.text,
                    date = self.options['date']
                )

    def _scrape_unit(self, doc):
        doc.prepare(self)
        if doc.doc.cssselect("div.lbox440"):
            doc.props.text = doc.doc.cssselect("div.lbox440")[0].cssselect('p')
        else:
            doc.props.text = ""
        yield doc

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(DePersScraper)
Code example #48
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

import pcm


class TrouwScraper(pcm.PCMScraper):
    medium_name = "Trouw"
    domain = "trouw.nl"
    paper_id = 8004
    context_id = "NL"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TrouwScraper)
Code example #49
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

try:
    from scrapers.newspapers import tubantia
except ImportError:
    try:
        from scraping.newspapers import tubantia
    except ImportError:
        from amcatscraping.newspapers import tubantia


class GelderlanderScraper(tubantia.TubantiaScraper):
    medium_name = "De Gelderlander"
    paper = "dg"


if __name__ == '__main__':

    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(GelderlanderScraper)
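
The nested try/except above is an import fallback, presumably because the tubantia module has lived under different package names. The same idiom can be written as a loop; a generic sketch using the same candidate locations:

# try each historical package location and keep the first that imports
for modname in ("scrapers.newspapers.tubantia",
                "scraping.newspapers.tubantia",
                "amcatscraping.newspapers.tubantia"):
    try:
        tubantia = __import__(modname, fromlist=["tubantia"])
        break
    except ImportError:
        continue
else:
    raise ImportError("tubantia not found in any known location")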
Code example #50
File: text.py  Project: kasperwelbers/amcat
            text = _convert_multiple(file, convertors)
        else:
            text = file.text
        return Article(text=text, **metadata)

    def explain_error(self, error):
        """Explain the error in the context of unit for the end user"""
        name = getattr(error.unit, "name", error.unit)
        return "Error in file {name} : {error.error!r}".format(**locals())
    
if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scripts.article_upload.upload")
    #amcatlogging.debug_module("amcat.scraping.scraper")
    from amcat.scripts.tools.cli import run_cli
    run_cli(handle_output=False)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
from amcat.tools import amcatlogging
amcatlogging.debug_module("amcat.scripts.article_upload.upload")

class TestUploadText(amcattest.AmCATTestCase):
    def test_article(self):
        from django.core.files import File
        base = dict(project=amcattest.create_test_project().id,
                    articleset=amcattest.create_test_set().id,
                    medium=amcattest.create_test_medium().id)
Code example #51
        structure = forms.ChoiceField(choices=[(s, s.title())
                                               for s in STRUCTURE])

    def _run(self, codebook, structure, **kargs):
        codebook.cache_labels()
        result = STRUCTURE[structure](codebook)
        return result


def _get_tree(codebook):
    parents = {cc.code: cc.parent for cc in codebook.codebookcodes}
    for root in (code for (code, parent) in parents.iteritems()
                 if parent is None):
        for row in _get_tree_rows(parents, 0, root):
            yield row


def _get_tree_rows(parents, indent, parent):
    yield TreeRow(indent, parent)
    for child in (c for (c, p) in parents.iteritems() if p == parent):
        for row in _get_tree_rows(parents, indent + 1, child):
            yield row


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    import sys
    #cli.run_cli().to_csv(stream=sys.stdout)
    print cli.run_cli().to_csv()
    #print result.output()
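
_get_tree builds a code-to-parent mapping and _get_tree_rows then emits one TreeRow per code, depth-first, with indent recording the nesting level. A self-contained sketch of the same traversal over a hypothetical parent map (TreeRow is redefined here as a plain namedtuple purely for illustration):

from collections import namedtuple

TreeRow = namedtuple("TreeRow", ["indent", "code"])

def tree_rows(parents, indent=0, parent=None):
    """Yield TreeRow(indent, code) depth-first for every code below `parent`."""
    for code, p in sorted(parents.items()):
        if p == parent:
            yield TreeRow(indent, code)
            for row in tree_rows(parents, indent + 1, code):
                yield row

# top-level codes have parent None
parents = {"animals": None, "cats": "animals", "dogs": "animals", "plants": None}
for row in tree_rows(parents):
    print("  " * row.indent + row.code)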
Code example #52
File: refresh_index.py  Project: amcat/amcat
###########################################################################

"""
Script to get queries for a codebook
"""

import logging; log = logging.getLogger(__name__)

from django import forms

from amcat.scripts.script import Script
from amcat.models import ArticleSet

PLUGINTYPE_PARSER = 1

class RefreshIndex(Script):
    class options_form(forms.Form):
        articleset = forms.ModelChoiceField(queryset=ArticleSet.objects.all())
        full_refresh = forms.BooleanField(initial=False, required=False)
        
    def _run(self, articleset, full_refresh):
        log.info("Refreshing {articleset}, full_refresh={full_refresh}".format(**locals()))
        articleset.refresh_index(full_refresh=full_refresh)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    result = cli.run_cli()
    #print(result.output())
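
Like the other Script subclasses in this listing, RefreshIndex can also be driven from Python by passing the form fields as keyword arguments; a hedged sketch (the article set id is made up):

# direct invocation instead of the CLI; 42 is a placeholder article set id
RefreshIndex(articleset=42, full_refresh=True).run()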

Code example #53
File: add_project.py  Project: edisona/amcat
    options_form = AddProjectForm
    output_type = Project

    def run(self, _input=None):
        p = Project.objects.create(**self.options)
        # Add user to project (as admin)
        pr = ProjectRole(project=p, user=self.options['owner'])
        pr.role = Role.objects.get(projectlevel=True, label='admin')
        pr.save()

        return p

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli()


###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest

class TestAddProject(amcattest.PolicyTestCase):
    def test_add(self):
        u = amcattest.create_test_user()
        p = AddProject(owner=u.id, name='test', description='test', insert_user=u.id).run()
        #self.assertEqual(p.insert_user, current_user()) # current_user() doesn't exist anymore
        self.assertEqual(p.owner, u)
Code example #54
File: lexisnexis.py  Project: pombredanne/amcat
            a = body_to_article(*fields)
            a.project = self.options['project']
            yield a
        except:
            log.error(
                "Error on processing fields: {fields}".format(**locals()))
            raise


from amcat.tools import amcatlogging

amcatlogging.debug_module()

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli(handle_output=False)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
import datetime


class TestLexisNexis(amcattest.AmCATTestCase):
    def setUp(self):
        import os.path, json
        self.dir = os.path.join(os.path.dirname(__file__), 'test_files',
                                'lexisnexis')
Code example #55
            n_props = 0
            for prop in self.article_properties:
                if hasattr(article, prop):
                    n_props += 1
            if article.metastring:
                n_props += len(eval(article.metastring))
            articles_nprops[article] = n_props

        sortedlist = sorted(articles_nprops, key=articles_nprops.get)
        to_print = set(sortedlist[:3] + sortedlist[-3:])
        log.info("Sample articles:")
        for article in to_print:
            for prop in self.article_properties:
                value = hasattr(article, prop) and getattr(article,
                                                           prop) or None
                value = self.truncate(value)
                log.info("{prop} : {value}".format(**locals()))
            print("\n")

    def truncate(self, value):
        value = unicode(value)
        value = " ".join(value.split("\n"))
        if len(value) > 80:
            value = value[0:79] + "..."
        return value.encode('utf-8')


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(ValueArticleScript)
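
truncate collapses the value to a single line of at most 80 characters before logging. A rough standalone equivalent (using str rather than the Python 2 unicode/encode handling above):

def truncate(value, width=80):
    """Collapse newlines and cut the value to at most `width` characters."""
    text = " ".join(str(value).split("\n"))
    if len(text) > width:
        text = text[:width - 1] + "..."
    return text

print(truncate("a\nvalue\nwith\nnewlines"))   # -> a value with newlines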
Code example #56
        #    comment.is_comment = True
        #    yield comment

    def scrape_comments(self, article):
        for item in article.doc.cssselect("#commentsTab li.mainComment"):
            mcomment = self.scrape_comment(item, article)        
            yield mcomment
            for li in item.cssselect("ul.answers li.commentItem"):
                yield self.scrape_comment(li, mcomment)                    

    def scrape_comment(self, html, parent):
        c = HTMLDocument(
            text = html.cssselect("div.text-holder"),
            headline = html.cssselect("a.commentTitle")[0].text_content().strip(),
            section = parent.props.section,
            date = readDate(" ".join([t.text for t in html.cssselect("ul.meta li.createdate, li.createtime")])),
            author = html.cssselect("ul.meta li.by")[0].text.strip().lstrip("By").strip(),
            url = parent.props.url + "#{}".format(html.cssselect("a.commentTitle")[0].get('id')))
        c.props._parent = "{p.props.headline}, {p.props.date}".format(p = parent)
        return c


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(HaaretzScraper)
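
scrape_comments flattens a two-level comment tree: each main comment is yielded first, then its answers with that main comment as their parent. A generic sketch of the same flattening, independent of the Haaretz markup and using made-up data:

def flatten_comments(threads):
    """Yield (comment, parent) pairs: each top-level comment, then its replies."""
    for thread in threads:
        top = thread["comment"]
        yield top, None
        for reply in thread.get("replies", []):
            yield reply, top

threads = [{"comment": "first!", "replies": ["agreed", "nope"]},
           {"comment": "second", "replies": []}]
for comment, parent in flatten_comments(threads):
    print("%r (reply to %r)" % (comment, parent))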