Example #1
def get_texts_extra(project, set, user, pw, result_dict):
    from amcatclient import AmcatAPI
    from elasticsearch import Elasticsearch
    import re
    import string
    texts = []
    conn = AmcatAPI('http://jamcat.mscc.huji.ac.il', user, pw)
    #es = Elasticsearch(['http://jamcat.mscc.huji.ac.il:9200'])
    es = Elasticsearch()
    for id, b in result_dict.items():
        result = es.search(index="amcat",
                           doc_type="article",
                           body={"query": {
                               "match": {
                                   "id": id
                               }
                           }})
        try:
            r = result['hits']['hits'][0]['_source']
            medium = r['medium']
            date = r['date']
            title = ''
            subtitle = ''
            letters = string.ascii_lowercase[0:len(b) + 1]
            for i in range(len(b)):
                text = re.sub(r'\(.*?\)', '', b[i])
                id_ = str(id) + '_' + str(letters[i])
                article = [id_, medium, date, title, subtitle, text]
                texts.append(article)
        except (IndexError, KeyError):
            print(str(id) + ' is not in the database anymore')
    print(len(texts), 'texts retrieved...')
    return texts
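
A minimal usage sketch (the ids, credentials and text fragments below are hypothetical; result_dict maps AmCAT article ids to lists of text fragments, as the loop above assumes):

texts = get_texts_extra(project=1, set=42, user="me", pw="secret",
                        result_dict={12345: ["first fragment", "second fragment"]})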
Example #2
File: nlpamcat.py Project: mcomsa/nlpipe
def get_ids(amcat_server: AmcatAPI, project: int,
            articleset: int) -> Iterable[int]:
    """
    Get the article ids for this articleset

    :param amcat_server: Amcat server
    :param project: AmCAT project ID
    :param articleset: AmCAT Articleset ID
    :return: sequence of AmCAT article IDs
    """
    return (x['id'] for x in amcat_server.get_articles(
        project, articleset, columns=['id']))
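
A short usage sketch (the server URL and ids are hypothetical):

from amcatclient import AmcatAPI
server = AmcatAPI("https://amcat.example.org")          # hypothetical host
ids = list(get_ids(server, project=1, articleset=42))   # materialize the generator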
Example #3
File: nlpamcat.py Project: mcomsa/nlpipe
def process(amcat_server: AmcatAPI,
            project: int,
            articleset: int,
            nlpipe_server: Client,
            module: str,
            reset_error: bool = False,
            reset_started: bool = False,
            to_naf: bool = False) -> None:
    """
    Process the given documents

    :param amcat_server: Amcat server (url str or AmCATAPI object)
    :param project: AmCAT project ID (int)
    :param articleset: AmCAT Articleset ID (int)
    :param nlpipe_server: NLPipe server (url/dirname str or nlpipe.Client object)
    :param module: NLPipe module name (str)
    :param reset_started: Re-set started documents to pending
    :param reset_error: Re-assign documents with errors
    :param to_naf: Assign as NAF documents with metadata (otherwise, assign as plain text)
    """
    status = get_status(amcat_server, project, articleset, nlpipe_server,
                        module)
    accept_status = {"UNKNOWN"}
    if reset_error:
        accept_status |= {"ERROR"}
    if reset_started:
        accept_status |= {"STARTED"}

    todo = [id for (id, status) in status.items() if status in accept_status]
    if todo:
        logging.info(
            "Assigning {} articles from {amcat_server} set {project}:{articleset}"
            .format(len(todo), **locals()))
        columns = ('headline,text,creator,date,url,uuid,medium,section,page'
                   if to_naf else 'headline,text')
        for page in amcat_server.get_articles_by_id(articles=todo,
                                                    columns=columns,
                                                    page_size=100,
                                                    yield_pages=True):
            arts = page['results']
            ids = [a['id'] for a in arts]
            texts = [_get_text(a, to_naf=to_naf) for a in arts]
            logging.debug("Assigning {} articles".format(len(ids)))
            nlpipe_server.bulk_process(module,
                                       texts,
                                       ids=ids,
                                       reset_error=reset_error,
                                       reset_pending=reset_started)
    logging.info("Done! Assigned {} articles".format(len(todo)))
Example #4
def check_credentials(self, drop_token=True):
    if drop_token:
        token = None
    else:
        token = self.token_input or None
    if token or self.passwd_input:
        try:
            api = AmcatAPI(self.host_input, self.user_input,
                           self.passwd_input, token)
            if api.token is None:
                api = None
        except (APIError, HTTPError) as e:
            logging.exception("Error on getting credentials")
            api = None
    else:
        api = None
    self.passwd_input = ''
    self.token_input = api and api.token
    self.save_credentials()
    self.api = api
Example #5
    parser.add_argument('--items_per_page', type=int, default=100)
    parser.add_argument('--start', type=int, default=0, help="Record number to start from (default=0)")
    parser.add_argument('project', type=int)
    parser.add_argument('articleset', help="Either the number of an existing set, or the name of a set to create")
    parser.add_argument('ppn', help="The ppn of the newspaper to scrape (see delpher.nl)")
    parser.add_argument('from_date', help="Starting date to scrape, format 31-12-1999")
    parser.add_argument('to_date', help="End date to scrape, format 31-12-1999")
    args = parser.parse_args()
    
    logging.basicConfig(format='[%(asctime)s %(levelname)s %(name)s:%(lineno)s %(threadName)s] %(message)s',
                        level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    logging.info("Connecting to AmCAT at {host}".format(**args.__dict__))
    from amcatclient import AmcatAPI
    api = AmcatAPI(args.host)

    try:
        setid = int(args.articleset)
    except ValueError:
        logging.info("Creating new articleset {articleset} in project {project}".format(**args.__dict__))
        prov = ("[{}] Scraped from delpher using delpher.py, ppn={ppn}, from_date={from_date}, to_date={to_date}"
                .format(datetime.now().isoformat(), **args.__dict__))
        setid = api.create_set(project=args.project, name=args.articleset, provenance=prov)["id"]

    logging.info("Scraping ppn:{ppn} from {from_date}-{to_date} into set {project}:{set}"
                 .format(set=setid, **args.__dict__))
    scrape_delpher(api, args.project, setid, start=args.start,
                   from_date=args.from_date, to_date=args.to_date, ppn=args.ppn)
    
    
Example #6
        link = a.get("href")
        links.append(link)
    return links



if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='[%(asctime)s %(name)-12s %(levelname)-5s] %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument("server", help="AmCAT host name", )
    parser.add_argument("project", help="AmCAT project", )
    parser.add_argument("articleset", help="AmCAT Articleset ID", type=int)
    parser.add_argument("--batchsize", help="Batch size for uploading to AmCAT", type=int, default=10)
    args = parser.parse_args()

    logging.info(f"Scraping into AmCAT {args.articleset}")
    conn = AmcatAPI(args.server)
    pages = range(0, 67)
    for p in pages:
        articles = []
        page = get_page(p)
        links = get_links(page)
        for link in links:
            article = get_article(link)
            articles.append(article)
        conn.create_articles(args.project, args.articleset, articles)


Example #7
        arts = list(get_articles(urls))
        logging.info("Adding {} articles to set {}:{}".format(
            len(arts), project, articleset))
        conn.create_articles(project=project,
                             articleset=articleset,
                             json_data=arts)


if __name__ == '__main__':
    from amcatclient import AmcatAPI
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('host',
                        help='The AmCAT host to connect to, '
                        'e.g. http://amcat.vu.nl')
    parser.add_argument('project', help='Project ID to add the articles to')
    parser.add_argument('query', help='Wikinews query for scraping')
    parser.add_argument('--username', help='Username for AmCAT login')
    parser.add_argument('--password', help='Password for AmCAT login')
    args = parser.parse_args()

    conn = AmcatAPI(args.host, args.username, args.password)
    category = "Iraq"
    articleset = conn.create_set(
        project=args.project,
        name="Wikinews articles for {}".format(args.query),
        provenance="Scraped from wikinews on {}".format(
            datetime.datetime.now().isoformat()))
    scrape_wikinews(conn, args.project, articleset['id'], args.query)
Example #8
        "url": url
    }


def get_links():
    for page in range(435, 734):
        url = URL_TEMPLATE.format(**locals())
        print(url)
        page = requests.get(url)
        open("/tmp/test.html", "w").write(page.text)
        tree = html.fromstring(page.text)
        for article in tree.cssselect("#results .grid-element"):
            a, = article.cssselect('a.siteLink')
            link = a.get("href")
            if not link.startswith(
                    "https://www.om.nl/actueel/nieuwsberichten"):
                continue
            locale.setlocale(locale.LC_ALL, 'nl_NL.UTF-8')
            date, = article.cssselect("div.iprox-content.iprox-date.date")
            date = date.text_content().split("-")[0].strip()
            #date = date.text_content().strip()
            date2 = datetime.datetime.strptime(date, "%d %B %Y")
            yield link, date2


from amcatclient import AmcatAPI
conn = AmcatAPI("https://amcat.nl")
for link, date in get_links():
    a = scrape_pb(link, date)
    conn.create_articles(2088, 80277, [a])
Example #9
def _amcat(amcat_server: Union[str, AmcatAPI]) -> AmcatAPI:
    if isinstance(amcat_server, str):
        return AmcatAPI(amcat_server)
    return amcat_server
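
A quick illustration (the URL is hypothetical) of why the helper accepts both forms:

api = _amcat("https://amcat.example.org")   # URL string -> new AmcatAPI connection
api = _amcat(api)                           # existing AmcatAPI -> returned unchanged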
Example #10
    parser.add_argument("target_url", help='URL of the target '
                        '(e.g. "http://amcat.vu.nl")')
    parser.add_argument("source_project", help='Article set ID in the source',
                        type=int)
    parser.add_argument("source_set", help='Article set ID in the source',
                        type=int)
    parser.add_argument("target_project", help='Project ID in the target',
                        type=int)
    parser.add_argument("--target-set", "-s", help='Article set ID in the '
                        'target (if omitted, a new set will be created)',
                        type=int)
    parser.add_argument("--batch-size", "-b", help='Batch size for copying',
                        type=int, default=100)
    parser.add_argument("--from-page", "-p", help='Start from page (batch)',
                        type=int, default=1)


    args = parser.parse_args()

    fmt = '[%(asctime)s %(levelname)s %(name)s] %(message)s'
    logging.basicConfig(format=fmt, level=logging.INFO)
    logging.getLogger("requests").setLevel(logging.WARNING)

    src = AmcatAPI(args.source_url)
    trg = AmcatAPI(args.target_url)


    copy_articles(src, args.source_project, args.source_set,
                  trg, args.target_project, args.target_set,
                  args.batch_size, args.from_page)
Example #11
import requests
from lxml import html

# Since we need to parse an English-language date (December 3, 2002),
# set the locale to English. You can skip this step on an English-language OS.
import datetime
import locale
locale.setlocale(locale.LC_ALL, "en_US.utf8")


# Import amcatclient
from amcatclient import AmcatAPI

# Connect to AmCAT.
# Note: if you create a .amcatauth file in your home dir, there is no
#       need to specify username and password.
conn = AmcatAPI("http://amcat.vu.nl", "<username>","<password>")

# Create a new articleset to add the articles to.
# You can also just set 'setid' to add to an existing set
PROJECT_ID = 1
aset = conn.create_set(project=PROJECT_ID, name="State of the Union",
                       provenance="Scraped from http://www.presidency.ucsb.edu/sou.php")
setid = aset["id"]


# Get the main page and iterate over all links in a 'doclist'
page = requests.get('http://www.presidency.ucsb.edu/sou.php')
tree = html.fromstring(page.text)
for a in tree.cssselect("td.doclist a"):

    # Skip empty links and the 'jump to menu' link
Example #12
import argparse
import collections
import re

from amcatclient import AmcatAPI

# Connect to AmCAT
parser = argparse.ArgumentParser()
parser.add_argument('host',
                    help='The AmCAT host to connect to, '
                    'e.g. http://amcat.vu.nl')
parser.add_argument('project', help='The project to count words in')
parser.add_argument('articleset', help='The article set to count words in')
parser.add_argument('--username', help='Username for AmCAT login')
parser.add_argument('--password', help='Password for AmCAT login')
args = parser.parse_args()

conn = AmcatAPI(args.host, args.username, args.password)

# Iterate over the articles, count all words
counts = collections.Counter()
for a in conn.list_articles(args.project, args.articleset):
    # get words by splitting lowercased text on non-word characters
    text = a['text'].lower()
    words = re.split(r"\W+", text)
    counts.update(words)

# delete all words with <= 3 characters
for word in list(counts.keys()):
    if len(word) <= 3:
        del counts[word]

# print most common words
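
The excerpt stops at the comment above; a plausible completion (not part of the original) would be:

for word, n in counts.most_common(25):
    print(n, word)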
Example #13
File: dwdd_video.py Project: nruigrok/npo
                date=date,
                text=text,
                publisher="DWDD")


def get_article_text(url):
    page = requests.get(url)
    page.raise_for_status()
    tree = html.fromstring(page.text)
    text = "\n\n".join(
        t.text_content()
        for t in tree.cssselect(".sc-1fnykkm-0.gxieGH,h1.di2x5p-0"))
    return text


conn = AmcatAPI("http://localhost:8000")

if len(sys.argv) > 1:
    end_cursor = sys.argv[1]
else:
    articles = get_first()
    amcat_articles = list(parse_articles(articles))
    conn.create_articles(1, 101, amcat_articles)
    end_cursor = amcat_articles[-1]['dwdd_id']

for i in count():
    # print(f"{i}: {end_cursor}")
    articles = get_articles(end_cursor)
    amcat_articles = list(parse_articles(articles))
    conn.create_articles(1, 101, amcat_articles)
    end_cursor = amcat_articles[-1]['dwdd_id']
Example #14
import argparse
import collections
import re

from amcatclient import AmcatAPI

# Connect to AmCAT
parser = argparse.ArgumentParser()
parser.add_argument('host', help='The AmCAT host to connect to, '
                    'e.g. http://amcat.vu.nl')
parser.add_argument('project', help='The project to count words in')
parser.add_argument('articleset', help='The article set to count words in')
parser.add_argument('--username', help='Username for AmCAT login')
parser.add_argument('--password', help='Password for AmCAT login')
args = parser.parse_args()

conn = AmcatAPI(args.host, args.username, args.password)

# Iterate over the articles, count all words
counts = collections.Counter()
for a in conn.list_articles(args.project, args.articleset):
    # get words by splitting lowercased text on non-word characters
    text = a['text'].lower()
    words = re.split(r"\W+", text)
    counts.update(words)

# delete all words with <= 3 characters
for word in list(counts.keys()):
    if len(word) <= 3:
        del counts[word]

# print most common words
Example #15
                        type=int)
    parser.add_argument("amcat_set",
                        help="Article Set ID in AmCAT server",
                        type=int)
    parser.add_argument("login", help="Login LN")
    parser.add_argument("password", help="Password LN")
    parser.add_argument("medium", help="Medium to get articles from")
    parser.add_argument("from_date", help="Date from which to get articles")
    parser.add_argument("to_date", help="Date to which to get articles")
    parser.add_argument("query", help="searchstring")
    parser.add_argument(
        "--geckodriver",
        help="Path of geckodriver executable (default=~/geckodriver)")

    args = parser.parse_args()

    from amcatclient import AmcatAPI
    conn = AmcatAPI(args.amcat_host)

    driver_path = args.geckodriver or str(Path.home() / "geckodriver")
    driver = webdriver.Firefox(executable_path=driver_path)

    from_date = datetime.strptime(args.from_date, "%Y-%m-%d")
    to_date = datetime.strptime(args.to_date, "%Y-%m-%d")

    login_nexis(driver, args.login, args.password)

    for page in scrape_nexis(driver, args.medium, from_date, to_date,
                             args.query):
        conn.create_articles(args.amcat_project, args.amcat_set, page)
Example #16
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

import argparse
from amcatclient import AmcatAPI

# Connect to AmCAT
parser = argparse.ArgumentParser()
parser.add_argument('host',
                    help='The AmCAT host to connect to, '
                    'e.g. http://amcat.vu.nl')
parser.add_argument('--username', help='Username for AmCAT login')
parser.add_argument('--password', help='Password for AmCAT login')
args = parser.parse_args()

conn = AmcatAPI(args.host, args.username, args.password)

articles = [{
    "headline": "test headline3",
    "medium": "test medium",
    "text": "test text",
    "date": "2013-01-01"
}, {
    "headline": "test headline4",
    "medium": "test medium",
    "text": "test text",
    "date": "2013-01-01"
}]

aset = conn.create_set(project=1, name="Testset", provenance="test data")
articles = conn.create_articles(project=1,
Example #17
File: nlpamcat.py Project: mcomsa/nlpipe
                        "-t",
                        help="Provide auth token"
                        "(default reads ./.nlpipe_token or NLPIPE_TOKEN")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='[%(asctime)s %(name)-12s %(levelname)-5s] %(message)s')
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("amcatclient").setLevel(logging.INFO)

    logging.debug(
        "Will {args.action} at nlpipe {args.nlpipeserver} all articles "
        "from {args.amcatserver} set {args.articleset}".format(**locals()))

    amcatserver = AmcatAPI(args.amcatserver)
    nlpipeserver = get_client(args.nlpipeserver, args.token)

    if args.action == "process":
        process(amcatserver, args.project, args.articleset, nlpipeserver,
                args.module, args.reset_error, args.reset_started)
    if args.action == "process_pipe":
        process_pipe(amcatserver, args.project, args.articleset, nlpipeserver,
                     args.module, "alpinonerc")
    if args.action == "status":
        status = get_status(amcatserver, args.project, args.articleset,
                            nlpipeserver, args.module)
        for k, v in Counter(status.values()).items():
            print("{k}: {v}".format(**locals()))
    if args.action == 'result':
        results = get_results(amcatserver,
Example #18
        for post in posts:
            # the commented-out block below is the same as the title,= (unpacking) notation
            #titles = post.cssselect("h2")
            #if len(titles) != 1:
            #    raise Exception("Boe")
            #title = titles[0]
            link, = post.cssselect("h2 > a")
            href = link.get("href")
            if not href.startswith("https://www.pvda.nl/nieuws/"):
                continue
            else:
                headline = link.text_content().strip()
                meta, = post.cssselect("span.meta")
                datestr = meta.text_content()
                m = re.match(r"(\d+ \w+ \d{4})", datestr.strip())
                if not m:
                    raise ValueError(f"Cannot prase date: {datestr}")
                datestr2 = m.group(1)
                date = datetime.strptime(datestr2, "%d %B %Y")
                yield date, headline, href


#a = scrape_pb("/actueel/nieuws/2019/03/04/reactie-minister-blok-op-het-terugroepen-van-de-nederlandse-ambassadeur-uit-iran")
#print(a)
#sys.exit()
from amcatclient import AmcatAPI
conn = AmcatAPI("https://amcat.nl")
for date, headline, href in get_links():
    a = scrape_pb(href, date, headline)
    conn.create_articles(2051, 80339, [a])
Example #19
    password = fbcredentials.password

from_date = datetime.strptime(args.fromdate,
                              "%Y-%m-%d") if args.fromdate else None
to_date = datetime.strptime(args.todate, "%Y-%m-%d") if args.todate else None

logging.info(f"Logging in to facebook as {username}")
scraper = FBPostScraper(username, password)
try:
    posts = scraper.get_posts(args.page,
                              max_scrolls=args.max_scrolls,
                              date_from=from_date,
                              date_to=to_date)

    if args.amcathost:
        conn = AmcatAPI(args.amcathost)
        buffer = []
        for p in posts:
            buffer.append(p)
            if len(buffer) >= 10:
                logging.info(
                    f"Saving {len(buffer)} articles to {args.amcathost} project {args.project} set {args.set}"
                )
                conn.create_articles(project=args.project,
                                     articleset=args.set,
                                     json_data=buffer)
                buffer = []
        if buffer:
            conn.create_articles(project=args.project,
                                 articleset=args.set,
                                 json_data=buffer)
Example #20
        "date": date3,
        "medium": "Persberichten",
        "url": url
    }


def get_links():
    for page in range(1, 2):
        url = URL_TEMPLATE.format(**locals())
        print(url)
        page = requests.get(url)
        tree = html.fromstring(page.text)
        links = list(tree.cssselect('h5.kamervraag-panel-title-col a'))
        for a in links:
            l = a.get("href")
            link = URL_ROOT + l
            print(link)
            #if not link.startswith("/actueel/"):
            #   raise ValueError("Not a persbericht? {link}".format(**locals()))
            yield link


#a = scrape_pb("/actueel/nieuws/2019/03/04/reactie-minister-blok-op-het-terugroepen-van-de-nederlandse-ambassadeur-uit-iran")
#print(a)
#sys.exit()
from amcatclient import AmcatAPI
conn = AmcatAPI("http://localhost:8000")
for link in get_links():
    a = scrape_pb(link)
    conn.create_articles(2, 42, [a])