Example 1
2nd arg           : method name
optional 3rd arg  : "array" to mark that the following arguments are to be taken as elements of an array
following args    : the method's arguments. To provide an array, write it as a quoted string after "array", for instance: « array "['test','test2']" » or « array "[['test1','test2'],['test3','test4']]" » (see the parsing sketch after this snippet).
Examples from HCI root:
./hyphe_backend/test_client.py get_status
./hyphe_backend/test_client.py declare_page http://medialab.sciences-po.fr
./hyphe_backend/test_client.py declare_pages array "['http://medialab.sciences-po.fr','http://www.sciences-po.fr']"
./hyphe_backend/test_client.py inline store.get_webentities
"""

from twisted.internet import reactor, defer
from txjsonrpc.web.jsonrpc import Proxy
import sys, re
from hyphe_backend.lib import config_hci

config = config_hci.load_config()
if not config:
    exit()

if len(sys.argv) == 1:
    print helpdoc
    exit()

if sys.argv[1] == "inline":
    inline = True
    startargs = 3
else:
    inline = False
    startargs = 2
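
The snippet stops before the remaining arguments are turned into a JSON-RPC call. A minimal sketch of that step under the convention documented above; the method/args names and the use of ast.literal_eval are illustrative rather than the script's actual code, and the core API URL is assumed to come from a core_api_port key in Hyphe's config.json:

from ast import literal_eval

method = sys.argv[startargs - 1]
args = sys.argv[startargs:]
if args and args[0] == "array":
    # Everything after the "array" marker is one quoted Python list
    args = [literal_eval(args[1])]

proxy = Proxy('http://127.0.0.1:%s/' % config['core_api_port'])
deferred = proxy.callRemote(method, *args)

def done(res):
    print repr(res)
    reactor.stop()

deferred.addBoth(done)
reactor.run()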

Example 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, re, types, time, hashlib
from twisted.web.client import getPage as getPageOrig
from twisted.internet.task import deferLater
from twisted.internet import reactor
from hyphe_backend.lib.config_hci import load_config, DEFAULT_CORPUS
config = load_config()
if not config:
    exit()


# Handle Twisted 16+ now refusing unicode urls
def getPage(url, *args, **kwargs):
    try:
        url = str(url)
    except UnicodeEncodeError:
        # Non-ASCII URL: leave it as unicode rather than crash here
        pass
    return getPageOrig(url, *args, **kwargs)
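
A short usage sketch of the wrapper (the URL is only an example): on Python 2 with Twisted 16+, the coercion to bytes lets a unicode URL through where the original getPage would refuse it.

import sys

d = getPage(u"http://medialab.sciences-po.fr")
d.addCallback(lambda body: sys.stdout.write(body[:100]))
d.addErrback(lambda failure: sys.stderr.write(str(failure)))
d.addBoth(lambda _: reactor.stop())
reactor.run()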


class Enum(set):
    def __getattr__(self, name):
        if name in self:
            return name
        raise AttributeError


crawling_statuses = Enum(
    ['UNCRAWLED', 'PENDING', 'RUNNING', 'FINISHED', 'CANCELED', 'RETRIED'])
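
The Enum helper is just a set that echoes a member name back as a string, so a status can be compared, printed, and stored without any extra mapping. For instance (TYPO is deliberately not a member):

assert crawling_statuses.PENDING == 'PENDING'
assert 'RUNNING' in crawling_statuses
try:
    crawling_statuses.TYPO
except AttributeError:
    pass  # unknown status names fail loudly instead of returning None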
Example 3
# Copy config.json from root to scrapy deployment dir
if verbose:
    print "Copying config.json from root directory to hyphe_backend/crawler for scrapy deployment..."
try:
    if not os.path.exists("config"):
        os.makedirs("config")
    copyfile("../../config/config.json", "config/config.json")
except IOError as e:
    print "Could not open either source or destination config.json file"
    print "config.json", "crawler/config.json"
    print e
    exit()

from hyphe_backend.lib import config_hci
config = config_hci.load_config()
if not config:
    exit()

# Get corpus project's config in DB to replace default global conf
from pymongo import Connection
corpus_conf = Connection(
    config["mongo-scrapy"]["host"],
    config["mongo-scrapy"]["mongo_port"]
)[config["mongo-scrapy"]["db_name"]]["corpus"].find_one({"_id": project})
if corpus_conf:
    corpus_conf = corpus_conf["options"]
    config["phantom"].update(corpus_conf["phantom"])
    if corpus_conf["proxy"]["host"]:
        config["mongo-scrapy"]["proxy_host"] = corpus_conf["proxy"]["host"]
    if corpus_conf["proxy"]["port"]:
        config["mongo-scrapy"]["proxy_port"] = corpus_conf["proxy"]["port"]
else:
    print "WARNING: trying to deploy a crawler for a corpus project missing in DB"
Example 4
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, re, types, time, hashlib
from twisted.internet.task import deferLater
from twisted.internet import reactor
from hyphe_backend.lib.config_hci import load_config, DEFAULT_CORPUS
config = load_config()
if not config:
    exit()


class Enum(set):
    def __getattr__(self, name):
        if name in self:
            return name
        raise AttributeError

crawling_statuses = Enum(
    ['UNCRAWLED', 'PENDING', 'RUNNING', 'FINISHED', 'CANCELED', 'RETRIED'])
indexing_statuses = Enum(
    ['UNINDEXED', 'PENDING', 'BATCH_RUNNING', 'BATCH_FINISHED',
     'BATCH_CRASHED', 'FINISHED', 'CANCELED'])

def now_ts():
    # Current timestamp in milliseconds
    return int(time.time()*1000)

def urls_match_domainlist(urls, domlist):
    # True if any url's host part ends with one of the given domains
    for url in urls:
        url = url.lower()
        # Keep scheme + host only: cut at the first "/" after "http(s)://"
        if url.find('/', 8) > -1:
            dom = url[:url.find('/', 8)]
        else:
            dom = url
        for d in domlist:
            if dom.endswith(d.lower()):
                return True
    return False
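
A quick usage sketch of the helper above (the URLs and domains are made up for illustration):

urls = ["http://medialab.sciences-po.fr/people", "https://example.org/about"]
print urls_match_domainlist(urls, ["sciences-po.fr"])  # True
print urls_match_domainlist(urls, ["unrelated.net"])   # False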