helpdoc = """
2nd arg: method name
optional 3rd arg: "array" to mark the following arguments as elements of an array
following args: the method's arguments

To provide an array, write it as a string after "array", for instance:
  array "['test','test2']"
or:
  array "[['test1','test2'],['test3','test4']]"

Examples from HCI root:
  ./hyphe_backend/test_client.py get_status
  ./hyphe_backend/test_client.py declare_page http://medialab.sciences-po.fr
  ./hyphe_backend/test_client.py declare_pages array "['http://medialab.sciences-po.fr','http://www.sciences-po.fr']"
  ./hyphe_backend/test_client.py inline store.get_webentities
"""

from twisted.internet import reactor, defer
from txjsonrpc.web.jsonrpc import Proxy
import sys, re
from hyphe_backend.lib import config_hci

config = config_hci.load_config()
if not config:
    exit()
if len(sys.argv) == 1:
    print helpdoc
    exit()

# "inline" as first argument shifts the method name and its arguments by one
if sys.argv[1] == "inline":
    inline = True
    startargs = 3
else:
    inline = False
    startargs = 2
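
# The excerpt of test_client.py stops before the call itself is built. Below is
# a minimal sketch of how the remaining argv handling could feed txjsonrpc's
# Proxy; the "array" parsing via ast.literal_eval and the endpoint URL are
# assumptions for illustration, not the original implementation.

import ast

method = sys.argv[startargs - 1]
args = []
i = startargs
while i < len(sys.argv):
    if sys.argv[i] == "array":
        # The next argv element is a Python-literal string such as "['a','b']"
        args.append(ast.literal_eval(sys.argv[i + 1]))
        i += 2
    else:
        args.append(sys.argv[i])
        i += 1

def print_result(result):
    print result
    reactor.stop()

proxy = Proxy("http://127.0.0.1:6978/")  # hypothetical core API endpoint
proxy.callRemote(method, *args).addBoth(print_result)
reactor.run()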
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, re, types, time, hashlib
from twisted.web.client import getPage as getPageOrig
from twisted.internet.task import deferLater
from twisted.internet import reactor
from hyphe_backend.lib.config_hci import load_config, DEFAULT_CORPUS

config = load_config()
if not config:
    exit()

# Handle Twisted 16+ now refusing unicode urls
def getPage(url, *args, **kwargs):
    try:
        url = str(url)
    except:
        pass
    return getPageOrig(url, *args, **kwargs)

class Enum(set):
    def __getattr__(self, name):
        if name in self:
            return name
        raise AttributeError

crawling_statuses = Enum(['UNCRAWLED', 'PENDING', 'RUNNING', 'FINISHED', 'CANCELED', 'RETRIED'])
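
# Quick illustration of the Enum-on-set pattern above (the demo lines are not
# part of the module): attribute access echoes the member name, so statuses
# read naturally in comparisons, and a typo fails fast with AttributeError
# instead of silently comparing against a misspelled string.

if __name__ == "__main__":
    assert crawling_statuses.PENDING == "PENDING"
    job = {"crawling_status": crawling_statuses.RUNNING}
    print job["crawling_status"] == crawling_statuses.RUNNING  # True
    try:
        crawling_statuses.RUNING
    except AttributeError:
        print "misspelled status caught at access time"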
# Copy config.json from root to scrapy deployment dir
if verbose:
    print "Copying config.json from root directory to hyphe_backend/crawler for scrapy deployment..."
try:
    if not os.path.exists("config"):
        os.makedirs("config")
    copyfile("../../config/config.json", "config/config.json")
except IOError as e:
    print "Could not open either source or destination config.json file"
    print "config.json", "crawler/config.json"
    print e
    exit()

from hyphe_backend.lib import config_hci
config = config_hci.load_config()
if not config:
    exit()

# Get corpus project's config in DB to replace default global conf
from pymongo import Connection
corpus_conf = Connection(config["mongo-scrapy"]["host"], config["mongo-scrapy"]["mongo_port"])[config["mongo-scrapy"]["db_name"]]["corpus"].find_one({"_id": project})
if corpus_conf:
    corpus_conf = corpus_conf["options"]
    config["phantom"].update(corpus_conf["phantom"])
    if corpus_conf["proxy"]["host"]:
        config["mongo-scrapy"]["proxy_host"] = corpus_conf["proxy"]["host"]
    if corpus_conf["proxy"]["port"]:
        config["mongo-scrapy"]["proxy_port"] = corpus_conf["proxy"]["port"]
else:
    print "WARNING: trying to deploy a crawler for a corpus project missing in DB"
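
# Standalone sketch of the override behaviour implemented above, with made-up
# values (the dict shapes mirror the code, the contents are assumptions):
# per-corpus "options" stored in the "corpus" Mongo collection take precedence
# over the global config.json defaults, but only for keys a corpus actually sets.

config_demo = {"phantom": {"timeout": 600, "ajax_timeout": 15},
               "mongo-scrapy": {"proxy_host": "", "proxy_port": 0}}
corpus_options = {"phantom": {"timeout": 1800},  # corpus-level override
                  "proxy": {"host": "proxy.example.org", "port": 3128}}

config_demo["phantom"].update(corpus_options["phantom"])
if corpus_options["proxy"]["host"]:
    config_demo["mongo-scrapy"]["proxy_host"] = corpus_options["proxy"]["host"]
if corpus_options["proxy"]["port"]:
    config_demo["mongo-scrapy"]["proxy_port"] = corpus_options["proxy"]["port"]

print config_demo["phantom"]       # {'timeout': 1800, 'ajax_timeout': 15}
print config_demo["mongo-scrapy"]  # proxy fields now point at the corpus proxy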
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, re, types, time, hashlib
from twisted.internet.task import deferLater
from twisted.internet import reactor
from hyphe_backend.lib.config_hci import load_config, DEFAULT_CORPUS

config = load_config()
if not config:
    exit()

class Enum(set):
    def __getattr__(self, name):
        if name in self:
            return name
        raise AttributeError

crawling_statuses = Enum(['UNCRAWLED', 'PENDING', 'RUNNING', 'FINISHED', 'CANCELED', 'RETRIED'])
indexing_statuses = Enum(['UNINDEXED', 'PENDING', 'BATCH_RUNNING', 'BATCH_FINISHED', 'BATCH_CRASHED', 'FINISHED', 'CANCELED'])

# Current timestamp in milliseconds
def now_ts():
    return int(time.time()*1000)

def urls_match_domainlist(urls, domlist):
    for url in urls:
        url = url.lower()
        # Keep only scheme + domain: the first "/" after position 8 skips "http(s)://"
        if url.find('/', 8) > -1:
            dom = url[:url.find('/', 8)]
        else:
            dom = url
        for d in domlist:
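
# The excerpt above cuts off inside urls_match_domainlist. A self-contained
# sketch of the matching step it appears to be building toward (the suffix
# comparison and the function's True/False return are assumptions, not the
# original code):

def urls_match_domainlist_sketch(urls, domlist):
    for url in urls:
        url = url.lower()
        dom = url[:url.find('/', 8)] if url.find('/', 8) > -1 else url
        for d in domlist:
            if dom.endswith(d.lower()):
                return True
    return False

print urls_match_domainlist_sketch(
    ["http://medialab.sciences-po.fr/projets"], ["sciences-po.fr"])  # True
print urls_match_domainlist_sketch(
    ["http://example.com"], ["sciences-po.fr"])                      # False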