def save_WEs_query(self, corpus, ids, query_options):
    res = yield self.queries(corpus).insert_one({
      "webentities": ids,
      "total": len(ids),
      "query": query_options
    })
    returnD(str(res.inserted_id))

def save_WEs_query(self, corpus, ids, query_options):
    res = yield self.queries(corpus).insert({
      "webentities": ids,
      "total": len(ids),
      "query": query_options
    }, safe=True)
    returnD(str(res))

def SingleMongo(coll, method, *args, **kwargs):
    conn = MongoConnection(MONGODB['HOST'], MONGODB['PORT'])
    db = conn[MONGODB['DATABASE']]
    yield db.authenticate(MONGODB['USER'], MONGODB['PSWD'])
    res = yield getattr(db[coll], method)(*args, **kwargs)
    conn.disconnect()
    returnD(res)

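# A minimal usage sketch for SingleMongo (illustrative, not from the source):
# it is meant to be yielded from a Twisted inlineCallbacks coroutine like the
# other functions here; the collection name and query below are assumptions,
# and returnD is assumed to alias twisted.internet.defer.returnValue.
from twisted.internet.defer import inlineCallbacks, returnValue as returnD

@inlineCallbacks
def example_list_followers():
    # 'followers' collection and 'follows_me' field are hypothetical
    res = yield SingleMongo('followers', 'find', {"follows_me": True})
    returnD([f.get("screen_name") for f in res])
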
def flush_tweets(self):
    if self.depiler_running or not self.pile:
        returnD(None)
    self.depiler_running = True
    todo = []
    while self.pile and len(todo) < 35:
        todo.append(self.pile.pop())
    if len(self.pile) > 1500:
        self.fact.ircclient._show_error(failure.Failure(Exception(
            "Warning, stream on %s has %d tweets late to display. Dumping the data to the trash now... You should still use %sfuckoff and %sunfollow to clean the guilty query."
            % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))),
            self.fact.channel, admins=True)
        del self.pile[:]
    elif len(self.pile) > 500:
        self.fact.ircclient._show_error(failure.Failure(Exception(
            "Warning, stream on %s has %d tweets late to display. You should use %sfuckoff and %sunfollow the guilty query or at least restart."
            % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))),
            self.fact.channel, admins=True)
    if config.DEBUG:
        self.log("Flush %s tweets%s." % (len(todo), " (%s left to do)" % len(self.pile) if len(self.pile) else ""), hint=True)
    yield self.process_twitter_feed(todo, "stream")
    self.depiler_running = False
    returnD(True)

def get_queue(self, corpus, specs={}, **kwargs):
    if "sort" not in kwargs:
        kwargs["sort"] = sortasc('timestamp')
    res = yield self.queue(corpus).find(specs, **kwargs)
    if res and "limit" in kwargs and kwargs["limit"] == 1:
        res = res[0]
    returnD(res)

def list_jobs(self, corpus, specs={}, **kwargs):
    if "sort" not in kwargs:
        kwargs["sort"] = sortasc("crawling_status") + sortasc("indexing_status") + sortasc("created_at")
    jobs = yield self.jobs(corpus).find(specs, **kwargs)
    if jobs and "limit" in kwargs and kwargs["limit"] == 1:
        jobs = jobs[0]
    returnD(jobs)

def start_stream(self, conf):
    if not self.fact.__init_timeout__():
        returnD(False)
    queries = yield self.fact.db['feeds'].find({'database': 'tweets', 'channel': self.fact.channel}, fields=['query'])
    track = []
    skip = []
    k = 0
    for query in queries:
        q = str(query['query'].encode('utf-8')).lower()
        # Queries starting with @ should return only tweets from the corresponding
        # user; the stream does not know how to handle this, so skip them.
        if self.re_twitter_account.match(q):
            continue
        elif " OR " in q or " -" in q or '"' in q or len(q) > 60 or len(q) < 6:
            skip.append(q)
            continue
        track.append(q)
        k += 1
        if k > 395:
            break
    if self.fact.twuser not in track:
        track.append(self.fact.twuser)
    if len(skip):
        self.log("Skipping unprocessable queries for streaming: « %s »" % " » | « ".join(skip), hint=True)
    self.log("Start search streaming for: « %s »" % " » | « ".join(track), hint=True)
    conn = Microblog("twitter", conf, bearer_token=self.fact.twitter_token)
    # Try to find users matching the queries, to follow them via the stream
    users, self.fact.ircclient.twitter['users'] = conn.lookup_users(track, self.fact.ircclient.twitter['users'])
    deferToThreadPool(reactor, self.threadpool, self.follow_stream, conf, users.values(), track)
    self.depiler = LoopingCall(self.flush_tweets)
    self.depiler.start(1)
    returnD(True)

def depile(self):
    if self.queue is None:
        yield self.init_queue()
    if not len(self.queue):
        returnD(None)
    status = yield self.get_scrapyd_status()
    if status["pending"] > 0:
        returnD(None)
    # Add some random wait to allow a possible concurrent Hyphe instance
    # to compete for ScrapyD's empty slots
    yield deferredSleep(1./randint(4, 20))
    # Order jobs by corpus with fewer currently running crawls, then by age
    ordered = sorted(self.queue.items(), key=lambda x:
        float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
    job_id, job = ordered[0]
    res = yield self.send_scrapy_query('schedule', job["crawl_arguments"])
    ts = now_ts()
    if is_error(res):
        logger.msg("WARNING: error sending job %s to ScrapyD: %s" % (job, res))
        self.queue[job_id]['timestamp'] = ts  # let it retry a bit later
    else:
        yield self.db.update_job(job["corpus"], job_id, res['jobid'], ts)
        yield self.db.add_log(job["corpus"], job_id, "CRAWL_SCHEDULED", ts)
        del self.queue[job_id]

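# A small self-contained sketch of the composite sort key used in depile():
# the number of running crawls for a job's corpus becomes the integer part
# and the job's timestamp the fractional part, so jobs from less busy corpora
# always win, with age breaking ties. Values are illustrative only; note the
# trick relies on timestamps having a constant number of digits.
status = {"corpusA": 2, "corpusB": 0}
queue = {
    "job1": {"corpus": "corpusA", "timestamp": 1400000000},
    "job2": {"corpus": "corpusB", "timestamp": 1400000500},
}
ordered = sorted(queue.items(), key=lambda x:
    float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
assert ordered[0][0] == "job2"  # corpusB has no running crawl, so it goes first
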
def __run__(self, coll, method, *args, **kwargs):
    attempts_left = self.retries
    result = []
    lasttry = False
    if 'lasttry' in kwargs:
        lasttry = True
        del kwargs['lasttry']
    while True:
        try:
            self.coll = coll
            self.method = method
            if not self.conn and not self.db:
                status = "Connec"
                self.conn = yield MongoConnection(MONGODB['HOST'], MONGODB['PORT'], reconnect=False)
                self.db = self.conn[MONGODB['DATABASE']]
                status = "Authentica"
                yield self.db.authenticate(MONGODB['USER'], MONGODB['PSWD'])
            status = "Communica"
            result = yield getattr(self.db[coll], method)(*args, **kwargs)
        except Exception as e:
            if not lasttry:
                if attempts_left > 0:
                    attempts_left -= 1
                    if DEBUG:
                        self.logerr("%sting" % status, "Retry #%d" % (self.retries - attempts_left))
                    yield self.close(silent=True)
                    continue
                if DEBUG:
                    self.logerr("%sting" % status, "HARD RETRY %s %s" % (type(e), str(e)))
                result = yield Mongo(coll, method, *args, lasttry=True, **kwargs)
        yield self.close()
        returnD(result)

def stop_corpus(self, name, quiet=False):
    if self.stopped_corpus(name):
        if config["DEBUG"]:
            self.log(name, "Traph already stopped", quiet=quiet)
        returnD(False)
    if name in self.corpora:
        yield self.corpora[name].stop()
    returnD(True)

def get_WEs(self, corpus, query=None):
    if not query:
        res = yield self.WEs(corpus).find()
    else:
        if isinstance(query, list) and isinstance(query[0], int):
            query = {"_id": {"$in": query}}
        res = yield self.WEs(corpus).find(query)
    returnD(res)

def get_WEs(self, corpus, query=None, **kwargs):
    if not query:
        res = yield self.WEs(corpus).find({}, **kwargs)
    else:
        if isinstance(query, list) and isinstance(query[0], int):
            query = {"_id": {"$in": query}}
        res = yield self.WEs(corpus).find(query, **kwargs)
    returnD(res)

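# Usage sketch for the keyword-argument variant above (corpus name and ids
# illustrative): a plain list of integer ids is rewritten into a Mongo "$in"
# query, and any extra kwargs are forwarded to find() untouched.
# WEs = yield self.get_WEs("mycorpus", [1, 2, 3], limit=10)
# is equivalent to: self.WEs("mycorpus").find({"_id": {"$in": [1, 2, 3]}}, limit=10)
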
def run_twitter_search(self):
    if not self.__init_timeout__():
        returnD(False)
    queries = yield self.db['feeds'].find({'database': 'tweets', 'channel': self.channel})
    randorder = range(len(queries))
    shuffle(randorder)
    urls = yield getFeeds(self.db, self.channel, 'tweets', randorder=randorder)
    yield self.protocol.start_twitter_search(urls, randorder=randorder)
    self.status = "stopped"

def list_logs(self, corpus, job, **kwargs):
    if "sort" not in kwargs:
        kwargs["sort"] = sortasc('timestamp')
    if "projection" not in kwargs:
        kwargs["projection"] = ['timestamp', 'log']
    if isinstance(job, list):
        job = {"$in": job}
    res = yield self.logs(corpus).find({"_job": job}, **kwargs)
    returnD(res)

def count_pages(self, corpus, job, **kwargs):
    tot = yield self.pages(corpus).count({
      "_job": job,
      "forgotten": {"$ne": True}
    }, **kwargs)
    returnD(tot)

def list_logs(self, corpus, job, **kwargs):
    if "filter" not in kwargs:
        kwargs["filter"] = sortasc('timestamp')
    if "fields" not in kwargs:
        kwargs["fields"] = ['timestamp', 'log']
    kwargs["safe"] = True
    if isinstance(job, list):
        job = {"$in": job}
    res = yield self.logs(corpus).find({"_job": job}, **kwargs)
    returnD(res)

def add_job(self, args, corpus, webentity_id):
    ts = now_ts()
    job_id = yield self.db.add_job(corpus, webentity_id, args, ts)
    self.queue[job_id] = {
      "corpus": corpus,
      "timestamp": ts,
      "crawl_arguments": args
    }
    yield self.db.add_log(corpus, job_id, "CRAWL_ADDED", ts)
    returnD(job_id)

def list_jobs(self, corpus, *args, **kwargs): if "filter" not in kwargs: kwargs["filter"] = sortasc("crawling_status") + sortasc("indexing_status") + sortasc("created_at") jobs = yield self.jobs(corpus).find(*args, **kwargs) for j in jobs: if "created_at" not in j and "timestamp" in j: j["created_at"] = j["timestamp"] for k in ['start', 'crawl', 'finish']: j["%sed_at" % k] = None if jobs and "limit" in kwargs and kwargs["limit"] == 1: jobs = jobs[0] returnD(jobs)
def find_last_followers(user):
    res = yield SingleMongo(db_foll_coll(user), 'find', {
      "screen_name": {"$exists": True},
      "follows_me": True,
      "last_update": {"$gte": time.time() - 12 * 3600}
    })
    returnD(res)

def run_rss_feeds(self):
    if not self.__init_timeout__():
        returnD(False)
    urls = self.feeds
    if not urls:
        urls = yield getFeeds(self.db, self.channel, self.name, add_url=self.tweets_search_page)
    for url in urls:
        yield deferredSleep(3 + int(random()*500)/100)
        self.update_timeout(extra=10)
        yield self.protocol.start(url)
    self.status = "stopped"

def list_jobs(self, corpus, *args, **kwargs): kwargs["safe"] = True if "filter" not in kwargs: kwargs["filter"] = sortasc("crawling_status") + sortasc("indexing_status") + sortasc("created_at") jobs = yield self.jobs(corpus).find(*args, **kwargs) for j in jobs: if "created_at" not in j and "timestamp" in j: j["created_at"] = j["timestamp"] for k in ['start', 'crawl', 'finish']: j["%sed_at" % k] = None if jobs and "limit" in kwargs and kwargs["limit"] == 1: jobs = jobs[0] returnD(jobs)
def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
    if not listtweets:
        returnD(False)
    if query:
        if not isinstance(listtweets, dict):
            returnD(False)
        nexturl = ""
        if 'max_id_str' in listtweets['search_metadata']:
            nexturl = listtweets['search_metadata']['max_id_str']
        elif 'next_results' in listtweets['search_metadata']:
            nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
        res = {'nexturl': nexturl}
        listtweets = listtweets['statuses']
    elif not isinstance(listtweets, list):
        returnD(False)
    feed = []
    for tweet in listtweets:
        if not isinstance(tweet, dict):
            continue
        tw = {'created_at': tweet['created_at'], 'title': unescape_html(tweet['text']), 'link': tweet['url']}
        tw = grab_extra_meta(tweet, tw)
        feed.append(tw)
    if query:
        res['tweets'] = feed
        processed = yield self.process_tweets(res, 'search', query=query, pagecount=pagecount)
    else:
        processed = yield self.process_tweets(feed, 'my%s' % feedtype)
    returnD(processed)

def collect_tlds():
    tree = {}
    double_list = {"rules": [], "exceptions": []}
    tldlist = yield getPage(MOZ_TLD_LIST)
    for line in tldlist.split("\n"):
        line = line.strip()
        if not line or line.startswith("//"):
            continue
        chunks = line.decode('utf-8').split('.')
        add_tld_chunks_to_tree(chunks, tree)
        if line[0] == '!':
            double_list["exceptions"].append(line[1:])
        else:
            double_list["rules"].append(line)
    returnD((double_list, tree))

def search_twitter(self, data, query, max_id=None, page=0, randorder=None):
    if page and randorder:
        try:
            query = yield getFeeds(self.fact.db, self.fact.channel, "tweets", randorder=randorder)
            query = query[page]
        except Exception:
            returnD(False)
    if config.DEBUG:
        text = unquote(query)
        if max_id:
            text = "%s before id %s" % (text, max_id.encode('utf-8'))
        self.log("Query Twitter search for %s" % text)
    conn = Microblog('twitter', chanconf(self.fact.channel), bearer_token=self.fact.twitter_token)
    res = conn.search(query, max_id=max_id)
    returnD(res)

def collect_tlds():
    tree = {}
    try:
        tldlist = yield getPage(MOZ_TLD_LIST)
    except:
        # Fallback to the local copy of the list
        from os.path import join, realpath, dirname
        with open(join(dirname(realpath(__file__)), "tld_list.txt")) as f:
            tldlist = f.read()
    for line in tldlist.split("\n"):
        line = line.strip()
        if not line or line.startswith("//"):
            continue
        chunks = line.decode('utf-8').split('.')
        add_tld_chunks_to_tree(chunks, tree)
    returnD(tree)

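# add_tld_chunks_to_tree is not defined in this section; a hypothetical
# minimal sketch consistent with how it is called above (chunks come from a
# suffix such as u"co.uk" split on dots) could nest chunks right-to-left:
def add_tld_chunks_to_tree(chunks, tree):
    # e.g. chunks = [u"co", u"uk"] yields tree[u"uk"][u"co"] = {}
    for chunk in reversed(chunks):
        tree = tree.setdefault(chunk, {})
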
def stop(self, now=False):
    if self.monitor.running:
        self.monitor.stop()
    if self.stopping():
        returnD(None)
    self.status = "error" if self.error else "stopping"
    while not now and self.call_running:
        yield deferredSleep(0.1)
    if self.transport:
        self.protocol.stop()
        self.transport = None
    self.log("Traph stopped")
    if not self.error:
        self.status = "stopped"
    self.checkAndRemovePID()

def run_web_feeds(self):
    if not self.__init_timeout__():
        returnD(False)
    urls = self.feeds
    if not urls:
        urls = yield getFeeds(self.db, self.channel, self.name, add_url=self.tweets_search_page)
    for url in urls:
        name = None
        if self.name == "pages":
            url, name = url
        yield deferredSleep(3 + int(random()*500)/100)
        self.update_timeout(extra=10)
        yield self.protocol.start_web(url, name=name)
    self.status = "stopped"

def stop(self, now=False):
    if self.monitor.running:
        self.monitor.stop()
    if self.stopping():
        returnD(None)
    self.status = "error" if self.error else "stopping"
    while not now and self.call_running:
        yield deferredSleep(0.1)
    if self.transport:
        self.protocol.stop()
        self.transport = None
    self.log("Traph stopped")
    if not self.error:
        self.status = "stopped"
    else:
        self.checkAndRemovePID()

def flush_tweets(self):
    if self.depiler_running or not self.pile:
        returnD(None)
    self.depiler_running = True
    todo = []
    while self.pile and len(todo) < 35:
        todo.append(self.pile.pop())
    if len(self.pile) > 1000:
        self.fact.ircclient._show_error(failure.Failure(Exception(
            "Warning, stream on %s has %d tweets late to display. Dumping the data to the trash now... You should still use %sfuckoff and %sunfollow to clean the guilty query."
            % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))),
            self.fact.channel, admins=True)
        del self.pile[:]
    elif len(self.pile) > 300:
        self.fact.ircclient._show_error(failure.Failure(Exception(
            "Warning, stream on %s has %d tweets late to display. You should use %sfuckoff and %sunfollow the guilty query or at least restart."
            % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))),
            self.fact.channel, admins=True)
    if config.DEBUG:
        self.log("Flush %s tweets%s." % (len(todo), " (%s left to do)" % len(self.pile) if len(self.pile) else ""), hint=True)
    yield self.process_twitter_feed(todo, "stream")
    self.depiler_running = False
    returnD(True)

def start_twitter(self, name, conf, user):
    if not self.fact.__init_timeout__():
        returnD(False)
    d = succeed(Microblog('twitter', conf, bearer_token=self.fact.twitter_token))
    if config.DEBUG:
        self.log("Query @%s's %s" % (user, name))
    def passs(*args, **kwargs):
        raise Exception("No process existing for %s" % name)
    source = getattr(Microblog, 'get_%s' % name, passs)
    processor = getattr(self, 'process_%s' % name, passs)
    d.addCallback(source, retweets_processed=self.fact.retweets_processed, bearer_token=self.fact.twitter_token)
    d.addErrback(self._handle_error, "downloading %s for" % name, user)
    d.addCallback(check_twitter_results)
    d.addErrback(self._handle_error, "examining %s for" % name, user)
    d.addCallback(processor, user.lower())
    d.addErrback(self._handle_error, "working on %s for" % name, user)
    d.addCallback(self.end_twitter)
    return d

def process_elements(self, feed, url):
    if not feed or not feed.entries:
        returnD(False)
    sourcename = url
    if feed.feed and 'title' in feed.feed:
        sourcename = feed.feed['title']
        sourcename = unescape_html(sourcename)
    ids = []
    news = []
    links = []
    for i in feed.entries:
        date = i.get('published_parsed', i.get('updated_parsed', ''))
        if date:
            date = datetime.fromtimestamp(time.mktime(date))
            if datetime.today() - date > timedelta(hours=config.BACK_HOURS+6):
                break
        link, self.fact.cache_urls = yield clean_redir_urls(i.get('link', ''), self.fact.cache_urls)
        if not link.startswith('http'):
            link = "%s/%s" % (url[:url.find('/', 8)], link.lstrip('/'))
        if link in links:
            continue
        links.append(link)
        title = i.get('title', '').replace('\n', ' ')
        try:
            title = unescape_html(title)
        except:
            pass
        _id = md5(("%s:%s:%s" % (self.fact.channel, link, title.lower())).encode('utf-8')).hexdigest()
        ids.append(_id)
        news.append({'_id': _id, 'channel': self.fact.channel, 'message': title, 'link': link, 'date': date, 'timestamp': datetime.today(), 'source': url, 'sourcename': sourcename})
    existings = yield self.fact.db['news'].find({'channel': self.fact.channel, '_id': {'$in': ids}}, fields=['_id'], filter=sortdesc('_id'))
    existing = [n['_id'] for n in existings]
    new = [n for n in news if n['_id'] not in existing]
    if new:
        new.reverse()
        new = new[:5]
        try:
            yield self.fact.db['news'].insert(new, safe=True)
        except Exception as e:
            self._handle_error(e, "recording news batch", url)
        self.fact.ircclient._send_message([(True, "[%s] %s" % (n['sourcename'].encode('utf-8'), self.format_tweet(n))) for n in new], self.fact.channel)
    returnD(True)

def get_scrapyd_status(self): url = "%sjobs" % self.scrapyd jobs = yield getPage(url) status = {"pending": 0} read = None for line in jobs.split("><tr"): if ">Pending<" in line: read = "pending" elif ">Running<" in line: read = "running" elif ">Finished<" in line: read = None elif read == "running": corpus = line[line.find(".") + 1 : line.find("<", 2)] if corpus not in status: status[corpus] = 0 status[corpus] += 1 elif read: status[read] += 1 returnD(status)
def add_job(self, corpus, webentity_id, args, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    _id = str(uuid())
    yield self.jobs(corpus).insert({
      "_id": _id,
      "crawljob_id": None,
      "webentity_id": webentity_id,
      "nb_crawled_pages": 0,
      "nb_pages": 0,
      "nb_links": 0,
      "crawl_arguments": args,
      "crawling_status": crawling_statuses.PENDING,
      "indexing_status": indexing_statuses.PENDING,
      "created_at": timestamp,
      "scheduled_at": None,
      "started_at": None,
      "crawled_at": None,
      "finished_at": None
    }, safe=True)
    returnD(_id)

def get_scrapyd_status(self):
    url = "%sjobs" % self.scrapyd
    try:
        jobs = yield getPage(url)
    except TimeoutError:
        logger.msg("WARNING: ScrapyD's monitoring website does not seem to be answering")
        returnD(None)
    except Exception as e:
        logger.msg("WARNING: ScrapyD's monitoring website seems down: %s %s" % (type(e), e))
        returnD(None)
    status = {"pending": 0, "running": 0}
    read = None
    for line in jobs.split("><tr"):
        if ">Pending<" in line:
            read = "pending"
        elif ">Running<" in line:
            read = "running"
        elif ">Finished<" in line:
            read = None
        elif read == "running":
            pattern = ">" + self.db_name + "_"
            if pattern not in line:
                continue
            corpus = line.split(pattern)[1].split("</td>")[0]
            if corpus not in status:
                status[corpus] = 0
            status[corpus] += 1
            status[read] += 1
        elif read:
            status[read] += 1
    returnD(status)

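# For reference, the dict returned by the variant above holds the global
# "pending"/"running" counters plus one counter per corpus with running
# crawls, e.g. (values illustrative only):
# status = yield self.get_scrapyd_status()
# → {"pending": 0, "running": 3, "mycorpus": 2, "othercorpus": 1}
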
def process_dms(self, listdms, user):
    if not listdms:
        returnD(False)
    ids = []
    dms = []
    if not isinstance(listdms, list):
        self.log("downloading DMs: %s" % listdms, error=True)
        returnD(False)
    for i in listdms:
        try:
            date = datetime.fromtimestamp(time.mktime(time.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y')) + 2*60*60)
            if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
                break
        except:
            self.log("processing DM %s: %s" % (i, listdms), error=True)
            continue
        tid = long(i.get('id', ''))
        if tid:
            ids.append(tid)
            sender = i.get('sender_screen_name', '')
            dm, self.fact.cache_urls = yield clean_redir_urls(i.get('text', '').replace('\n', ' '), self.fact.cache_urls)
            dms.append({'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel, 'id': tid, 'user': user, 'sender': sender.lower(), 'screenname': sender, 'message': dm, 'date': date, 'timestamp': datetime.today()})
    existings = yield self.fact.db['dms'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], filter=sortdesc('id'))
    existing = [t['_id'] for t in existings]
    news = [t for t in dms if t['_id'] not in existing]
    if news:
        news.reverse()
        yield self.fact.db['dms'].insert(news, safe=True)
        self.fact.ircclient._send_message([(True, "[DM] @%s: %s — https://twitter.com/%s" % (n['screenname'].encode('utf-8'), n['message'].encode('utf-8'), n['screenname'].encode('utf-8'))) for n in news], self.fact.channel)
    returnD(True)

def process_elements(self, data, url, name=None):
    if not data:
        returnD(False)
    if self.fact.name == "pages":
        differ = WebMonitor(name, url, self.fact.channel)
        info = yield differ.check_new(data)
        if info:
            self.fact.ircclient._send_message(info, self.fact.channel)
        returnD(True)
    if not data.entries:
        returnD(False)
    sourcename = url
    if data.feed and 'title' in data.feed:
        sourcename = data.feed['title']
        sourcename = unescape_html(sourcename)
    ids = []
    news = []
    links = []
    for i in data.entries:
        date = i.get('published_parsed', i.get('updated_parsed', ''))
        if date:
            date = datetime.fromtimestamp(time.mktime(date))
            if datetime.today() - date > timedelta(hours=config.BACK_HOURS+6):
                break
        link, self.fact.cache_urls = yield clean_redir_urls(i.get('link', ''), self.fact.cache_urls)
        if not link.startswith('http'):
            link = "%s/%s" % (url[:url.find('/', 8)], link.lstrip('/'))
        if link in links:
            continue
        links.append(link)
        title = i.get('title', '').replace('\n', ' ')
        try:
            title = unescape_html(title)
        except:
            pass
        _id = md5(("%s:%s:%s" % (self.fact.channel, link, title.lower())).encode('utf-8')).hexdigest()
        ids.append(_id)
        news.append({'_id': _id, 'channel': self.fact.channel, 'message': title, 'link': link, 'date': date, 'timestamp': datetime.today(), 'source': url, 'sourcename': sourcename})
    existings = yield self.fact.db['news'].find({'channel': self.fact.channel, '_id': {'$in': ids}}, fields=['_id'], filter=sortdesc('_id'))
    existing = [n['_id'] for n in existings]
    new = [n for n in news if n['_id'] not in existing]
    if new:
        new.reverse()
        new = new[:5]
        try:
            yield self.fact.db['news'].insert(new, safe=True)
        except Exception as e:
            self._handle_error(e, "recording news batch", url)
        self.fact.ircclient._send_message([(True, "[%s] %s" % (n['sourcename'].encode('utf-8'), format_tweet(n))) for n in new], self.fact.channel)
    returnD(True)

def process_stats(self, res, user):
    if not res:
        returnD(False)
    stats, last, timestamp = res
    if not stats:
        returnD(False)
    if not last:
        last = {'tweets': 0, 'followers': 0}
        since = timestamp - timedelta(hours=1)
    else:
        since = last['timestamp']
    if 'lists' not in last:
        last['lists'] = 0
    re_match_rts = re.compile(u'(([MLR]T|%s|♺)\s*)+@?%s' % (QUOTE_CHARS, user), re.I)
    rts = yield Mongo('tweets', 'find', {'channel': self.fact.channel, 'message': re_match_rts, 'timestamp': {'$gte': since}}, fields=['_id'])
    nb_rts = len(rts)
    if config.TWITTER_API_VERSION == 1:
        stat = {'user': user, 'timestamp': timestamp,
                'tweets': stats.get('updates', last['tweets']),
                'followers': stats.get('followers', last['followers']),
                'rts_last_hour': nb_rts}
    else:
        stat = {'user': user, 'timestamp': timestamp,
                'tweets': stats.get('statuses_count', last['tweets']),
                'followers': stats.get('followers_count', last['followers']),
                'rts_last_hour': nb_rts,
                'lists': stats.get('listed_count', last['lists'])}
    yield Mongo('stats', 'insert', stat)
    weekday = timestamp.weekday()
    laststats = Stats(user)
    if chan_displays_stats(self.fact.channel) and ((timestamp.hour == 13 and weekday < 5) or timestamp.hour == 18):
        stats = yield laststats.print_last()
        self.fact.ircclient._send_message(stats, self.fact.channel)
    last_tweet = yield Mongo('tweets', 'find', {'channel': self.fact.channel, 'user': user}, fields=['date'], limit=1, filter=sortdesc('timestamp'))
    if chan_displays_stats(self.fact.channel) and last_tweet and timestamp - last_tweet[0]['date'] > timedelta(days=3) and (timestamp.hour == 11 or timestamp.hour == 17) and weekday < 5:
        reactor.callFromThread(reactor.callLater, 3, self.fact.ircclient._send_message,
                               "[FYI] No tweet has been sent for %s days." % (timestamp - last_tweet[0]['date']).days,
                               self.fact.channel)
    reactor.callFromThread(reactor.callLater, 1, laststats.dump_data)
    returnD(True)

def send_scrapy_query(self, action, arguments=None):
    url = "%s%s.json" % (self.scrapyd, action)
    method = "POST"
    headers = None
    if action.startswith('list'):
        method = "GET"
        if arguments:
            args = [str(k) + '=' + str(v) for (k, v) in arguments.iteritems()]
            url += '?' + '&'.join(args)
            arguments = None
    elif arguments:
        arguments = urlencode(arguments)
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    try:
        res = yield getPage(url, method=method, postdata=arguments,
                            headers=headers, timeout=30)
        result = loadjson(res)
        returnD(result)
    except ConnectionRefusedError:
        returnD(format_error("Could not contact scrapyd server, "
                             "maybe it's not started..."))
    except Exception as e:
        returnD(format_error(e))

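# Usage sketch (the 'schedule' call mirrors how depile() uses this method
# above; 'listjobs' with a project argument follows the standard ScrapyD
# JSON API, and 'myproject' is an illustrative value):
# running = yield self.send_scrapy_query('listjobs', {'project': 'myproject'})
# res = yield self.send_scrapy_query('schedule', job["crawl_arguments"])
# is_error(res) can then be checked before reading res['jobid'].
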
def get_scrapyd_status(self): url = "%sjobs" % self.scrapyd try: jobs = yield getPage(url) except TimeoutError: logger.msg("WARNING: ScrapyD's monitoring website seems like not answering") returnD(None) except Exception as e: logger.msg("WARNING: ScrapyD's monitoring website seems down: %s %s" % (type(e), e)) returnD(None) status = {"pending": 0} read = None for line in jobs.split("><tr"): if ">Pending<" in line: read = "pending" elif ">Running<" in line: read = "running" elif ">Finished<" in line: read = None elif read == "running": corpus = line[line.find(".") + 1 : line.find("<", 2)] if corpus not in status: status[corpus] = 0 status[corpus] += 1 elif read: status[read] += 1 returnD(status)
def add_job(self, corpus, webentity_id, args, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    _id = str(uuid())
    yield self.jobs(corpus).insert_one({
      "_id": _id,
      "crawljob_id": None,
      "webentity_id": webentity_id,
      "nb_crawled_pages": 0,
      "nb_unindexed_pages": 0,
      "nb_pages": 0,
      "nb_links": 0,
      "crawl_arguments": args,
      "crawling_status": crawling_statuses.PENDING,
      "indexing_status": indexing_statuses.PENDING,
      "created_at": timestamp,
      "scheduled_at": None,
      "started_at": None,
      "crawled_at": None,
      "finished_at": None
    })
    returnD(_id)

def process_dms(self, listdms, user):
    if not listdms:
        returnD(False)
    ids = []
    dms = []
    try:
        listdms = listdms["events"]
        assert(isinstance(listdms, list))
    except:
        self.log("downloading DMs: %s" % listdms, error=True)
        returnD(False)
    for i in listdms:
        try:
            date = parse_timestamp(i.get('created_timestamp', ''))
            if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
                break
        except Exception as e:
            self.log("processing DM %s: %s %s" % (i.get('created_timestamp'), type(e), e), error=True)
            continue
        tid = long(i.get('id', ''))
        msg = i.get('message_create', {})
        if tid and msg:
            ids.append(tid)
            sender = msg.get('sender_id', '')
            target = msg.get('target', {}).get('recipient_id', '')
            dm, self.fact.cache_urls = yield clean_redir_urls(msg.get('message_data', {}).get('text', '').replace('\n', ' '), self.fact.cache_urls)
            dms.append({'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel, 'id': tid, 'user': user, 'sender_id': sender, 'target_id': target, 'message': dm, 'date': date, 'timestamp': datetime.today()})
    existings = yield self.fact.db['dms'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], filter=sortdesc('id'))
    existing = [t['_id'] for t in existings]
    news = [t for t in dms if t['_id'] not in existing]
    if news:
        news.reverse()
        conf = chanconf(self.fact.channel)
        conn = Microblog('twitter', conf, bearer_token=conf["oauth2"])
        res = yield conn.resolve_userids([n["sender_id"] for n in news] + [n["target_id"] for n in news])
        if "ERROR 429" in res or "ERROR 404" in res or not isinstance(res, list):
            # res carries the error returned by resolve_userids; no exception object is in scope here
            self.log("resolving users from DMs: %s" % res, error=True)
            returnD(False)
        users = dict((u['id_str'], u['screen_name']) for u in res)
        for n in news:
            n["screenname"] = users.get(n["sender_id"], "unknown")
            n["sender"] = n["screenname"].lower()
            n["target_screenname"] = users.get(n["target_id"], "unknown")
            n["target"] = n["target_screenname"].lower()
        yield self.fact.db['dms'].insert(news, safe=True)
        self.fact.ircclient._send_message([(True, "[DM] @%s ➜ @%s: %s — https://twitter.com/%s" % (n['screenname'].encode('utf-8'), n['target_screenname'].encode('utf-8'), n['message'].encode('utf-8'), n['screenname'].encode('utf-8'))) for n in news], self.fact.channel)
    returnD(True)

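# For reference, a trimmed, illustrative example of the Twitter v1.1
# direct_messages/events payload shape this method consumes (field names
# taken from the accesses above; values are made up):
# {"events": [{"id": "110", "created_timestamp": "1498579051000",
#              "message_create": {"sender_id": "123",
#                                 "target": {"recipient_id": "456"},
#                                 "message_data": {"text": "hello"}}}]}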