Example #1
 def save_WEs_query(self, corpus, ids, query_options):
     res = yield self.queries(corpus).insert_one({
       "webentities": ids,
       "total": len(ids),
       "query": query_options
     })
     returnD(str(res.inserted_id))
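These snippets follow Twisted's inlineCallbacks style: the decorator has been stripped by the extraction, and returnD is presumably an alias for twisted.internet.defer.returnValue. A minimal self-contained sketch of the same pattern, assuming txmongo's insert_one and a hypothetical Queries wrapper:

from twisted.internet.defer import inlineCallbacks, returnValue as returnD

class Queries(object):
    # hypothetical wrapper holding a txmongo collection
    def __init__(self, collection):
        self.collection = collection

    @inlineCallbacks
    def save_query(self, ids, query_options):
        # insert one document and hand back its ObjectId as a string
        res = yield self.collection.insert_one({
            "webentities": ids,
            "total": len(ids),
            "query": query_options
        })
        returnD(str(res.inserted_id))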
Example #2
 def save_WEs_query(self, corpus, ids, query_options):
     res = yield self.queries(corpus).insert({
       "webentities": ids,
       "total": len(ids),
       "query": query_options
     }, safe=True)
     returnD(str(res))
Example #3
def SingleMongo(coll, method, *args, **kwargs):
    conn = MongoConnection(MONGODB['HOST'], MONGODB['PORT'])
    db = conn[MONGODB['DATABASE']]
    yield db.authenticate(MONGODB['USER'], MONGODB['PSWD'])
    res = yield getattr(db[coll], method)(*args, **kwargs)
    conn.disconnect()
    returnD(res)
Example #4
def SingleMongo(coll, method, *args, **kwargs):
    conn = MongoConnection(MONGODB['HOST'], MONGODB['PORT'])
    db = conn[MONGODB['DATABASE']]
    yield db.authenticate(MONGODB['USER'], MONGODB['PSWD'])
    res = yield getattr(db[coll], method)(*args, **kwargs)
    conn.disconnect()
    returnD(res)
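SingleMongo opens a fresh MongoDB connection for every call, which keeps one-shot queries self-contained at the cost of a connection per request. A hedged usage sketch, assuming the same inlineCallbacks/returnD conventions and a hypothetical 'tweets' collection:

from twisted.internet.defer import inlineCallbacks, returnValue as returnD

@inlineCallbacks
def count_channel_tweets(channel):
    # one-shot call: SingleMongo connects, authenticates, runs find() and disconnects
    res = yield SingleMongo('tweets', 'find', {'channel': channel}, fields=['_id'])
    returnD(len(res))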
Example #5
 def save_WEs_query(self, corpus, ids, query_options):
     res = yield self.queries(corpus).insert_one({
         "webentities": ids,
         "total": len(ids),
         "query": query_options
     })
     returnD(str(res.inserted_id))
Example #6
 def flush_tweets(self):
     if self.depiler_running or not self.pile:
         returnD(None)
     self.depiler_running = True
     todo = []
     while self.pile and len(todo) < 35:
         todo.append(self.pile.pop())
      if len(self.pile) > 1500:
          self.fact.ircclient._show_error(failure.Failure(Exception(
              "Warning, stream on %s has %d tweets late to display. Dumping the data to the trash now... You should still use %sfuckoff and %sunfollow to clean the guilty query."
              % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))),
              self.fact.channel, admins=True)
          del self.pile[:]
      elif len(self.pile) > 500:
          self.fact.ircclient._show_error(failure.Failure(Exception(
              "Warning, stream on %s has %d tweets late to display. You should use %sfuckoff and %sunfollow the guilty query or at least restart."
              % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))),
              self.fact.channel, admins=True)
      if config.DEBUG:
          self.log("Flush %s tweets%s." % (len(todo),
                   " (%s left to do)" % len(self.pile) if len(self.pile) else ""),
                   hint=True)
     yield self.process_twitter_feed(todo, "stream")
     self.depiler_running = False
     returnD(True)
Example #7
 def get_queue(self, corpus, specs={}, **kwargs):
     if "sort" not in kwargs:
         kwargs["sort"] = sortasc('timestamp')
     res = yield self.queue(corpus).find(specs, **kwargs)
     if res and "limit" in kwargs and kwargs["limit"] == 1:
         res = res[0]
     returnD(res)
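sortasc here (and sortdesc further down) are presumably thin helpers building pymongo-style sort specifiers as lists, which is why Example #8 can chain them with +. A minimal sketch under that assumption:

import pymongo

def sortasc(field):
    # ascending sort specifier, returned as a list so several can be concatenated with +
    return [(field, pymongo.ASCENDING)]

def sortdesc(field):
    # descending counterpart used by the later examples
    return [(field, pymongo.DESCENDING)]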
Example #8
 def list_jobs(self, corpus, specs={}, **kwargs):
     if "sort" not in kwargs:
         kwargs["sort"] = sortasc("crawling_status") + sortasc("indexing_status") + sortasc("created_at")
     jobs = yield self.jobs(corpus).find(specs, **kwargs)
     if jobs and "limit" in kwargs and kwargs["limit"] == 1:
         jobs = jobs[0]
     returnD(jobs)
Example #9
 def save_WEs_query(self, corpus, ids, query_options):
     res = yield self.queries(corpus).insert({
       "webentities": ids,
       "total": len(ids),
       "query": query_options
     }, safe=True)
     returnD(str(res))
Example #10
 def start_stream(self, conf):
     if not self.fact.__init_timeout__():
         returnD(False)
     queries = yield self.fact.db['feeds'].find({'database': 'tweets', 'channel': self.fact.channel}, fields=['query'])
     track = []
     skip = []
     k = 0
     for query in queries:
         q = str(query['query'].encode('utf-8')).lower()
          # queries starting with @ should return only tweets from the corresponding user; the stream does not know how to handle this, so skip them
         if self.re_twitter_account.match(q):
             continue
         elif " OR " in q or " -" in q or '"' in q or len(q) > 60 or len(q) < 6:
             skip.append(q)
             continue
         track.append(q)
         k += 1
         if k > 395:
             break
     if self.fact.twuser not in track:
         track.append(self.fact.twuser)
     if len(skip):
         self.log("Skipping unprocessable queries for streaming: « %s »" % " » | « ".join(skip), hint=True)
     self.log("Start search streaming for: « %s »" % " » | « ".join(track), hint=True)
     conn = Microblog("twitter", conf, bearer_token=self.fact.twitter_token)
      # try to find users corresponding to the queries so they can be followed via the stream
     users, self.fact.ircclient.twitter['users'] = conn.lookup_users(track, self.fact.ircclient.twitter['users'])
     deferToThreadPool(reactor, self.threadpool, self.follow_stream, conf, users.values(), track)
     self.depiler = LoopingCall(self.flush_tweets)
     self.depiler.start(1)
     returnD(True)
Example #11
 def start_stream(self, conf):
     if not self.fact.__init_timeout__():
         returnD(False)
     queries = yield self.fact.db['feeds'].find({'database': 'tweets', 'channel': self.fact.channel}, fields=['query'])
     track = []
     skip = []
     k = 0
     for query in queries:
         q = str(query['query'].encode('utf-8')).lower()
          # queries starting with @ should return only tweets from the corresponding user; the stream does not know how to handle this, so skip them
         if self.re_twitter_account.match(q):
             continue
         elif " OR " in q or " -" in q or '"' in q or len(q) > 60 or len(q) < 6:
             skip.append(q)
             continue
         track.append(q)
         k += 1
         if k > 395:
             break
     if self.fact.twuser not in track:
         track.append(self.fact.twuser)
     if len(skip):
         self.log("Skipping unprocessable queries for streaming: « %s »" % " » | « ".join(skip), hint=True)
     self.log("Start search streaming for: « %s »" % " » | « ".join(track), hint=True)
     conn = Microblog("twitter", conf, bearer_token=self.fact.twitter_token)
      # try to find users corresponding to the queries so they can be followed via the stream
     users, self.fact.ircclient.twitter['users'] = conn.lookup_users(track, self.fact.ircclient.twitter['users'])
     deferToThreadPool(reactor, self.threadpool, self.follow_stream, conf, users.values(), track)
     self.depiler = LoopingCall(self.flush_tweets)
     self.depiler.start(1)
     returnD(True)
Example #12
 def get_queue(self, corpus, specs={}, **kwargs):
     if "sort" not in kwargs:
         kwargs["sort"] = sortasc('timestamp')
     res = yield self.queue(corpus).find(specs, **kwargs)
     if res and "limit" in kwargs and kwargs["limit"] == 1:
         res = res[0]
     returnD(res)
Example #13
    def depile(self):
        if self.queue is None:
            yield self.init_queue()
        if not len(self.queue):
            returnD(None)

        status = yield self.get_scrapyd_status()
        if status["pending"] > 0:
            returnD(None)
        # Add some random wait to allow a possible concurrent Hyphe instance
        # to compete for ScrapyD's empty slots
        yield deferredSleep(1./randint(4,20))

        # Order jobs by corpus with the fewest currently running crawls, then by age
        ordered = sorted(self.queue.items(), key=lambda x: \
          float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
        job_id, job = ordered[0]
        res = yield self.send_scrapy_query('schedule', job["crawl_arguments"])
        ts = now_ts()
        if is_error(res):
            logger.msg("WARNING: error sending job %s to ScrapyD: %s" % (job, res))
            self.queue[job_id]['timestamp'] = ts    # let it retry a bit later
        else:
            yield self.db.update_job(job["corpus"], job_id, res['jobid'], ts)
            yield self.db.add_log(job["corpus"], job_id, "CRAWL_SCHEDULED", ts)
            del(self.queue[job_id])
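deferredSleep, used here and in Examples #29 and #37, is presumably a small non-blocking pause returning a Deferred that fires after the given delay; a minimal sketch:

from twisted.internet import reactor
from twisted.internet.task import deferLater

def deferredSleep(seconds=1):
    # fires after `seconds` without blocking the reactor,
    # so it can simply be yielded inside an inlineCallbacks generator
    return deferLater(reactor, seconds, lambda: None)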
Example #14
 def __run__(self, coll, method, *args, **kwargs):
     attempts_left = self.retries
     result = []
     lasttry = False
     if 'lasttry' in kwargs:
         lasttry = True
         del kwargs['lasttry']
     while True:
         try:
             self.coll = coll
             self.method = method
             if not self.conn and not self.db:
                 status = "Connec"
                 self.conn = yield MongoConnection(MONGODB['HOST'], MONGODB['PORT'], reconnect=False)
                 self.db = self.conn[MONGODB['DATABASE']]
                 status = "Authentica"
                 yield self.db.authenticate(MONGODB['USER'], MONGODB['PSWD'])
             status = "Communica"
             result = yield getattr(self.db[coll], method)(*args, **kwargs)
         except Exception as e:
             if not lasttry:
                 if attempts_left > 0:
                     attempts_left -= 1
                     if DEBUG:
                         self.logerr("%sting" % status, "Retry #%d" % (self.retries-attempts_left))
                     yield self.close(silent=True)
                     continue
                 if DEBUG:
                     self.logerr("%sting" % status, "HARD RETRY %s %s" % (type(e), str(e)))
                 result = yield Mongo(coll, method, *args, lasttry=True, **kwargs)
             yield self.close()
         returnD(result)
Example #15
 def stop_corpus(self, name, quiet=False):
     if self.stopped_corpus(name):
         if config["DEBUG"]:
             self.log(name, "Traph already stopped", quiet=quiet)
         returnD(False)
     if name in self.corpora:
         yield self.corpora[name].stop()
     returnD(True)
Example #16
 def get_WEs(self, corpus, query=None):
     if not query:
         res = yield self.WEs(corpus).find()
     else:
         if isinstance(query, list) and isinstance(query[0], int):
             query = {"_id": {"$in": query}}
         res = yield self.WEs(corpus).find(query)
     returnD(res)
Example #17
 def get_WEs(self, corpus, query=None, **kwargs):
     if not query:
         res = yield self.WEs(corpus).find({}, **kwargs)
     else:
         if isinstance(query, list) and isinstance(query[0], int):
             query = {"_id": {"$in": query}}
         res = yield self.WEs(corpus).find(query, **kwargs)
     returnD(res)
Example #18
 def stop_corpus(self, name, quiet=False):
     if self.stopped_corpus(name):
         if config["DEBUG"]:
             self.log(name, "Traph already stopped", quiet=quiet)
         returnD(False)
     if name in self.corpora:
         yield self.corpora[name].stop()
     returnD(True)
Example #19
 def run_twitter_search(self):
     if not self.__init_timeout__():
         returnD(False)
     queries = yield self.db['feeds'].find({'database': 'tweets', 'channel': self.channel})
     randorder = range(len(queries))
     shuffle(randorder)
     urls = yield getFeeds(self.db, self.channel, 'tweets', randorder=randorder)
     yield self.protocol.start_twitter_search(urls, randorder=randorder)
     self.status = "stopped"
Example #20
 def run_twitter_search(self):
     if not self.__init_timeout__():
         returnD(False)
     queries = yield self.db['feeds'].find({'database': 'tweets', 'channel': self.channel})
     randorder = range(len(queries))
     shuffle(randorder)
     urls = yield getFeeds(self.db, self.channel, 'tweets', randorder=randorder)
     yield self.protocol.start_twitter_search(urls, randorder=randorder)
     self.status = "stopped"
Example #21
 def list_logs(self, corpus, job, **kwargs):
     if "sort" not in kwargs:
         kwargs["sort"] = sortasc('timestamp')
     if "projection" not in kwargs:
         kwargs["projection"] = ['timestamp', 'log']
     if type(job) == list:
         job = {"$in": job}
     res = yield self.logs(corpus).find({"_job": job}, **kwargs)
     returnD(res)
Example #22
 def list_logs(self, corpus, job, **kwargs):
     if "sort" not in kwargs:
         kwargs["sort"] = sortasc('timestamp')
     if "projection" not in kwargs:
         kwargs["projection"] = ['timestamp', 'log']
     if type(job) == list:
         job = {"$in": job}
     res = yield self.logs(corpus).find({"_job": job}, **kwargs)
     returnD(res)
Example #23
 def count_pages(self, corpus, job, **kwargs):
      tot = yield self.pages(corpus).count({
          "_job": job,
          "forgotten": {"$ne": True}
      }, **kwargs)
     returnD(tot)
Example #24
 def list_logs(self, corpus, job, **kwargs):
     if "filter" not in kwargs:
         kwargs["filter"] = sortasc('timestamp')
     if "fields" not in kwargs:
         kwargs["fields"] = ['timestamp', 'log']
     kwargs["safe"] = True
     if type(job) == list:
         job = {"$in": job}
     res = yield self.logs(corpus).find({"_job": job}, **kwargs)
     returnD(res)
Example #25
 def list_logs(self, corpus, job, **kwargs):
     if "filter" not in kwargs:
         kwargs["filter"] = sortasc('timestamp')
     if "fields" not in kwargs:
         kwargs["fields"] = ['timestamp', 'log']
     kwargs["safe"] = True
     if type(job) == list:
         job = {"$in": job}
     res = yield self.logs(corpus).find({"_job": job}, **kwargs)
     returnD(res)
Example #26
 def add_job(self, args, corpus, webentity_id):
     ts = now_ts()
     job_id = yield self.db.add_job(corpus, webentity_id, args, ts)
     self.queue[job_id] = {
       "corpus": corpus,
       "timestamp": ts,
       "crawl_arguments": args
     }
     yield self.db.add_log(corpus, job_id, "CRAWL_ADDED", ts)
     returnD(job_id)
Example #27
 def list_jobs(self, corpus, *args, **kwargs):
     if "filter" not in kwargs:
         kwargs["filter"] = sortasc("crawling_status") + sortasc("indexing_status") + sortasc("created_at")
     jobs = yield self.jobs(corpus).find(*args, **kwargs)
     for j in jobs:
         if "created_at" not in j and "timestamp" in j:
             j["created_at"] = j["timestamp"]
             for k in ['start', 'crawl', 'finish']:
                 j["%sed_at" % k] = None
     if jobs and "limit" in kwargs and kwargs["limit"] == 1:
         jobs = jobs[0]
     returnD(jobs)
Example #28
def find_last_followers(user):
    res = yield SingleMongo(db_foll_coll(user), 'find', {
        "screen_name": {"$exists": True},
        "follows_me": True,
        "last_update": {"$gte": time.time() - 12 * 3600}
    })
    returnD(res)
Example #29
 def run_rss_feeds(self):
     if not self.__init_timeout__():
         returnD(False)
     urls = self.feeds
     if not urls:
         urls = yield getFeeds(self.db, self.channel, self.name, add_url=self.tweets_search_page)
     ct = 0
     for url in urls:
         yield deferredSleep(3 + int(random()*500)/100)
         self.update_timeout(extra=10)
         yield self.protocol.start(url)
     self.status = "stopped"
Example #30
 def list_jobs(self, corpus, *args, **kwargs):
     kwargs["safe"] = True
     if "filter" not in kwargs:
         kwargs["filter"] = sortasc("crawling_status") + sortasc("indexing_status") + sortasc("created_at")
     jobs = yield self.jobs(corpus).find(*args, **kwargs)
     for j in jobs:
         if "created_at" not in j and "timestamp" in j:
             j["created_at"] = j["timestamp"]
             for k in ['start', 'crawl', 'finish']:
                 j["%sed_at" % k] = None
     if jobs and "limit" in kwargs and kwargs["limit"] == 1:
         jobs = jobs[0]
     returnD(jobs)
Example #31
 def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
     if not listtweets:
         returnD(False)
     if query:
         if not isinstance(listtweets, dict):
             returnD(False)
         nexturl = ""
         if 'max_id_str' in listtweets['search_metadata']:
             nexturl = listtweets['search_metadata']['max_id_str']
         elif 'next_results' in listtweets['search_metadata']:
             nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
          res = {'nexturl': nexturl}
         listtweets = listtweets['statuses']
     elif not isinstance(listtweets, list):
         returnD(False)
     feed = []
     for tweet in listtweets:
         if not isinstance(tweet, dict):
             continue
         tw = {'created_at': tweet['created_at'], 'title': unescape_html(tweet['text']), 'link': tweet['url']}
         tw = grab_extra_meta(tweet, tw)
         feed.append(tw)
     if query:
         res['tweets'] = feed
         processed = yield self.process_tweets(res, 'search', query=query, pagecount=pagecount)
     else:
         processed = yield self.process_tweets(feed, 'my%s' % feedtype)
     returnD(processed)
Example #32
File: tlds.py Project: Dim25/hyphe
def collect_tlds():
    tree = {}
    double_list = {"rules": [], "exceptions": []}
    tldlist = yield getPage(MOZ_TLD_LIST)
    for line in tldlist.split("\n"):
        line = line.strip()
        if not line or line.startswith("//"):
            continue
        chunks = line.decode('utf-8').split('.')
        add_tld_chunks_to_tree(chunks, tree)
        if line[0] == '!':
            double_list["exceptions"].append(line[1:])
        else:
            double_list["rules"].append(line.strip())
    returnD((double_list, tree))
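Both collect_tlds variants rely on an add_tld_chunks_to_tree helper that is not shown in this listing. A hedged sketch of what it presumably does, nesting the dot-separated suffix chunks from right to left (so "co.uk" ends up as tree["uk"]["co"]):

def add_tld_chunks_to_tree(chunks, tree):
    # consume the rightmost chunk and recurse into the nested dict
    chunk = chunks.pop()
    if chunk not in tree:
        tree[chunk] = {}
    if chunks:
        add_tld_chunks_to_tree(chunks, tree[chunk])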
Example #33
 def search_twitter(self, data, query, max_id=None, page=0, randorder=None):
     if page and randorder:
         try:
             query = yield getFeeds(self.fact.db, self.fact.channel, "tweets", randorder=randorder)
             query = query[page]
         except Exception as e:
             returnD(False)
     if config.DEBUG:
         text = unquote(query)
         if max_id:
             text = "%s before id %s" % (text, max_id.encode('utf-8'))
         self.log("Query Twitter search for %s" % text)
     conn = Microblog('twitter', chanconf(self.fact.channel), bearer_token=self.fact.twitter_token)
     res = conn.search(query, max_id=max_id)
     returnD(res)
Example #34
 def search_twitter(self, data, query, max_id=None, page=0, randorder=None):
     if page and randorder:
         try:
             query = yield getFeeds(self.fact.db, self.fact.channel, "tweets", randorder=randorder)
             query = query[page]
         except Exception as e:
             returnD(False)
     if config.DEBUG:
         text = unquote(query)
         if max_id:
             text = "%s before id %s" % (text, max_id.encode('utf-8'))
         self.log("Query Twitter search for %s" % text)
     conn = Microblog('twitter', chanconf(self.fact.channel), bearer_token=self.fact.twitter_token)
     res = conn.search(query, max_id=max_id)
     returnD(res)
Example #35
def collect_tlds():
    tree = {}
    try:
        tldlist = yield getPage(MOZ_TLD_LIST)
    except:  #Fallback local copy
        from os.path import join, realpath, dirname
        with open(join(dirname(realpath(__file__)), "tld_list.txt")) as f:
            tldlist = f.read()
    for line in tldlist.split("\n"):
        line = line.strip()
        if not line or line.startswith("//"):
            continue
        chunks = line.decode('utf-8').split('.')
        add_tld_chunks_to_tree(chunks, tree)
    returnD(tree)
Example #36
 def stop(self, now=False):
     if self.monitor.running:
         self.monitor.stop()
     if self.stopping():
         returnD(None)
     self.status = "error" if self.error else "stopping"
     while not now and self.call_running:
         yield deferredSleep(0.1)
     if self.transport:
         self.protocol.stop()
         self.transport = None
     self.log("Traph stopped")
     if not self.error:
         self.status = "stopped"
     self.checkAndRemovePID()
Example #37
 def run_web_feeds(self):
     if not self.__init_timeout__():
         returnD(False)
     urls = self.feeds
     if not urls:
         urls = yield getFeeds(self.db, self.channel, self.name, add_url=self.tweets_search_page)
     ct = 0
     for url in urls:
         name = None
         if self.name == "pages":
             url, name = url
         yield deferredSleep(3 + int(random()*500)/100)
         self.update_timeout(extra=10)
         yield self.protocol.start_web(url, name=name)
     self.status = "stopped"
Example #38
def collect_tlds():
    tree = {}
    double_list = {"rules": [], "exceptions": []}
    tldlist = yield getPage(MOZ_TLD_LIST)
    for line in tldlist.split("\n"):
        line = line.strip()
        if not line or line.startswith("//"):
            continue
        chunks = line.decode('utf-8').split('.')
        add_tld_chunks_to_tree(chunks, tree)
        if line[0] == '!':
            double_list["exceptions"].append(line[1:])
        else:
            double_list["rules"].append(line.strip())
    returnD((double_list, tree))
Example #39
def collect_tlds():
    tree = {}
    try:
        tldlist = yield getPage(MOZ_TLD_LIST)
    except: #Fallback local copy
        from os.path import join, realpath, dirname
        with open(join(dirname(realpath(__file__)), "tld_list.txt")) as f:
            tldlist = f.read()
    for line in tldlist.split("\n"):
        line = line.strip()
        if not line or line.startswith("//"):
            continue
        chunks = line.decode('utf-8').split('.')
        add_tld_chunks_to_tree(chunks, tree)
    returnD(tree)
Example #40
 def stop(self, now=False):
     if self.monitor.running:
         self.monitor.stop()
     if self.stopping():
         returnD(None)
     self.status = "error" if self.error else "stopping"
     while not now and self.call_running:
         yield deferredSleep(0.1)
     if self.transport:
         self.protocol.stop()
         self.transport = None
     self.log("Traph stopped")
     if not self.error:
         self.status = "stopped"
     else:
         self.checkAndRemovePID()
Example #41
 def flush_tweets(self):
     if self.depiler_running or not self.pile:
         returnD(None)
     self.depiler_running = True
     todo = []
     while self.pile and len(todo) < 35:
         todo.append(self.pile.pop())
     if len(self.pile) > 1000:
         self.fact.ircclient._show_error(failure.Failure(Exception("Warning, stream on %s has %d tweets late to display. Dumping the data to the trash now... You should still use %sfuckoff and %sunfollow to clean the guilty query." % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))), self.fact.channel, admins=True)
         del self.pile[:]
     elif len(self.pile) > 300:
         self.fact.ircclient._show_error(failure.Failure(Exception("Warning, stream on %s has %d tweets late to display. You should use %sfuckoff and %sunfollow the guilty query or at least restart." % (self.fact.channel, len(self.pile), COMMAND_CHAR_DEF, COMMAND_CHAR_DEF))), self.fact.channel, admins=True)
     if config.DEBUG:
         self.log("Flush %s tweets%s." % (len(todo), " (%s left to do)" % len(self.pile) if len(self.pile) else ""), hint=True)
     yield self.process_twitter_feed(todo, "stream")
     self.depiler_running = False
     returnD(True)
Example #42
 def start_twitter(self, name, conf, user):
     if not self.fact.__init_timeout__():
         returnD(False)
     d = succeed(Microblog('twitter', conf, bearer_token=self.fact.twitter_token))
     if config.DEBUG:
         self.log("Query @%s's %s" % (user, name))
     def passs(*args, **kwargs):
         raise Exception("No process existing for %s" % name)
     source = getattr(Microblog, 'get_%s' % name, passs)
     processor = getattr(self, 'process_%s' % name, passs)
     d.addCallback(source, retweets_processed=self.fact.retweets_processed, bearer_token=self.fact.twitter_token)
     d.addErrback(self._handle_error, "downloading %s for" % name, user)
     d.addCallback(check_twitter_results)
     d.addErrback(self._handle_error, "examining %s for" % name, user)
     d.addCallback(processor, user.lower())
     d.addErrback(self._handle_error, "working on %s for" % name, user)
     d.addCallback(self.end_twitter)
     return d
Example #43
 def start_twitter(self, name, conf, user):
     if not self.fact.__init_timeout__():
         returnD(False)
     d = succeed(Microblog('twitter', conf, bearer_token=self.fact.twitter_token))
     if config.DEBUG:
         self.log("Query @%s's %s" % (user, name))
     def passs(*args, **kwargs):
         raise Exception("No process existing for %s" % name)
     source = getattr(Microblog, 'get_%s' % name, passs)
     processor = getattr(self, 'process_%s' % name, passs)
     d.addCallback(source, retweets_processed=self.fact.retweets_processed, bearer_token=self.fact.twitter_token)
     d.addErrback(self._handle_error, "downloading %s for" % name, user)
     d.addCallback(check_twitter_results)
     d.addErrback(self._handle_error, "examining %s for" % name, user)
     d.addCallback(processor, user.lower())
     d.addErrback(self._handle_error, "working on %s for" % name, user)
     d.addCallback(self.end_twitter)
     return d
Example #44
 def process_elements(self, feed, url):
     if not feed or not feed.entries:
         returnD(False)
     sourcename = url
     if feed.feed and 'title' in feed.feed:
         sourcename = feed.feed['title']
         sourcename = unescape_html(sourcename)
     ids = []
     news = []
     links = []
     for i in feed.entries:
         date = i.get('published_parsed', i.get('updated_parsed', ''))
         if date:
             date = datetime.fromtimestamp(time.mktime(date))
             if datetime.today() - date > timedelta(hours=config.BACK_HOURS+6):
                 break
         link, self.fact.cache_urls = yield clean_redir_urls(i.get('link', ''), self.fact.cache_urls)
         if not link.startswith('http'):
             link = "%s/%s" % (url[:url.find('/',8)], link.lstrip('/'))
         if link in links:
             continue
         links.append(link)
         title = i.get('title', '').replace('\n', ' ')
         try:
             title = unescape_html(title)
         except:
             pass
         _id = md5(("%s:%s:%s" % (self.fact.channel, link, title.lower())).encode('utf-8')).hexdigest()
         ids.append(_id)
         news.append({'_id': _id, 'channel': self.fact.channel, 'message': title, 'link': link, 'date': date, 'timestamp': datetime.today(), 'source': url, 'sourcename': sourcename})
     existings = yield self.fact.db['news'].find({'channel': self.fact.channel, '_id': {'$in': ids}}, fields=['_id'], filter=sortdesc('_id'))
     existing = [n['_id'] for n in existings]
     new = [n for n in news if n['_id'] not in existing]
     if new:
         new.reverse()
         new = new[:5]
         try:
             yield self.fact.db['news'].insert(new, safe=True)
         except Exception as e:
             self._handle_error(e, "recording news batch", url)
         self.fact.ircclient._send_message([(True, "[%s] %s" % (n['sourcename'].encode('utf-8'), self.format_tweet(n))) for n in new], self.fact.channel)
     returnD(True)
Example #45
 def get_scrapyd_status(self):
     url = "%sjobs" % self.scrapyd
     jobs = yield getPage(url)
     status = {"pending": 0}
     read = None
     for line in jobs.split("><tr"):
         if ">Pending<" in line:
             read = "pending"
         elif ">Running<" in line:
             read = "running"
         elif ">Finished<" in line:
             read = None
         elif read == "running":
             corpus = line[line.find(".") + 1 : line.find("<", 2)]
             if corpus not in status:
                 status[corpus] = 0
             status[corpus] += 1
         elif read:
             status[read] += 1
     returnD(status)
Example #46
 def add_job(self, corpus, webentity_id, args, timestamp=None):
     if not timestamp:
         timestamp = now_ts()
     _id = str(uuid())
     yield self.jobs(corpus).insert({
       "_id": _id,
       "crawljob_id": None,
       "webentity_id": webentity_id,
       "nb_crawled_pages": 0,
       "nb_pages": 0,
       "nb_links": 0,
       "crawl_arguments": args,
       "crawling_status": crawling_statuses.PENDING,
       "indexing_status": indexing_statuses.PENDING,
       "created_at": timestamp,
       "scheduled_at": None,
       "started_at": None,
       "crawled_at": None,
       "finished_at": None
     }, safe=True)
     returnD(_id)
Example #47
 def get_scrapyd_status(self):
     url = "%sjobs" % self.scrapyd
     try:
         jobs = yield getPage(url)
     except TimeoutError:
         logger.msg(
             "WARNING: ScrapyD's monitoring website seems like not answering"
         )
         returnD(None)
     except Exception as e:
         logger.msg(
             "WARNING: ScrapyD's monitoring website seems down: %s %s" %
             (type(e), e))
         returnD(None)
     status = {"pending": 0, "running": 0}
     read = None
     for line in jobs.split("><tr"):
         if ">Pending<" in line:
             read = "pending"
         elif ">Running<" in line:
             read = "running"
         elif ">Finished<" in line:
             read = None
         elif read == "running":
             pattern = ">" + self.db_name + "_"
             if pattern not in line:
                 continue
             corpus = line.split(pattern)[1].split("</td>")[0]
             if corpus not in status:
                 status[corpus] = 0
             status[corpus] += 1
             status[read] += 1
         elif read:
             status[read] += 1
     returnD(status)
Example #48
 def process_dms(self, listdms, user):
     if not listdms:
         returnD(False)
     ids = []
     dms = []
     if not isinstance(listdms, list):
         self.log("downloading DMs: %s" % listdms, error=True)
         returnD(False)
     for i in listdms:
         try:
             date = datetime.fromtimestamp(time.mktime(time.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y'))+2*60*60)
             if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
                 break
         except:
             self.log("processing DM %s: %s" % (i, listdms), error=True)
             continue
         tid = long(i.get('id', ''))
         if tid:
             ids.append(tid)
             sender = i.get('sender_screen_name', '')
             dm, self.fact.cache_urls = yield clean_redir_urls(i.get('text', '').replace('\n', ' '), self.fact.cache_urls)
             dms.append({'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel, 'id': tid, 'user': user, 'sender': sender.lower(), 'screenname': sender, 'message': dm, 'date': date, 'timestamp': datetime.today()})
     existings = yield self.fact.db['dms'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], filter=sortdesc('id'))
     existing = [t['_id'] for t in existings]
     news = [t for t in dms if t['_id'] not in existing]
     if news:
         news.reverse()
         yield self.fact.db['dms'].insert(news, safe=True)
         self.fact.ircclient._send_message([(True, "[DM] @%s: %s — https://twitter.com/%s" % (n['screenname'].encode('utf-8'), n['message'].encode('utf-8'), n['screenname'].encode('utf-8'))) for n in news], self.fact.channel)
     returnD(True)
Example #49
 def process_elements(self, data, url, name=None):
     if not data:
         returnD(False)
     if self.fact.name == "pages":
         differ = WebMonitor(name, url, self.fact.channel)
         info = yield differ.check_new(data)
         if info:
             self.fact.ircclient._send_message(info, self.fact.channel)
         returnD(True)
     if not data.entries:
         returnD(False)
     sourcename = url
     if data.feed and 'title' in data.feed:
         sourcename = data.feed['title']
         sourcename = unescape_html(sourcename)
     ids = []
     news = []
     links = []
     for i in data.entries:
         date = i.get('published_parsed', i.get('updated_parsed', ''))
         if date:
             date = datetime.fromtimestamp(time.mktime(date))
             if datetime.today() - date > timedelta(hours=config.BACK_HOURS+6):
                 break
         link, self.fact.cache_urls = yield clean_redir_urls(i.get('link', ''), self.fact.cache_urls)
         if not link.startswith('http'):
             link = "%s/%s" % (url[:url.find('/',8)], link.lstrip('/'))
         if link in links:
             continue
         links.append(link)
         title = i.get('title', '').replace('\n', ' ')
         try:
             title = unescape_html(title)
         except:
             pass
         _id = md5(("%s:%s:%s" % (self.fact.channel, link, title.lower())).encode('utf-8')).hexdigest()
         ids.append(_id)
         news.append({'_id': _id, 'channel': self.fact.channel, 'message': title, 'link': link, 'date': date, 'timestamp': datetime.today(), 'source': url, 'sourcename': sourcename})
     existings = yield self.fact.db['news'].find({'channel': self.fact.channel, '_id': {'$in': ids}}, fields=['_id'], filter=sortdesc('_id'))
     existing = [n['_id'] for n in existings]
     new = [n for n in news if n['_id'] not in existing]
     if new:
         new.reverse()
         new = new[:5]
         try:
             yield self.fact.db['news'].insert(new, safe=True)
         except Exception as e:
             self._handle_error(e, "recording news batch", url)
         self.fact.ircclient._send_message([(True, "[%s] %s" % (n['sourcename'].encode('utf-8'), format_tweet(n))) for n in new], self.fact.channel)
     returnD(True)
Example #50
 def process_stats(self, res, user):
     if not res:
         returnD(False)
     stats, last, timestamp = res
     if not stats:
         returnD(False)
     if not last:
         last = {'tweets': 0, 'followers': 0}
         since = timestamp - timedelta(hours=1)
     else:
         since = last['timestamp']
     if 'lists' not in last:
         last['lists'] = 0
     re_match_rts = re.compile(u'(([MLR]T|%s|♺)\s*)+@?%s' % (QUOTE_CHARS, user), re.I)
     rts = yield Mongo('tweets', 'find', {'channel': self.fact.channel, 'message': re_match_rts, 'timestamp': {'$gte': since}}, fields=['_id'])
     nb_rts = len(rts)
     if config.TWITTER_API_VERSION == 1:
         stat = {'user': user, 'timestamp': timestamp, 'tweets': stats.get('updates', last['tweets']), 'followers': stats.get('followers', last['followers']), 'rts_last_hour': nb_rts}
     else:
         stat = {'user': user, 'timestamp': timestamp, 'tweets': stats.get('statuses_count', last['tweets']), 'followers': stats.get('followers_count', last['followers']), 'rts_last_hour': nb_rts, 'lists': stats.get('listed_count', last['lists'])}
     yield Mongo('stats', 'insert', stat)
     weekday = timestamp.weekday()
     laststats = Stats(user)
     if chan_displays_stats(self.fact.channel) and ((timestamp.hour == 13 and weekday < 5) or timestamp.hour == 18):
         stats = yield laststats.print_last()
         self.fact.ircclient._send_message(stats, self.fact.channel)
     last_tweet = yield Mongo('tweets', 'find', {'channel': self.fact.channel, 'user': user}, fields=['date'], limit=1, filter=sortdesc('timestamp'))
     if chan_displays_stats(self.fact.channel) and last_tweet and timestamp - last_tweet[0]['date'] > timedelta(days=3) and (timestamp.hour == 11 or timestamp.hour == 17) and weekday < 5:
         reactor.callFromThread(reactor.callLater, 3, self.fact.ircclient._send_message, "[FYI] No tweet was sent since %s days." % (timestamp - last_tweet[0]['date']).days, self.fact.channel)
     reactor.callFromThread(reactor.callLater, 1, laststats.dump_data)
     returnD(True)
Example #51
 def send_scrapy_query(self, action, arguments=None):
     url = "%s%s.json" % (self.scrapyd, action)
     method = "POST"
     headers = None
     if action.startswith('list'):
         method = "GET"
         if arguments:
             args = [
                 str(k) + '=' + str(v) for (k, v) in arguments.iteritems()
             ]
             url += '?' + '&'.join(args)
             arguments = None
     elif arguments:
         arguments = urlencode(arguments)
         headers = {'Content-Type': 'application/x-www-form-urlencoded'}
     try:
         res = yield getPage(url, method=method, postdata=arguments, \
           headers=headers, timeout=30)
         result = loadjson(res)
         returnD(result)
     except ConnectionRefusedError:
         returnD(format_error("Could not contact scrapyd server, " + \
           "maybe it's not started..."))
     except Exception as e:
         returnD(format_error(e))
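send_scrapy_query simply maps an action name onto ScrapyD's JSON API (schedule.json via POST, list*.json via GET). A hedged usage sketch, assuming a crawler object exposing the method above:

from twisted.internet.defer import inlineCallbacks, returnValue as returnD

@inlineCallbacks
def list_scrapyd_projects(crawler):
    # GET <scrapyd>/listprojects.json through the helper above
    res = yield crawler.send_scrapy_query('listprojects')
    returnD(res)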
Example #52
 def get_scrapyd_status(self):
     url = "%sjobs" % self.scrapyd
     try:
         jobs = yield getPage(url)
     except TimeoutError:
         logger.msg("WARNING: ScrapyD's monitoring website seems like not answering")
         returnD(None)
     except Exception as e:
         logger.msg("WARNING: ScrapyD's monitoring website seems down: %s %s" % (type(e), e))
         returnD(None)
     status = {"pending": 0}
     read = None
     for line in jobs.split("><tr"):
         if ">Pending<" in line:
             read = "pending"
         elif ">Running<" in line:
             read = "running"
         elif ">Finished<" in line:
             read = None
         elif read == "running":
             corpus = line[line.find(".") + 1 : line.find("<", 2)]
             if corpus not in status:
                 status[corpus] = 0
             status[corpus] += 1
         elif read:
             status[read] += 1
     returnD(status)
Example #53
 def get_scrapyd_status(self):
     url = "%sjobs" % self.scrapyd
     try:
         jobs = yield getPage(url)
     except TimeoutError:
         logger.msg(
             "WARNING: ScrapyD's monitoring website seems like not answering"
         )
         returnD(None)
     except Exception as e:
         logger.msg(
             "WARNING: ScrapyD's monitoring website seems down: %s %s" %
             (type(e), e))
         returnD(None)
     status = {"pending": 0}
     read = None
     for line in jobs.split("><tr"):
         if ">Pending<" in line:
             read = "pending"
         elif ">Running<" in line:
             read = "running"
         elif ">Finished<" in line:
             read = None
         elif read == "running":
             corpus = line[line.find(".") + 1:line.find("<", 2)]
             if corpus not in status:
                 status[corpus] = 0
             status[corpus] += 1
         elif read:
             status[read] += 1
     returnD(status)
Example #54
 def add_job(self, corpus, webentity_id, args, timestamp=None):
     if not timestamp:
         timestamp = now_ts()
     _id = str(uuid())
     yield self.jobs(corpus).insert_one({
       "_id": _id,
       "crawljob_id": None,
       "webentity_id": webentity_id,
       "nb_crawled_pages": 0,
       "nb_unindexed_pages": 0,
       "nb_pages": 0,
       "nb_links": 0,
       "crawl_arguments": args,
       "crawling_status": crawling_statuses.PENDING,
       "indexing_status": indexing_statuses.PENDING,
       "created_at": timestamp,
       "scheduled_at": None,
       "started_at": None,
       "crawled_at": None,
       "finished_at": None
     })
     returnD(_id)
Example #55
 def process_dms(self, listdms, user):
     if not listdms:
         returnD(False)
     ids = []
     dms = []
     try:
         listdms = listdms["events"]
         assert(isinstance(listdms, list))
     except:
         self.log("downloading DMs: %s" % listdms, error=True)
         returnD(False)
     for i in listdms:
         try:
             date = parse_timestamp(i.get('created_timestamp', ''))
             if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
                 break
         except Exception as e:
             self.log("processing DM %s: %s %s" % (i.get('created_timestamp'), type(e), e), error=True)
             continue
         tid = long(i.get('id', ''))
         msg = i.get('message_create', {})
         if tid and msg:
             ids.append(tid)
             sender = msg.get('sender_id', '')
             target = msg.get('target', {}).get('recipient_id', '')
             dm, self.fact.cache_urls = yield clean_redir_urls(msg.get('message_data', {}).get('text', '').replace('\n', ' '), self.fact.cache_urls)
             dms.append({'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel, 'id': tid, 'user': user, 'sender_id': sender, 'target_id': target, 'message': dm, 'date': date, 'timestamp': datetime.today()})
     existings = yield self.fact.db['dms'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], filter=sortdesc('id'))
     existing = [t['_id'] for t in existings]
     news = [t for t in dms if t['_id'] not in existing]
     if news:
         news.reverse()
         conf = chanconf(self.fact.channel)
         conn = Microblog('twitter', conf, bearer_token=conf["oauth2"])
         res = yield conn.resolve_userids([n["sender_id"] for n in news] + [n["target_id"] for n in news])
         if "ERROR 429" in res or "ERROR 404" in res or not isinstance(res, list):
             self.log("resolving users from DMs %s: %s %s" % (res, type(e), e), error=True)
             returnD(False)
         users = dict((u['id_str'], u['screen_name']) for u in res)
         for n in news:
             n["screenname"] = users.get(n["sender_id"], "unknown")
             n["sender"] = n["screenname"].lower()
             n["target_screenname"] = users.get(n["target_id"], "unknown")
             n["target"] = n["target_screenname"].lower()
         yield self.fact.db['dms'].insert(news, safe=True)
         self.fact.ircclient._send_message([(True, "[DM] @%s ➜ @%s: %s — https://twitter.com/%s" % (n['screenname'].encode('utf-8'), n['target_screenname'].encode('utf-8'), n['message'].encode('utf-8'), n['screenname'].encode('utf-8'))) for n in news], self.fact.channel)
     returnD(True)