def test_change_source():
    ret = execute(Job4)
    assert ret["state"] == "complete"
    base = Job4()
    coll = base.config.tests.test_collection
    for doc in coll.find():
        assert doc["source"] == doc["_src"]
def test_multi_source():
    ret = execute(Job3)
    assert ret["state"] == "complete"
    base = Job3()
    coll = base.config.tests.test_collection
    for doc in coll.find():
        assert doc["source"] == doc["_src"]
        assert doc["_job_id"] == ret["_id"]
    assert sorted(ret["sources"]) == ["test1.txt", "test2.txt"]
def test_job_collection():
    ret = execute(Job2)
    assert ret["state"] == "complete"
        res = r.func1(collection, db, url)
        # self.logger.info('Dataframe size is %s', res.shape)
        return res

    def func2(self, r):
        """
        In this function R receives the data as an R dataframe object
        using the function pandas2ri.py2ri.

        :param r: R session with the required libraries loaded
        :return: result of the R function ``func2``
        """
        cur = self.data.find()
        df = pd.DataFrame(list(cur))
        df = df.replace(np.nan, 0)
        df['Monat'] = df['Monat'].dt.strftime('%Y-%m-%d')
        df.columns = [self.remove_accents(c.replace("%", " percent"))
                      for c in df.columns]
        for col in df.columns:
            if col in ['Grundgesamtheit', 'Titel', 'Analyse']:
                df[col] = df[col].apply(self.remove_accents)
        # df = py_to_r(df)
        r.source('script1.R')
        res = r.func2()
        return res

    def remove_accents(self, string):
        return unidecode.unidecode(string)


if __name__ == '__main__':
    from core4.queue.helper.functool import execute
    execute(RJob)
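# A minimal sketch (separate from the job above) of the pandas-to-R handoff
# that func2's docstring refers to, assuming rpy2 2.x where
# pandas2ri.py2ri converts a pandas DataFrame into an R data.frame
# (rpy2 3.x renamed this conversion to py2rpy). "func2" stands for
# whatever function script1.R defines; the frame content is made up.
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

pandas2ri.activate()              # register automatic DataFrame conversion
df = pd.DataFrame({"Monat": ["2019-01-01"], "Titel": ["Beispiel"]})
rdf = pandas2ri.py2ri(df)         # explicit conversion to an R data.frame
robjects.r.source("script1.R")    # load the R-side functions
res = robjects.r["func2"](rdf)    # call the R function with the data.frame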
def test_nojob_collection():
    ret = execute(Job1)
    assert ret["last_error"]["exception"].startswith(
        """AttributeError('_id and _src must not be None"""
    )
    assert ret["state"] == "error"
        df.Date = pd.to_datetime(df.Date, format="%Y-%m-%d")
        df = df[(df.Date >= start) & (df.Date <= end)]
        df.Date = df.Date.apply(lambda x: x.strftime("%Y-%m-%d"))
        # "--" marks missing values; turn them into NaN before zero-filling
        df['Kontakte Mio'] = df['Kontakte Mio'].replace("--", np.nan)
        df = df.replace(np.nan, 0)
        # first graph: total contacts per date
        g = df.groupby(["Date"])['Kontakte Mio'].agg('sum')
        results['firstGraph'] = g.to_dict()
        self.logger.info("Data created for the first graph")
        # second graph: total contacts per media type
        df_new = df[df.Medientyp != 0]
        g1 = df_new.groupby(["Medientyp"])['Kontakte Mio'].agg('sum')
        results['secondGraph'] = g1.to_dict()
        self.logger.info("Data created for the second graph")
        # third graph: monthly contacts for each media group
        df_new = df[df.Medientyp != 0]
        g2 = df_new.groupby(["Date", "Medientyp"])['Kontakte Mio'].agg('sum')
        results['thirdGraph'] = g2.reset_index().to_dict('records')
        self.logger.info("Data created for the third graph")
        self.set_source(str(self._id))
        self.temp.insert_one(results)
        self.logger.info("inserted results")


if __name__ == '__main__':
    from core4.queue.helper.functool import execute
    execute(ExtractFacts, test=False)
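# For orientation (values illustrative): the single-key aggregations above
# yield plain {key: sum} mappings, while the two-key aggregation is
# flattened by reset_index() into one record per (Date, Medientyp) pair:
#
#   firstGraph:  {"2019-01-01": 12.3, "2019-02-01": 4.5, ...}
#   thirdGraph:  [{"Date": "2019-01-01", "Medientyp": "TV",
#                  "Kontakte Mio": 7.8}, ...]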
        self.data = self.config.driverlicense.collection.data
        self.temp = self.config.driverlicense.collection.temp
        r_session = self.get_rsession()
        self.func1(r_session)

    def func1(self, r):
        """
        In this function the connection to MongoDB happens in R with the
        help of the mongolite package.

        :param r: R session with the required libraries loaded
        :return:
        """
        url = self.config.driverlicense.mongo_url
        db = self.config.driverlicense.collection.data.database
        collection = self.config.driverlicense.collection.data.name
        r.source('script1.R')
        # r.debug("func1")
        res = r.func1(collection, db, url)
        results = {}
        names = ['firstGraph', 'secondGraph', 'thirdGraph']
        for i, df in enumerate(res):
            results[names[i]] = df.to_dict('records')
        self.set_source(str(self._id))
        self.temp.insert_one(results)


if __name__ == '__main__':
    from core4.queue.helper.functool import execute
    execute(RJob3)
"created": { "$gte": start, "$lt": end }, "message": re.compile("successful login"), "user": { "$ne": "admin" } }, sort=[("_id", -1)], projection=["created", "user"]) data = list(cur) self.logger.debug("extracted [%d] records in [%s] - [%s]", len(data), start, end) if data: self.set_source(str(start.date())) self.target_collection.update_one(filter={"_id": start}, update={ "$set": { "data": [(d["user"], d["created"]) for d in data] } }, upsert=True) if __name__ == '__main__': from core4.queue.helper.functool import execute execute(AggregateCore4Usage, reset=True)
        links = re.findall("href=[\"\'](.+?)[\"\']", body)
        xls_all = [
            href for href in links
            if href.endswith(".xls") or href.endswith(".xlsx")
        ]
        xls = [
            filename for filename in xls_all
            if "Angebote_Ranking" in filename
        ]
        self.logger.info("found [%d] xlsx files", len(xls))
        download = 0
        for link in xls:
            # check if the file already exists in the database
            doc = self.gfs.find_one({"filename": link})
            if doc is None:
                # if not, save the file to MongoDB
                self.logger.info("download [%s]", link)
                rv = requests.get(link)
                if not test:
                    self.gfs.put(rv.content, filename=link)
                download += 1
            self.progress(download / len(xls))
        self.logger.info("successfully retrieved [%d] of [%d] files",
                         download, len(xls))
        enqueue(ProcessFiles, concurrent=True)


if __name__ == '__main__':
    from core4.queue.helper.functool import execute
    execute(ScrapeFacts, test=False)
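# self.gfs above is a GridFS handle; a minimal sketch of how such a handle
# is typically created with pymongo (the actual setup in this job is not
# shown, and the connection URL, database and file names are placeholders):
from pymongo import MongoClient
from gridfs import GridFS

mongo = MongoClient("mongodb://localhost:27017")
gfs = GridFS(mongo["facts"])
if gfs.find_one({"filename": "Angebote_Ranking_2019.xlsx"}) is None:
    gfs.put(b"...binary file content...",
            filename="Angebote_Ranking_2019.xlsx")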
import random
import time

from core4.queue.helper.functool import enqueue, execute
from core4.queue.job import CoreJob


class SwarmJob(CoreJob):
    """
    Spawns a random swarm of child jobs: each run sleeps a random interval
    and then enqueues children with probability ``prob_launch`` and a
    ``count`` reduced by 2, so the swarm is guaranteed to die out.
    """
    author = "mra"

    def execute(self, count=20, prob_launch=0.5, max_launch=3,
                min_sleep=3, max_sleep=10, **kwargs):
        sleep = random.randint(min_sleep, max_sleep)
        self.logger.info("sleeping [%d] seconds", sleep)
        time.sleep(sleep)
        if count > 0:
            n = random.randint(0, count)
            for i in range(n):
                if random.random() <= prob_launch:
                    enqueue(self.__class__,
                            count=count - 2,
                            prob_launch=prob_launch,
                            max_launch=max_launch,
                            id="%s.%d" % (self._id, i))


if __name__ == '__main__':
    execute(SwarmJob, count=3, min_sleep=0, max_sleep=3)
"" if pd.isnull(c) else c.replace("\n", " ").replace(".", "") for c in cols ] if "" in d.columns: d.drop([""], axis=1, inplace=True) d["Analyse"] = analyse d["Grundgesamtheit"] = grundgesamtheit d["Zeitraum"] = zeitraum d["Vorfilter"] = vorfilter d["Zielgruppe"] = zielgruppe doc = d.to_dict("rec") n = 0 d = self.target.delete_many({"_src": basename}) if d.deleted_count > 0: self.logger.info("reset [%d] records for [%s]", d.deleted_count, basename) for rec in doc: nrec = {} for k, v in rec.items(): if not pd.isnull(v): nrec[k] = v self.target.insert_one(nrec, _src=basename) n += 1 self.logger.info("inserted [%d] records for [%s]", n, basename) if __name__ == '__main__': from core4.queue.helper.functool import execute #execute(ScrapeFacts, test=False) execute(ExtractFacts, test=True)
"").split()) d["Monat"] = [ datetime.datetime.strptime("01." + MONAT[m[0]] + "." + m[1], "%d.%m.%Y") for m in monat] d["val"] = d["Kontakte Mio"].apply(pd.to_numeric, errors='coerce') d['Date'] = d.Monat.apply(lambda x: x.date().isoformat()) doc = d.to_dict("rec") n = 0 # delete any previous version of the file in the database d = self.target.delete_many({"_src": basename}) if d.deleted_count > 0: self.logger.info("reset [%d] records for [%s]", d.deleted_count, basename) # insert the processed file in the database # each row of the dataframe is inserted as a record for rec in doc: nrec = {} for k, v in rec.items(): if not pd.isnull(v): nrec[k] = v self.target.insert_one(nrec) n += 1 self.logger.info("inserted [%d] records for [%s]", n, basename) if __name__ == '__main__': from core4.queue.helper.functool import execute execute(ProcessFiles, test=False, threaded=True)
        url = DEP_URL.format(id=station_id)
        resp = requests.get(url, headers={
            'X-MVG-Authorization-Key': self.config.meetup.mvg.key})
        self.logger.debug("response:\n%s", resp.content)
        if resp.status_code != 200:
            raise RuntimeError(
                "MVG API returned [%s]" % resp.status_code)
        data = set()
        for r in resp.json()["departures"]:
            data.add((
                station_id,
                r["product"],
                r["label"],
                r["lineBackgroundColor"],
                r["destination"],
                datetime.datetime.fromtimestamp(
                    r["departureTime"] / 1000).replace(second=0),
                int(distance),
                name
            ))
        return data


if __name__ == '__main__':
    from core4.queue.helper.functool import execute
    execute(MyJob)
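# Note on the timestamp handling above: departureTime is divided by 1000
# before datetime.fromtimestamp, i.e. the MVG API evidently reports epoch
# milliseconds; .replace(second=0) then truncates to minute precision so
# that repeated polls of the same departure collapse in the set.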
        if running > 0:
            total = float(math.ceil((end - start + 1) / size))
            p = 1. - running / total
            self.progress(p, "%d of %d running", running, total)
            self.defer("check %f: waiting for %d of %d jobs to complete",
                       p, running, total)
        self.logger.info("check: seen all jobs complete")
        return

        # calculating (the guard that selects between the check branch
        # above and this branch is outside this excerpt)
        coll = self.config.mypro.prime_collection
        self.set_source(self.started_at.isoformat())
        n = 0
        for i in range(start, end):
            if check_prime(i):
                n += 1
                try:
                    coll.insert_one({"_id": i})
                except DuplicateKeyError:
                    pass
            self.progress(i / end)
        self.logger.debug("found [%d] primes", n)


if __name__ == '__main__':
    from core4.queue.helper.functool import execute
    execute(PrimeJob, start=1, end=3000000, size=500000)
    # execute(PrimeJob, start=1, end=100000, mid="5c7445f0ad7071140796f3c6")
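# check_prime is not defined in this excerpt; a straightforward
# trial-division sketch matching its use above:
import math

def check_prime(n):
    """Return True if n is prime."""
    if n < 2:
        return False
    for k in range(2, int(math.sqrt(n)) + 1):
        if n % k == 0:
            return False
    return True

# e.g. check_prime(7) is True, check_prime(9) is False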