Exemple #1
0
def test_change_source():
    """
    Run ``Job4`` and verify the source bookkeeping of its documents.

    The job must finish in state ``complete`` and every document found in
    the test collection must carry a ``source`` value equal to its ``_src``
    value.
    """
    result = execute(Job4)
    assert result["state"] == "complete"
    collection = Job4().config.tests.test_collection
    for record in collection.find():
        assert record["source"] == record["_src"]
Exemple #2
0
def test_multi_source():
    """
    Run ``Job3`` and verify the bookkeeping for a job with several sources.

    The job must finish in state ``complete``; every document in the test
    collection must have ``source`` equal to ``_src`` and ``_job_id`` equal
    to the ``_id`` of the executed job; the job document must list exactly
    the sources ``test1.txt`` and ``test2.txt``.
    """
    result = execute(Job3)
    assert result["state"] == "complete"
    collection = Job3().config.tests.test_collection
    for record in collection.find():
        assert record["source"] == record["_src"]
        assert record["_job_id"] == result["_id"]
    expected_sources = ['test1.txt', 'test2.txt']
    assert sorted(result["sources"]) == expected_sources
Exemple #3
0
def test_job_collection():
    """Run ``Job2`` and expect it to finish in state ``complete``."""
    outcome = execute(Job2)
    final_state = outcome["state"]
    assert final_state == "complete"
Exemple #4
0
        res = r.func1(collection, db, url)
        #self.logger.info('Dataframe size is %s', res.shape)
        return res

    def func2(self, r):
        """
        Prepare the collection data with pandas, then run the R function
        ``func2`` defined in ``script1.R``.

        The dataframe is built from all documents of ``self.data``: NaN
        values are replaced with 0, the ``Monat`` column is formatted as
        ``%Y-%m-%d`` strings, ``%`` in column names becomes `` percent`` and
        accents are removed from the column names and from the values of
        the text columns.

        NOTE(review): the prepared dataframe is not handed over to R in this
        method (the python-to-R conversion is disabled) -- confirm that the
        R function fetches its own data.

        :param r: r session with required libraries
        :return: the result of the R function ``func2``
        """
        text_columns = ['Grundgesamtheit', 'Titel', 'Analyse']
        records = list(self.data.find())
        frame = pd.DataFrame(records).replace(np.nan, 0)
        frame['Monat'] = frame['Monat'].dt.strftime('%Y-%m-%d')
        # normalise the column labels first, the text columns are looked up
        # by their normalised names afterwards
        renamed = []
        for label in frame.columns:
            renamed.append(self.remove_accents(label.replace("%", " percent")))
        frame.columns = renamed
        for label in frame.columns:
            if label in text_columns:
                frame[label] = frame[label].apply(self.remove_accents)
        r.source('script1.R')
        return r.func2()

    def remove_accents(self, string):
        """
        Return ``string`` transliterated with ``unidecode.unidecode``.

        :param string: text to transliterate
        :return: the value returned by ``unidecode.unidecode``
        """
        transliterated = unidecode.unidecode(string)
        return transliterated

if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run RJob with it.
    from core4.queue.helper.functool import execute
    execute(RJob)
Exemple #5
0
def test_nojob_collection():
    """
    Run ``Job1`` and expect it to fail.

    The recorded exception must start with the ``AttributeError`` message
    about ``_id`` and ``_src`` being ``None`` and the job must end in state
    ``error``.
    """
    expected_prefix = """AttributeError('_id and _src must not be None"""
    outcome = execute(Job1)
    message = outcome["last_error"]["exception"]
    assert message.startswith(expected_prefix)
    assert outcome["state"] == "error"
Exemple #6
0
            df.Date = pd.to_datetime(df.Date, format="%Y-%m-%d")
            df = df[(df.Date >= start) & (df.Date <= end)]
            df.Date = df.Date.apply(lambda x: datetime.strftime(x, "%Y-%m-%d"))
        df['Kontakte Mio'].replace("--", None, inplace=True)
        df = df.replace(np.nan, 0)
        g = df.groupby(["Date"])['Kontakte Mio'].agg('sum')
        results['firstGraph'] = g.to_dict()
        self.logger.info("Data created for the first graph")

        # second graph
        df_new = df[df.Medientyp != 0]
        g1 = df_new.groupby(["Medientyp"])['Kontakte Mio'].agg('sum')
        results['secondGraph'] = g1.to_dict()
        self.logger.info("Data created for the second graph")

        # third graph
        df_new = df[df.Medientyp != 0]
        # Monthly contacts for each media group
        g2 = df_new.groupby(["Date", "Medientyp"])['Kontakte Mio'].agg('sum')
        results['thirdGraph'] = g2.reset_index().to_dict('rec')
        self.logger.info("Data created for the third graph")
        self.set_source(str(self._id))
        self.temp.insert_one(results)
        self.logger.info("inserted results")



if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run
    # ExtractFacts with test=False.
    from core4.queue.helper.functool import execute
    execute(ExtractFacts, test=False)
Exemple #7
0
        self.data = self.config.driverlicense.collection.data
        self.temp = self.config.driverlicense.collection.temp
        r_session = self.get_rsession()
        self.func1(r_session)

    def func1(self, r):
        """
        Run the R function ``func1`` from ``script1.R`` and store its
        results in ``self.temp``.

        The MongoDB url, database and collection name are read from the
        ``driverlicense`` configuration and passed to R, so the connection
        to MongoDB is opened on the R side (with the mongolite package,
        according to the original author -- the R script is not visible
        here).

        Each object returned by R is converted to a list of records and
        stored under the keys ``firstGraph``, ``secondGraph`` and
        ``thirdGraph`` (in the order returned) of a single document.

        :param r: r session with required libraries
        :return: None
        """
        url = self.config.driverlicense.mongo_url
        db = self.config.driverlicense.collection.data.database
        collection = self.config.driverlicense.collection.data.name
        r.source('script1.R')
        res = r.func1(collection, db, url)
        names = ['firstGraph', 'secondGraph', 'thirdGraph']
        results = {}
        for i, df in enumerate(res):
            # Spell out the orient: the abbreviation 'rec' is deprecated in
            # pandas and rejected from pandas 2.0 on (assumes the R session
            # returns pandas dataframes -- TODO confirm).
            results[names[i]] = df.to_dict('records')
        self.set_source(str(self._id))
        self.temp.insert_one(results)


if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run RJob3 with it.
    from core4.queue.helper.functool import execute

    execute(RJob3)
Exemple #8
0
                "created": {
                    "$gte": start,
                    "$lt": end
                },
                "message": re.compile("successful login"),
                "user": {
                    "$ne": "admin"
                }
            },
            sort=[("_id", -1)],
            projection=["created", "user"])
        data = list(cur)
        self.logger.debug("extracted [%d] records in [%s] - [%s]", len(data),
                          start, end)
        if data:
            self.set_source(str(start.date()))
            self.target_collection.update_one(filter={"_id": start},
                                              update={
                                                  "$set": {
                                                      "data": [(d["user"],
                                                                d["created"])
                                                               for d in data]
                                                  }
                                              },
                                              upsert=True)


if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run
    # AggregateCore4Usage with reset=True.
    from core4.queue.helper.functool import execute
    execute(AggregateCore4Usage, reset=True)
Exemple #9
0
        links = re.findall("href=[\"\'](.+?)[\"\']", body)
        xls_all = [
            href for href in links
            if href.endswith(".xls") or href.endswith(".xlsx")
        ]
        xls = [
            filename for filename in xls_all if "Angebote_Ranking" in filename
        ]
        self.logger.info("found [%d] xlsx files", len(xls))

        download = 0
        for link in xls:
            # check if file already exists in the database
            doc = self.gfs.find_one({"filename": link})
            if doc is None:
                # if not save the file to mongoDB
                self.logger.info("download [%s]", link)
                rv = requests.get(link)
                if not test:
                    self.gfs.put(rv.content, filename=link)
            download += 1
            self.progress(download / len(xls))
        self.logger.info("successfully retrieved [%d] of [%d] files", download,
                         len(xls))
        enqueue(ProcessFiles, concurrent=True)


if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run
    # ScrapeFacts with test=False.
    from core4.queue.helper.functool import execute
    execute(ScrapeFacts, test=False)
Exemple #10
0
import random
import time


class SwarmJob(CoreJob):
    """
    Job that sleeps for a random time and then randomly enqueues further
    instances of itself.
    """
    author = "mra"

    def execute(self,
                count=20,
                prob_launch=0.5,
                max_launch=3,
                min_sleep=3,
                max_sleep=10,
                **kwargs):
        """
        Sleep between ``min_sleep`` and ``max_sleep`` seconds, then enqueue
        follow-up jobs of the same class.

        If ``count`` is positive, a random number of attempts between 0 and
        ``count`` is drawn; each attempt enqueues a new job with probability
        ``prob_launch``. Child jobs receive ``count - 2``, the same
        ``prob_launch`` and ``max_launch`` and an ``id`` argument built from
        this job's ``_id`` and the attempt index.

        NOTE(review): ``max_launch`` is only forwarded, never used to limit
        the number of launches, and ``min_sleep``/``max_sleep`` are not
        forwarded to the children -- confirm both are intended.

        :param count: upper bound for the number of launch attempts
        :param prob_launch: probability to enqueue a job per attempt
        :param max_launch: forwarded to child jobs unchanged
        :param min_sleep: lower bound of the sleep time in seconds
        :param max_sleep: upper bound of the sleep time in seconds
        :param kwargs: additional arguments, ignored here
        """
        pause = random.randint(min_sleep, max_sleep)
        self.logger.info("sleeping [%d] seconds", pause)
        time.sleep(pause)
        if count <= 0:
            return
        attempts = random.randint(0, count)
        for attempt in range(attempts):
            if random.random() > prob_launch:
                continue
            child_id = "%s.%d" % (self._id, attempt)
            enqueue(self.__class__,
                    count=count - 2,
                    prob_launch=prob_launch,
                    max_launch=max_launch,
                    id=child_id)


if __name__ == '__main__':
    # Script entry point: run SwarmJob with count=3 and a sleep range of
    # 0 to 3 seconds. ``execute`` is expected to be imported at module
    # level (not visible in this excerpt).
    execute(SwarmJob, count=3, min_sleep=0, max_sleep=3)
Exemple #11
0
            "" if pd.isnull(c) else c.replace("\n", " ").replace(".", "")
            for c in cols
        ]
        if "" in d.columns:
            d.drop([""], axis=1, inplace=True)
        d["Analyse"] = analyse
        d["Grundgesamtheit"] = grundgesamtheit
        d["Zeitraum"] = zeitraum
        d["Vorfilter"] = vorfilter
        d["Zielgruppe"] = zielgruppe
        doc = d.to_dict("rec")
        n = 0
        d = self.target.delete_many({"_src": basename})
        if d.deleted_count > 0:
            self.logger.info("reset [%d] records for [%s]", d.deleted_count,
                             basename)
        for rec in doc:
            nrec = {}
            for k, v in rec.items():
                if not pd.isnull(v):
                    nrec[k] = v
            self.target.insert_one(nrec, _src=basename)
            n += 1
        self.logger.info("inserted [%d] records for [%s]", n, basename)


if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run
    # ExtractFacts with test=True. The ScrapeFacts call is left disabled.
    from core4.queue.helper.functool import execute
    #execute(ScrapeFacts, test=False)
    execute(ExtractFacts, test=True)
Exemple #12
0
                                                               "").split())
        d["Monat"] = [
            datetime.datetime.strptime("01." + MONAT[m[0]] + "." + m[1],
                                       "%d.%m.%Y") for m in monat]
        d["val"] = d["Kontakte Mio"].apply(pd.to_numeric,
                                                     errors='coerce')
        d['Date'] = d.Monat.apply(lambda x: x.date().isoformat())

        doc = d.to_dict("rec")
        n = 0
        # delete any previous version of the file in the database
        d = self.target.delete_many({"_src": basename})
        if d.deleted_count > 0:
            self.logger.info("reset [%d] records for [%s]", d.deleted_count,
                             basename)
        # insert the processed file in the database
        # each row of the dataframe is inserted as a record
        for rec in doc:
            nrec = {}
            for k, v in rec.items():
                if not pd.isnull(v):
                    nrec[k] = v
            self.target.insert_one(nrec)
            n += 1
        self.logger.info("inserted [%d] records for [%s]", n, basename)


if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run
    # ProcessFiles with test=False and threaded=True.
    from core4.queue.helper.functool import execute
    execute(ProcessFiles, test=False, threaded=True)
Exemple #13
0
        url = DEP_URL.format(id=station_id)
        resp = requests.get(url, headers={
            'X-MVG-Authorization-Key': self.config.meetup.mvg.key})

        self.logger.debug("response:\n%s", resp.content)
        if resp.status_code != 200:
            raise RuntimeError("MVG API returned [%s]", resp.status_code)

        data = set()
        for r in resp.json()["departures"]:
            data.add((
                station_id,
                r["product"],
                r["label"],
                r["lineBackgroundColor"],
                r["destination"],
                datetime.datetime.fromtimestamp(
                    r["departureTime"] / 1000).replace(second=0),
                int(distance),
                name
            ))

        return data


if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run MyJob with it.
    from core4.queue.helper.functool import execute

    execute(MyJob)

Exemple #14
0
            if running > 0:
                total = float(math.ceil((end - start + 1) / size))
                p = 1. - running / total
                self.progress(p, "%d of %d running", running, total)
                self.defer("check %f: waiting for %d of %d jobs to complete",
                           p, running, total)
            self.logger.info("check: seen all jobs complete")
            return

        # calculating
        coll = self.config.mypro.prime_collection
        self.set_source(self.started_at.isoformat())
        n = 0
        for i in range(start, end):
            if check_prime(i):
                n += 1
                try:
                    coll.insert_one({"_id": i})
                except DuplicateKeyError:
                    pass
                except:
                    raise
            self.progress(i / end)
        self.logger.debug("found [%d] primes", n)


if __name__ == '__main__':
    # Script entry point: import core4's execute helper and run PrimeJob
    # for the range start=1 to end=3000000 with size=500000. The second
    # call (with an explicit mid) is left disabled.
    from core4.queue.helper.functool import execute
    execute(PrimeJob, start=1, end=3000000, size=500000)
    # execute(PrimeJob, start=1, end=100000, mid="5c7445f0ad7071140796f3c6")