    def test_sleep_for_rate_limit(self):
        """Test that the client does not sleep when the time to reset is negative"""

        client = MockedClient(CLIENT_API_URL, sleep_time=0.1, max_retries=1,
                              min_rate_to_sleep=100,
                              sleep_for_rate=True)
        client.rate_limit = 50
        client.rate_limit_reset_ts = -1

        before = datetime_utcnow().replace(microsecond=0).timestamp()
        client.sleep_for_rate_limit()
        after = datetime_utcnow().replace(microsecond=0).timestamp()

        self.assertEqual(before, after)
Example #2
    def test_archived_after(self):
        """Test if only those items archived after a date are returned"""

        manager = ArchiveManager(self.test_path)

        category = 'mock_item'
        args = {
            'origin': 'http://example.com/',
            'tag': 'test',
            'subtype': 'mocksubtype',
            'from-date': str_to_datetime('2015-01-01')
        }

        items = fetch(CommandBackend, args, category, manager=manager)
        items = [item for item in items]
        self.assertEqual(len(items), 5)

        archived_dt = datetime_utcnow()

        items = fetch(CommandBackend, args, category, manager=manager)
        items = [item for item in items]
        self.assertEqual(len(items), 5)

        # Fetch items from the archive
        items = fetch_from_archive(CommandBackend, args, manager, category,
                                   str_to_datetime('1970-01-01'))
        items = [item for item in items]
        self.assertEqual(len(items), 10)

        # Fetch items archived after the given date
        items = fetch_from_archive(CommandBackend, args, manager, category,
                                   archived_dt)
        items = [item for item in items]
        self.assertEqual(len(items), 5)
Example #3
 def __init__(self, type, job_id, task_id, payload):
     self.uuid = str(uuid.uuid4())
     self.timestamp = datetime_utcnow()
     self.type = type
     self.job_id = job_id
     self.task_id = task_id
     self.payload = payload
Example #4
    def metadata(self, item, filter_classified=False):
        """Add metadata to an item.

        It adds metadata to a given item such as how and
        when it was fetched. The contents from the original item will
        be stored under the 'data' keyword.

        :param item: an item fetched by a backend
        :param filter_classified: sets if classified fields were filtered
        """
        item = {
            'backend_name': self.__class__.__name__,
            'backend_version': self.version,
            'perceval_version': __version__,
            'timestamp': datetime_utcnow().timestamp(),
            'origin': self.origin,
            'uuid': uuid(self.origin, self.metadata_id(item)),
            'updated_on': self.metadata_updated_on(item),
            'classified_fields_filtered': self.classified_fields if filter_classified else None,
            'category': self.metadata_category(item),
            'search_fields': self.search_fields(item),
            'tag': self.tag,
            'data': item,
        }

        return item
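
A minimal usage sketch of the wrapper above, assuming a hypothetical MyBackend subclass that implements the abstract helpers (metadata_id, metadata_updated_on, metadata_category, search_fields); the class name and the raw item are illustrative only:

    # Hypothetical usage; MyBackend and the raw item are illustrative.
    backend = MyBackend('http://example.com/', tag='test')

    raw_item = {'id': '42', 'updated': '2016-01-01'}
    item = backend.metadata(raw_item)

    # The original payload is kept under 'data', with provenance fields
    # (origin, uuid, timestamp, category, tag, ...) added around it.
    assert item['data'] == raw_item
    assert item['origin'] == 'http://example.com/'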
Example #5
    def __init__(self,
                 task_id,
                 backend,
                 category,
                 backend_args,
                 archiving_cfg=None,
                 scheduling_cfg=None):
        try:
            bklass = perceval.backend.find_backends(
                perceval.backends)[0][backend]
        except KeyError:
            raise NotFoundError(element=backend)

        self._task_id = task_id
        self._has_resuming = bklass.has_resuming()

        self.status = TaskStatus.NEW
        self.age = 0
        self.num_failures = 0
        self.jobs = []
        self.created_on = datetime_utcnow().timestamp()
        self.backend = backend
        self.category = category
        self.backend_args = backend_args
        self.archiving_cfg = archiving_cfg if archiving_cfg else None
        self.scheduling_cfg = scheduling_cfg if scheduling_cfg else None
Example #6
    def fetch_items(self, category, **kwargs):
        """Fetch the entries.

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """
        logger.info("Looking for a mkt form at '%s'", self.origin)

        nentries = 0

        entries = self.client.get_entries()

        for item in _parse_entries(entries):
            # Need to pass which columns are IDs to metadata_id static function
            ret = {'_id_columns': ID_COLUMNS}

            for i, column in enumerate(CSV_HEADER.split(',')):
                value = item[i]
                if isinstance(item[i], str):
                    value = item[i].strip()

                ret[column.strip()] = value

            ret['timestamp'] = datetime_utcnow().isoformat()
            yield ret
            nentries += 1

        logger.info("Done. %s form entries fetched", nentries)
    def __parse_hits(self, hit_raw):
        """Parse the hits returned by the Google Search API"""

        # Create the soup and get the desired div
        bs_result = bs4.BeautifulSoup(hit_raw, 'html.parser')
        hit_string = bs_result.find("div", id="resultStats").text

        # Remove commas or dots
        hit_string = hit_string.replace(',', u'')
        hit_string = hit_string.replace('.', u'')

        fetched_on = datetime_utcnow().timestamp()
        id_args = self.keywords[:]
        id_args.append(str(fetched_on))

        hits_json = {
            'fetched_on': fetched_on,
            'id': uuid(*id_args),
            'keywords': self.keywords,
            'type': 'googleSearchHits'
        }

        if not hit_string:
            logger.warning("No hits for %s", self.keywords)
            hits_json['hits'] = 0

            return hits_json

        str_hits = re.search(r'\d+', hit_string).group(0)
        hits = int(str_hits)
        hits_json['hits'] = hits

        return hits_json
Example #8
    def __get_rich_bugs(self, data):
        """Create enriched data for bugs"""

        rich_bugtask = {}

        # Time-to metrics
        if not data["is_complete"]:
            rich_bugtask["time_open_days"] = get_time_diff_days(
                data['date_created'],
                datetime_utcnow().replace(tzinfo=None))
        else:
            rich_bugtask["time_open_days"] = get_time_diff_days(
                data['date_created'], data['date_closed'])
            rich_bugtask["time_created_to_assigned"] = get_time_diff_days(
                data['date_created'], data['date_assigned'])
            rich_bugtask['time_assigned_to_closed'] = get_time_diff_days(
                data['date_assigned'], data['date_closed'])
            rich_bugtask["time_to_close_days"] = get_time_diff_days(
                data['date_created'], data['date_closed'])

        if data['activity_data']:
            rich_bugtask['time_to_last_update_days'] = \
                get_time_diff_days(data['date_created'], data['activity_data'][-1]['datechanged'])

        rich_bugtask['reopened'] = 1 if data['date_left_closed'] else 0
        rich_bugtask['time_to_fix_commit'] = get_time_diff_days(
            data['date_created'], data['date_fix_committed'])
        rich_bugtask['time_worked_on'] = get_time_diff_days(
            data['date_in_progress'], data['date_fix_committed'])
        rich_bugtask['time_to_confirm'] = get_time_diff_days(
            data['date_created'], data['date_confirmed'])

        # Author and assignee data
        owner = data.get('owner_data', None)
        if owner:
            rich_bugtask['user_login'] = owner.get('name', None)
            rich_bugtask['user_name'] = owner.get('display_name', None)
            rich_bugtask['user_joined'] = owner.get('date_created', None)
            rich_bugtask['user_karma'] = owner.get('karma', None)
            rich_bugtask['user_time_zone'] = owner.get('time_zone', None)

        assignee = data.get('assignee_data', None)
        if assignee:
            rich_bugtask['assignee_login'] = assignee.get('name', None)
            rich_bugtask['assignee_name'] = assignee.get('display_name', None)
            rich_bugtask['assignee_joined'] = assignee.get(
                'date_created', None)
            rich_bugtask['assignee_karma'] = assignee.get('karma', None)
            rich_bugtask['assignee_time_zone'] = assignee.get(
                'time_zone', None)

        # Extract info related to bug
        rich_bugtask.update(self.__extract_bug_info(data['bug_data']))

        rich_bugtask['time_to_first_attention'] = \
            get_time_diff_days(data['date_created'], self.get_time_to_first_attention(data))
        rich_bugtask['activity_count'] = len(data['activity_data'])

        return rich_bugtask
Example #9
    def fetch_items(self, category, **kwargs):
        """Fetch the metrics

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """
        from_date = kwargs['from_date']

        nmetrics = 0
        component_metrics_raw = self.client.component_metrics(from_date=from_date)

        component = json.loads(component_metrics_raw)['component']
        for metric in component['measures']:

            fetched_on = datetime_utcnow().timestamp()
            id_args = [component['key'], metric['metric'], str(fetched_on)]
            metric['id'] = uuid(*id_args)
            metric['fetched_on'] = fetched_on

            yield metric
            nmetrics += 1

        logger.info("Fetch process completed: %s metrics fetched", nmetrics)
    def calculate_time_to_reset(self):
        """Number of seconds to wait. They are contained in the rate limit reset header"""

        time_to_reset = self.rate_limit_reset_ts - (datetime_utcnow().replace(microsecond=0).timestamp() + 1)
        time_to_reset = 0 if time_to_reset < 0 else time_to_reset

        return time_to_reset
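
As a worked illustration of the arithmetic above (an assumed sketch, not part of the original snippet), with rate_limit_reset_ts expressed in epoch seconds:

    # Illustrative numbers only: why a reset timestamp in the past yields 0.
    now = 1500000000                      # current epoch seconds, no microseconds
    reset_ts = 1500000120                 # rate limit resets two minutes from now
    print(reset_ts - (now + 1))           # 119 seconds to wait

    reset_ts = 1499999990                 # reset already happened
    print(max(0, reset_ts - (now + 1)))   # clamped to 0, no waiting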
Example #11
    def test_datetime_utcnow(self):
        """Check whether timezone information is added"""

        now = datetime_utcnow()
        timezone = str(now.tzinfo)
        expected = "UTC+00:00"

        self.assertEqual(timezone, expected)
Example #12
    def __fetch_summary(self):
        """Fetch summary"""

        raw_summary = self.client.summary()
        summary = json.loads(raw_summary)
        summary['fetched_on'] = str(datetime_utcnow())

        yield summary
Example #13
    def enrich_git_branches(self, ocean_backend, enrich_backend, run_month_days=[7, 14, 21, 28]):
        """Update the information about branches within the documents representing
        commits in the enriched index.

        The example below shows how to activate the study by modifying the setup.cfg. The study
        `enrich_git_branches` runs only on the days of the month listed in the parameter
        `run_month_days`; by default these are 7, 14, 21, and 28.

        ```
        [git]
        raw_index = git_raw
        enriched_index = git_enriched
        ...
        studies = [enrich_git_branches]

        [enrich_git_branches]
        run_month_days = [5, 22]
        ```

        :param ocean_backend: the ocean backend
        :param enrich_backend: the enrich backend
        :param run_month_days: days of the month to run this study
        """
        logger.debug("[git] study git-branches start")
        day = datetime_utcnow().day
        run_month_days = list(map(int, run_month_days))
        if day not in run_month_days:
            logger.debug("[git] study git-branches will execute only the days {} of each month".format(run_month_days))
            logger.debug("[git] study git-branches end")
            return

        for ds in self.prjs_map:
            if ds != "git":
                continue

            urls = self.prjs_map[ds]

            for url in urls:
                cmd = GitCommand(*[url])

                git_repo = GitRepository(cmd.parsed_args.uri, cmd.parsed_args.gitpath)

                logger.debug("[git] study git-branches delete branch info for repo {} in index {}".format(
                             git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))
                self.delete_commit_branches(git_repo, enrich_backend)

                logger.debug("[git] study git-branches add branch info for repo {} in index {}".format(
                             git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))
                try:
                    self.add_commit_branches(git_repo, enrich_backend)
                except Exception as e:
                    logger.error("[git] study git-branches failed on repo {}, due to {}".format(git_repo.uri, e))
                    continue

                logger.debug("[git] study git-branches repo {} in index {} processed".format(
                             git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))

        logger.debug("[git] study git-branches end")
Example #14
    def calculate_time_to_reset(self):
        """Calculate the seconds to reset the token requests, by obtaining the different
        between the current date and the next date when the token is fully regenerated.
        """

        time_to_reset = self.rate_limit_reset_ts - (datetime_utcnow().replace(microsecond=0).timestamp() + 1)
        time_to_reset = 0 if time_to_reset < 0 else time_to_reset

        return time_to_reset
Example #15
 def decorator(self, *args, **kwargs):
     eitem = func(self, *args, **kwargs)
     metadata = {
         'metadata__gelk_version': self.gelk_version,
         'metadata__gelk_backend_name': self.__class__.__name__,
         'metadata__enriched_on': datetime_utcnow().isoformat()
     }
     eitem.update(metadata)
     return eitem
Example #16
    def test_search_archived_after(self):
        """Check if a set of archives created after a given date are searched"""

        archive_mng_path = os.path.join(self.test_path, ARCHIVE_TEST_DIR)
        manager = ArchiveManager(archive_mng_path)

        # First set of archives to create
        metadata = [
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.8',
                'category': 'commit',
                'backend_params': {},
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'gerrit',
                'backend_version': '0.1',
                'category': 'changes',
                'backend_params': {}
            },
        ]

        for meta in metadata:
            archive = manager.create_archive()
            archive.init_metadata(**meta)

        # Second set, archived after the date we'll use to search
        after_dt = datetime_utcnow()
        metadata = [
            {
                'origin': 'https://example.org',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            }
        ]

        for meta in metadata:
            archive = manager.create_archive()
            archive.init_metadata(**meta)
            meta['filepath'] = archive.archive_path

        archives = manager.search('https://example.com', 'git', 'commit',
                                  after_dt)

        expected = [metadata[1]['filepath']]
        self.assertListEqual(archives, expected)
Example #17
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the mbox files from the remote archiver.

        This method stores the archives in the path given during the
        initialization of this object.

        HyperKitty archives are accessed month by month and stored following
        the schema year-month. Archives are fetched from the given month
        till the current month.

        :param from_date: fetch archives that store messages
            equal or after the given date; only year and month values
            are compared

        :returns: a list of tuples, storing the links and paths of the
            fetched archives
        """
        logger.info("Downloading mboxes from '%s' to since %s",
                    self.client.base_url, str(from_date))
        logger.debug("Storing mboxes in '%s'", self.dirpath)

        self.client.fetch(self.client.base_url)

        from_date = datetime_to_utc(from_date)
        to_end = datetime_utcnow()
        to_end += dateutil.relativedelta.relativedelta(months=1)

        months = months_range(from_date, to_end)

        fetched = []

        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)

        tmbox = 0

        for dts in months:
            tmbox += 1
            start, end = dts[0], dts[1]
            filename = start.strftime("%Y-%m.mbox.gz")
            filepath = os.path.join(self.dirpath, filename)

            url = urijoin(self.client.base_url, 'export', filename)

            params = {
                'start': start.strftime("%Y-%m-%d"),
                'end': end.strftime("%Y-%m-%d")
            }

            success = self._download_archive(url, params, filepath)

            if success:
                fetched.append((url, filepath))

        logger.info("%s/%s MBoxes downloaded", len(fetched), tmbox)

        return fetched
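
A hedged usage sketch of the method above, assuming a fetcher object named fetcher built around this HyperKitty client (the name and the start date are illustrative):

    import datetime

    # Illustrative only: download monthly archives from January 2020 up to
    # (and including) the current month.
    fetched = fetcher.fetch(from_date=datetime.datetime(2020, 1, 1))
    for url, filepath in fetched:
        print(url, '->', filepath)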
Example #18
 def __init__(self, uri, archive=None, from_archive=False):
     if uri.startswith('file://'):
         self.file_path = uri.split('file://', 1)[1]
     else:
         self.file_path = (tempfile.mkdtemp() + "/perceval-ow2-mkt-backend-" +
                           str(datetime_utcnow()) + ".csv")
         super().__init__(uri, archive=archive, from_archive=from_archive)
         response = self.session.get(uri)
         with open(self.file_path, 'wb') as fobj:
             fobj.write(response.content)
Example #19
    def __fetch_repo_info(self):
        """Get repo info about stars, watchers and forks"""

        raw_repo = self.client.repo()
        repo = json.loads(raw_repo)

        fetched_on = datetime_utcnow()
        repo['fetched_on'] = fetched_on.timestamp()

        yield repo
Example #20
    def calculate_time_to_reset(self):
        """Number of seconds to wait. They are contained in the rate limit reset header"""

        current_epoch = (datetime_utcnow().replace(microsecond=0).timestamp() +
                         1) * 1000
        time_to_reset = (self.rate_limit_reset_ts - current_epoch) / 1000

        if time_to_reset < 0:
            time_to_reset = 0

        return time_to_reset
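
This variant assumes the reset value from the header is given in epoch milliseconds, so the current time is scaled up before subtracting and the result is scaled back down to seconds (an illustrative sketch, not part of the original snippet):

    # Illustrative numbers only: reset timestamp in milliseconds, result in seconds.
    rate_limit_reset_ts = 1500000120000           # reset time, epoch milliseconds
    current_epoch = (1500000000 + 1) * 1000       # "now" plus one second, in ms
    print((rate_limit_reset_ts - current_epoch) / 1000)   # 119.0 seconds to sleep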
Example #21
    def enrich_onion(self, enrich_backend, in_index, out_index, data_source,
                     contribs_field, timeframe_field, sort_on_field,
                     seconds=ONION_INTERVAL, no_incremental=False):

        log_prefix = "[" + data_source + "] study onion"

        logger.info(log_prefix + "  Starting study - Input: " + in_index + " Output: " + out_index)

        # Creating connections
        es = Elasticsearch([enrich_backend.elastic.url], retry_on_timeout=True, timeout=100,
                           verify_certs=self.elastic.requests.verify)

        in_conn = ESOnionConnector(es_conn=es, es_index=in_index,
                                   contribs_field=contribs_field,
                                   timeframe_field=timeframe_field,
                                   sort_on_field=sort_on_field)
        out_conn = ESOnionConnector(es_conn=es, es_index=out_index,
                                    contribs_field=contribs_field,
                                    timeframe_field=timeframe_field,
                                    sort_on_field=sort_on_field,
                                    read_only=False)

        if not in_conn.exists():
            logger.info(log_prefix + " Missing index %s", in_index)
            return

        # Check last execution date
        latest_date = None
        if out_conn.exists():
            latest_date = out_conn.latest_enrichment_date()

        if latest_date:
            logger.info(log_prefix + " Latest enrichment date: " + latest_date.isoformat())
            update_after = latest_date + timedelta(seconds=seconds)
            logger.info(log_prefix + " Update after date: " + update_after.isoformat())
            if update_after >= datetime_utcnow():
                logger.info(log_prefix + " Too soon to update. Next update will be at " + update_after.isoformat())
                return

        # Onion currently does not support incremental option
        logger.info(log_prefix + " Creating out ES index")
        # Initialize out index
        filename = pkg_resources.resource_filename('grimoire_elk', 'enriched/mappings/onion.json')
        out_conn.create_index(filename, delete=out_conn.exists())

        onion_study(in_conn=in_conn, out_conn=out_conn, data_source=data_source)

        # Create alias if output index exists (index is always created from scratch, so
        # the alias needs to be created each time)
        if out_conn.exists() and not out_conn.exists_alias(out_index, ONION_ALIAS):
            logger.info(log_prefix + " Creating alias: %s", ONION_ALIAS)
            out_conn.create_alias(ONION_ALIAS)

        logger.info(log_prefix + " This is the end.")
def main():
    """This script downloads and processes the archives from githubarchive.com
    between two dates (--from-date and --to-date) to a folder (--folder). It returns a CSV
    file (--output), which contains the pull requests and issues opened by a set of GitHub users
    (included in the file --usernames).
    """
    logging.getLogger().setLevel(logging.DEBUG)

    args = parser(sys.argv[1:])

    folder = args.folder
    download = args.download
    from_date = args.from_date
    to_date = args.to_date
    usernames = args.usernames
    output = args.output

    start_time = datetime_utcnow().isoformat()
    logging.debug("script started at: %s", start_time)

    if not os.path.exists(folder):
        os.makedirs(folder)

    if download:
        download_archives(folder, from_date, to_date)

    if not os.path.exists(output):
        with open(output, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([g for g in SCHEMA])

    with open(usernames, 'r') as content:
        for line in content:
            activities = process_archives(folder, from_date, to_date, line.strip())

            with open(output, 'a') as csvfile:
                writer = csv.writer(csvfile, delimiter=',')
                writer.writerows(activities)

    end_time = datetime_utcnow().isoformat()
    logging.debug("script ended at: %s", end_time)
Example #23
    def init_metadata(self, origin, backend_name, backend_version, category,
                      backend_params):
        """Init metadata information.

        Metadata is composed of basic information needed to identify
        where archived data came from and how it can be retrieved
        and built into Perceval items.

        :param origin: identifier of the repository
        :param backend_name: name of the backend
        :param backend_version: version of the backend
        :param category: category of the items fetched
        :param backend_params: dict representation of the fetch parameters

        :raises ArchiveError: when an error occurs initializing the metadata
        """
        created_on = datetime_to_utc(datetime_utcnow())
        created_on_dumped = created_on.isoformat()
        backend_params_dumped = pickle.dumps(backend_params, 0)

        metadata = (
            origin,
            backend_name,
            backend_version,
            category,
            backend_params_dumped,
            created_on_dumped,
        )

        try:
            cursor = self._db.cursor()
            insert_stmt = "INSERT INTO " + self.METADATA_TABLE + " "\
                          "(origin, backend_name, backend_version, " \
                          "category, backend_params, created_on) " \
                          "VALUES (?, ?, ?, ?, ?, ?)"
            cursor.execute(insert_stmt, metadata)

            self._db.commit()
            cursor.close()
        except sqlite3.DatabaseError as e:
            msg = "metadata initialization error; cause: %s" % str(e)
            raise ArchiveError(cause=msg)

        self.origin = origin
        self.backend_name = backend_name
        self.backend_version = backend_version
        self.category = category
        self.backend_params = backend_params
        self.created_on = created_on

        logger.debug("Metadata of archive %s initialized to %s",
                     self.archive_path, metadata)
Example #24
    def calculate_time_to_reset(self):
        """Number of seconds to wait.

        The time is obtained from the difference between the current date
        and the next date when the token is fully regenerated.
        """
        current_epoch = datetime_utcnow().replace(microsecond=0).timestamp() + 1
        time_to_reset = self.rate_limit_reset_ts - current_epoch

        if time_to_reset < 0:
            time_to_reset = 0

        return time_to_reset
Example #25
    def test_init_metadata(self):
        """Test whether metadata information is properly initialized"""

        archive_path = os.path.join(self.test_path, 'myarchive')
        archive = Archive.create(archive_path)

        before_dt = datetime_to_utc(datetime_utcnow())
        archive.init_metadata('marvel.com', 'marvel-comics-backend', '0.1.0',
                              'issue', {'from_date': before_dt})
        after_dt = datetime_to_utc(datetime_utcnow())

        archive_copy = Archive(archive_path)

        # Both copies should have the same parameters
        for arch in [archive, archive_copy]:
            self.assertEqual(arch.origin, 'marvel.com')
            self.assertEqual(arch.backend_name, 'marvel-comics-backend')
            self.assertEqual(arch.backend_version, '0.1.0')
            self.assertEqual(arch.category, 'issue')
            self.assertGreaterEqual(arch.created_on, before_dt)
            self.assertLessEqual(arch.created_on, after_dt)
            self.assertDictEqual(arch.backend_params, {'from_date': before_dt})
Example #26
    def test_initialization(self):
        """Test if the instance is correctly initialized"""

        dt_before = datetime_utcnow()
        event = JobEvent(JobEventType.COMPLETED, '1', None)
        dt_after = datetime_utcnow()

        self.assertEqual(event.type, JobEventType.COMPLETED)
        self.assertEqual(event.job_id, '1')
        self.assertEqual(event.payload, None)
        self.assertGreater(event.timestamp, dt_before)
        self.assertLess(event.timestamp, dt_after)

        dt_before = datetime_utcnow()
        event = JobEvent(JobEventType.FAILURE, '2', "Error")
        dt_after = datetime_utcnow()

        self.assertEqual(event.type, JobEventType.FAILURE)
        self.assertEqual(event.job_id, '2')
        self.assertEqual(event.payload, "Error")
        self.assertGreater(event.timestamp, dt_before)
        self.assertLess(event.timestamp, dt_after)
Example #27
    def test_decorator(self):
        backend = MockedBackend('test', 'mytag')
        before = datetime_utcnow().timestamp()
        items = [item for item in backend.fetch()]
        after = datetime_utcnow().timestamp()

        for x in range(5):
            item = items[x]

            expected_uuid = uuid('test', str(x))

            self.assertEqual(item['data']['item'], x)
            self.assertEqual(item['backend_name'], 'MockedBackend')
            self.assertEqual(item['backend_version'], '0.2.0')
            self.assertEqual(item['perceval_version'], __version__)
            self.assertEqual(item['origin'], 'test')
            self.assertEqual(item['uuid'], expected_uuid)
            self.assertEqual(item['updated_on'], '2016-01-01')
            self.assertEqual(item['category'], 'mock_item')
            self.assertEqual(item['tag'], 'mytag')
            self.assertGreater(item['timestamp'], before)
            self.assertLess(item['timestamp'], after)

            before = item['timestamp']
Example #28
    def test_calculate_time_to_reset(self):
        """Test whether the time to reset is zero if the sleep time is negative"""

        httpretty.register_uri(httpretty.GET,
                               GITLAB_URL_PROJECT,
                               body='',
                               status=200,
                               forcing_headers={
                                   'RateLimit-Remaining': '20',
                                   'RateLimit-Reset': int(datetime_utcnow().replace(microsecond=0).timestamp())
                               })

        client = GitLabClient("fdroid", "fdroiddata", "your-token")
        time_to_reset = client.calculate_time_to_reset()

        self.assertEqual(time_to_reset, 0)
Example #29
    def delete_items(self, hours_to_retain, time_field="metadata__updated_on"):
        """Delete documents updated before a given date

        :param hours_to_retain: maximum number of hours, relative to the current date, to retain the data
        :param time_field: time field to delete the data
        """
        if hours_to_retain is None:
            logger.debug(
                "Data retention policy disabled, no items will be deleted.")
            return

        if hours_to_retain <= 0:
            logger.debug("Hours to retain must be greater than 0.")
            return

        before_date = datetime_utcnow() - datetime.timedelta(
            hours=hours_to_retain)
        before_date = before_date.replace(minute=0, second=0, microsecond=0)
        before_date_str = before_date.isoformat()

        es_query = '''
                    {
                      "query": {
                        "range": {
                            "%s": {
                                "lte": "%s"
                            }
                        }
                      }
                    }
                    ''' % (time_field, before_date_str)

        r = self.requests.post(self.index_url + "/_delete_by_query?refresh",
                               data=es_query,
                               headers=HEADER_JSON,
                               verify=False)
        try:
            r.raise_for_status()
            r_json = r.json()
            logger.debug("%s items deleted from %s before %s.",
                         r_json['deleted'], self.anonymize_url(self.index_url),
                         before_date)
        except requests.exceptions.HTTPError as ex:
            logger.error("Error deleted items from %s.",
                         self.anonymize_url(self.index_url))
            logger.error(ex)
            return
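
A hedged usage sketch, assuming an instance of this ElasticSearch wrapper named elastic (the name is illustrative):

    # Illustrative only: drop documents whose metadata__updated_on is older
    # than 90 days (90 * 24 hours), using the default time field.
    elastic.delete_items(hours_to_retain=90 * 24)

    # Passing None leaves the index untouched (retention policy disabled).
    elastic.delete_items(hours_to_retain=None)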
Example #30
    def test_search(self):
        """Test if a set of archives is found based on the given criteria"""

        archive_mng_path = os.path.join(self.test_path, ARCHIVE_TEST_DIR)
        manager = ArchiveManager(archive_mng_path)

        dt = datetime_utcnow()
        metadata = [
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.8',
                'category': 'commit',
                'backend_params': {},
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'gerrit',
                'backend_version': '0.1',
                'category': 'changes',
                'backend_params': {}
            },
            {
                'origin': 'https://example.org',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            }
        ]

        for meta in metadata:
            archive = manager.create_archive()
            archive.init_metadata(**meta)
            meta['filepath'] = archive.archive_path

        archives = manager.search('https://example.com', 'git', 'commit', dt)

        expected = [metadata[0]['filepath'], metadata[3]['filepath']]
        self.assertListEqual(archives, expected)