예제 #1
0
def eventize_and_enrich(commits, git_enrich):
    """Turn commits into events and enrich them with file information.

    The commits are eventized through ``Git(commits, git_enrich).eventize(2)``.
    When at least one event is produced, rows whose ``filepath`` equals "-"
    are filtered and the ``FileType``, ``FilePath`` and ``ToUTF8`` enrichers
    are applied, in that order.

    :param commits: commits to process; handed to ``Git`` as they are.
    :param git_enrich: handed to ``Git`` as it is.
    :returns: the events produced by the last step that was applied
        (presumably a pandas DataFrame -- confirm against ``Git.eventize``).
    """
    logging.info("New commits: " + str(len(commits)))

    # Create events from commits
    # TODO add tests for eventize method
    git_events = Git(commits, git_enrich)
    events_df = git_events.eventize(2)

    logging.info("New events: " + str(len(events_df)))

    # Skip filtering/enrichment when there are no events: FilterRows.filter_
    # raises ValueError when the requested column is not in the DataFrame,
    # which is presumably the case for an empty eventize result.
    if len(events_df) > 0:
        # Filter information
        data_filtered = FilterRows(events_df)
        events_df = data_filtered.filter_(["filepath"], "-")

        logging.info("New events filtered: " + str(len(events_df)))

        # Add filetype info
        enriched_filetype = FileType(events_df)
        events_df = enriched_filetype.enrich('filepath')

        logging.info("New Filetype events: " + str(len(events_df)))

        # Split filepath info
        enriched_filepath = FilePath(events_df)
        events_df = enriched_filepath.enrich('filepath')

        logging.info("New Filepath events: " + str(len(events_df)))

        # Deal with surrogates
        convert = ToUTF8(events_df)
        events_df = convert.enrich(["owner"])

    logging.info("Final new events: " + str(len(events_df)))

    return events_df
예제 #2
0
 def test_column_not_exists(self):
     """ Test that filtering an empty, column-less DataFrame raises the
     corresponding ValueError exception
     """
     df = pandas.DataFrame()
     data_filtered = FilterRows(df)
     # Raw string: "\[" and "\]" are regex escapes, not valid string escape
     # sequences, so a plain literal triggers an invalid-escape warning.
     with self.assertRaisesRegex(
             ValueError,
             r"Column filepath not in DataFrame columns: \[\]"):
         data_filtered.filter_(["filepath"], "-")
예제 #3
0
    def process(self, items_block):
        """Eventize a block of items and attach file related information.

        Each item is turned into events by ``Git(...).eventize(2)``. When any
        event is produced, rows whose ``filepath`` is "-" are filtered, the
        ``message`` column is cut to ``AreasOfCode.MESSAGE_MAX_SIZE``
        characters, file type and file path fields are added, ``origin`` is
        copied from ``repository`` and ``owner`` goes through ``ToUTF8``.

        :param items_block: items to be processed. Expects to find ElasticSearch hits _source part only.
        :returns: ``self.ProcessResults`` with the number of resulting events
            as ``processed`` and the events themselves as ``out_items``.
        """

        prefix = self.__log_prefix
        logger.debug("{} New commits: {}".format(prefix, len(items_block)))

        # Build the events out of the incoming commits
        events = Git(items_block, self._git_enrich).eventize(2)
        logger.debug("{} New events: {}".format(prefix, len(events)))

        if len(events) > 0:
            # Filter rows by their "filepath" value
            events = FilterRows(events).filter_(["filepath"], "-")
            logger.debug("{} New events filtered: {}".format(
                prefix, len(events)))

            # Keep only the leading part of each message
            messages = events['message']
            events['message'] = messages.str.slice(
                stop=AreasOfCode.MESSAGE_MAX_SIZE)
            logger.debug("{} Remove message content".format(prefix))

            # File type fields
            events = FileType(events).enrich('filepath')
            logger.debug("{} New Filetype events: {}".format(
                prefix, len(events)))

            # File path fields
            events = FilePath(events).enrich('filepath')
            logger.debug("{} New Filepath events: {}".format(
                prefix, len(events)))

            events['origin'] = events['repository']

            # Surrogates handling for the owner field
            events = ToUTF8(events).enrich(["owner"])

        logger.debug("{} Final new events: {}".format(prefix, len(events)))

        return self.ProcessResults(processed=len(events), out_items=events)
예제 #4
0
    def test_filter_rows(self):
        """ Test several cases for filtering rows by column value
        """

        # One column, values of different types.
        # float('nan') replaces pandas.np.nan: the pandas.np alias was
        # deprecated in pandas 1.0 and later removed, while the value itself
        # is the same float NaN.
        df = pandas.DataFrame()
        filepaths = [
            '', None, '-', '/file/path', 1, True, float('nan'), '-', [1, 2]
        ]
        df["filepath"] = filepaths
        data_filtered = FilterRows(df)
        df = data_filtered.filter_(["filepath"], "-")

        # Only the two "-" entries out of nine are expected to be removed
        self.assertEqual(len(df), 7)

        # One empty column
        df = pandas.DataFrame()
        df["filepath"] = []
        data_filtered = FilterRows(df)
        df = data_filtered.filter_(["filepath"], "-")

        self.assertEqual(len(df), 0)

        # Several columns and just one empty
        df = pandas.DataFrame()
        df["filepath"] = []
        df["name"] = ["name", "-", "other", "-"]
        df["dirname"] = ["dir", "-", "-", "-"]
        data_filtered = FilterRows(df)
        df = data_filtered.filter_(["filepath", "name", "dirname"], "-")

        # Only the first row has no "-" in any of the filtered columns
        self.assertEqual(len(df), 1)
예제 #5
0
    def process(self, items_block):
        """Eventize a block of items and attach file related information.

        Each item is turned into events by ``Git(...).eventize(2)``. When any
        event is produced, rows whose ``filepath`` is "-" are filtered, file
        type and file path fields are added and ``owner`` goes through
        ``ToUTF8``.

        :param items_block: items to be processed. Expects to find ElasticSearch hits _source part only.
        :returns: ``self.ProcessResults`` with the number of resulting events
            as ``processed`` and the events themselves as ``out_items``.
        """

        logger.info("New commits: " + str(len(items_block)))

        # Build the events out of the incoming commits
        events = Git(items_block, self._git_enrich).eventize(2)
        logger.info("New events: " + str(len(events)))

        if len(events) > 0:
            # Filter rows by their "filepath" value
            events = FilterRows(events).filter_(["filepath"], "-")
            logger.info("New events filtered: " + str(len(events)))

            # File type fields
            events = FileType(events).enrich('filepath')
            logger.info("New Filetype events: " + str(len(events)))

            # File path fields
            events = FilePath(events).enrich('filepath')
            logger.info("New Filepath events: " + str(len(events)))

            # Surrogates handling for the owner field
            events = ToUTF8(events).enrich(["owner"])

        logger.info("Final new events: " + str(len(events)))

        return self.ProcessResults(processed=len(events), out_items=events)