def rule_based(self, issues):
    """
    This method applies rule-based algorithms to predict labels
    Args:
        issues(list): a list of issue numbers
    Return:
        rule_based_predictions(list of lists): labels which satisfy the rules
    """
    DF = DataFetcher()
    df_test = DF.fetch_issues(issues)
    rule_based_predictions = []
    for i in range(len(issues)):
        # extract every issue's title
        row = df_test.loc[i, 'title']
        # apply rule-based algorithms
        single_issue_predictions = []
        if "feature request" in row.lower():
            single_issue_predictions.append("Feature")
        if "c++" in row.lower():
            single_issue_predictions.append("C++")
        tokens = self.tokenize(row)
        for k, v in self.keywords.items():
            for keyword in v:
                if keyword in tokens:
                    single_issue_predictions.append(k)
        rule_based_predictions.append(single_issue_predictions)
    return rule_based_predictions
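For reference, the keyword rule above can be exercised in isolation. The sketch below is not part of the original module: the shape of the keyword table (a dict mapping a label name to a list of lowercase tokens) and the whitespace tokenizer are assumptions standing in for self.keywords and self.tokenize.

def match_keywords(title, keywords):
    """Return a label for every keyword of that label found in the tokenized title."""
    tokens = title.lower().split()  # stand-in for self.tokenize(title)
    matched = []
    if "feature request" in title.lower():
        matched.append("Feature")
    for label, words in keywords.items():
        for word in words:
            if word in tokens:
                matched.append(label)
    return matched

# hypothetical keyword table and issue title, for illustration only
sample_keywords = {"Performance": ["slow", "latency"], "Doc": ["docs", "tutorial"]}
print(match_keywords("Training is slow on GPU", sample_keywords))  # ['Performance']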
def ml_predict(self, issues, threshold=0.3):
    """
    This method applies machine learning algorithms to predict labels
    Args:
        issues(list): a list of issue numbers
        threshold(float): probability threshold a prediction must exceed
    Return:
        ml_predictions(list of lists): predictions
    """
    # step 1: fetch data
    DF = DataFetcher()
    df_test = DF.fetch_issues(issues)
    # step 2: data cleaning
    SP = SentenceParser()
    SP.data = df_test
    SP.clean_body('body', True, True)
    SP.merge_column(['title', 'title', 'title', 'body'], 'train')
    test_text = SP.process_text('train', True, False, True)
    # step 3: word embedding (TF-IDF features)
    test_data_tfidf = self.tv.transform(test_text).toarray()
    le = LabelEncoder()
    le.fit_transform(self.labels)
    # step 4: classification
    probs = self.clf.predict_proba(test_data_tfidf)
    # pick the top 2 predictions that exceed the threshold
    best_n = np.argsort(probs, axis=1)[:, -2:]
    ml_predictions = []
    for i in range(len(best_n)):
        # example log line:
        # INFO:Predictor:issue:11919,Performance:0.47353076240017744,Question:0.2440056213336274
        logging.info("issue:{}, {}:{}, {}:{}".format(
            str(issues[i]),
            str(le.classes_[best_n[i][-1]]), str(probs[i][best_n[i][-1]]),
            str(le.classes_[best_n[i][-2]]), str(probs[i][best_n[i][-2]])))
        single_issue_predictions = [
            le.classes_[best_n[i][j]] for j in range(-1, -3, -1)
            if probs[i][best_n[i][j]] > threshold
        ]
        ml_predictions.append(single_issue_predictions)
    return ml_predictions
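The top-2, above-threshold selection in step 4 can be hard to read from the argsort indexing alone. The standalone sketch below reproduces it on a made-up probability matrix; the class names and values are illustrative, not real model output.

import numpy as np

classes = np.array(["Doc", "Feature", "Performance", "Question"])
probs = np.array([[0.05, 0.10, 0.47, 0.38],   # issue A
                  [0.60, 0.25, 0.10, 0.05]])  # issue B
threshold = 0.3

best_n = np.argsort(probs, axis=1)[:, -2:]    # indices of the two highest probabilities per row
for row, top2 in enumerate(best_n):
    # walk the two indices from highest to lowest and keep those above the threshold
    picked = [classes[j] for j in top2[::-1] if probs[row][j] > threshold]
    print(picked)
# ['Performance', 'Question']
# ['Doc']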
# Imports inferred from usage in this excerpt (the patch target and the pandas helpers).
import unittest
from unittest.mock import patch

import pandas as pd
from pandas.testing import assert_frame_equal

from DataFetcher import DataFetcher


class TestLabelBot(unittest.TestCase):

    def setUp(self):
        self.df = DataFetcher()
        self.df.repo = "apache/incubator-mxnet"
        self.df.github_user = "******"
        self.df.github_oauth_token = "123"

    def tearDown(self):
        pass

    def test_cleanstr(self):
        # "a_b" should be cleaned to "ab"
        new_string = self.df.cleanstr("a_b", "")
        self.assertEqual(new_string, "ab")

    def test_count_pages(self):
        # mock a single page of issues and expect count_pages to report one page
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [{
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11925,
                "labels": [{'name': 'Doc'}],
                "state": "open",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }, {
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11924,
                "labels": [],
                "state": "closed",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }]
            page = self.df.count_pages('all')
            self.assertEqual(page, 1)

    def test_fetch_issues(self):
        # mock a single-issue response and check the resulting DataFrame
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = {
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11925,
                "labels": [{'name': 'Feature'}],
                "state": "open",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }
            data = self.df.fetch_issues([11925])
            expected_data = [{
                'id': "11925",
                'title': "issue's title",
                'body': "issue's body"
            }]
            assert_frame_equal(data, pd.DataFrame(expected_data))

    def test_data2json(self):
        # mock two issues; only the open, 'Feature'-labelled one is expected in the JSON output
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [{
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11925,
                "labels": [{'name': 'Feature'}],
                "state": "open",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }, {
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11924,
                "labels": [],
                "state": "closed",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }]
            self.df.data2json('all', labels=["Feature"], other_labels=False)
            expected_data = [{
                'id': 11925,
                'title': "issue's title",
                'body': "issue's body",
                'labels': 'Feature'
            }]
            self.assertEqual(expected_data, self.df.json_data)
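The excerpt does not show a test runner; assuming this class lives in its own test module, the standard unittest entry point would be:

if __name__ == "__main__":
    # run the whole TestLabelBot suite when the module is executed directly
    unittest.main()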