def test_date_format(self):
    test = '{"born": "Thu, 1 Mar 2012"}'
    expected = datetime.datetime(2012, 3, 1)
    format = '%a, %d %b %Y'
    decoded = json.loads(test, datetime_format=format).get('born')
    self.assertIs(type(decoded), datetime.datetime)
    self.assertEqual(decoded, expected)
def test_equivalence(self, json_str, python):
    try:
        decoded = json.loads(json_str)
    except Exception as e:
        self.fail("Unexpected failure: %s" % e)
    self.assertEqual(decoded, python)
    back_again = json.dumps(python, sort_keys=True)
    self.assertEqual(back_again, json_str)
def find(self, resource, req):
    """Retrieves a set of documents matching a given request.

    Queries can be expressed in two different formats: the mongo query
    syntax and the python syntax. The first kind of query would look
    like::

        ?where={"name": "john doe"}

    while the second would look like::

        ?where=name=="john doe"

    The resultset is paginated.

    :param resource: resource name.
    :param req: a :class:`ParsedRequest` instance.
    """
    args = dict()
    args['limit'] = req.max_results

    if req.page > 1:
        args['skip'] = (req.page - 1) * req.max_results

    # TODO sort syntax should probably be coherent with 'where': either
    # mongo-like or python-like. Currently accepts only mongo-like sort
    # syntax.
    # TODO should validate on unknown sort fields (mongo driver doesn't
    # return an error)
    if req.sort:
        args['sort'] = ast.literal_eval(req.sort)

    spec = dict()
    if req.where:
        try:
            spec = json.loads(req.where)
        except ValueError:
            # Not valid JSON; fall back to the python syntax parser.
            try:
                spec = parse(req.where)
            except ParseError:
                abort(400)

    if req.if_modified_since:
        spec[config.LAST_UPDATED] = {'$gt': req.if_modified_since}

    if len(spec) > 0:
        args['spec'] = spec

    return self.driver.db[resource].find(**args)
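# Illustrative usage of find() above; a sketch, not part of the original
# module. FakeParsedRequest is a hypothetical stand-in exposing the
# attributes find() reads (max_results, page, sort, where,
# if_modified_since); the real ParsedRequest class may differ.
class FakeParsedRequest(object):
    max_results = 25
    page = 2                        # second page -> skip 25 documents
    sort = "[('name', 1)]"          # mongo-like sort, read via ast.literal_eval
    where = '{"name": "john doe"}'  # mongo syntax; python syntax also accepted
    if_modified_since = None

# cursor = datasource.find('people', FakeParsedRequest())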
def test_object_hook(self):
    def hook(dct):
        dct["hookjob"] = "I'm hooked!"
        return dct

    decoded = json.loads(
        '{"name": "John Doe", "born": "Thu, 1 Mar 2012 10:00:49 UTC"}',
        object_hook=hook,
    )
    self.assertEqual(
        decoded.get("born"),
        datetime.datetime(2012, 3, 1, 10, 0, 49, tzinfo=tzutc()),
    )
    self.assertIn("hookjob", decoded)
def parseBuyer(self, response):
    urlRequestRaw = response.meta['urlRequestRaw']
    fuwuISV = response.meta['fuwuISV']
    self.log(response.body, level=log.DEBUG)

    # Convert the response body from pseudo-JSON to well-formed JSON:
    # quote bare keys, then normalize single quotes to double quotes.
    j = re.sub(r"(,|{)(\w+):", r"\1'\2':", response.body)
    j = re.sub(r"'", r'"', j)
    decodedResponse = json.loads(striphtml(j), datetime_format="%Y-%m-%d %H:%M:%S")

    currentPage = decodedResponse['currentPage']
    pageCount = decodedResponse['pageCount']
    if currentPage == pageCount:
        return
    else:
        # Form the request for the next page of buyers.
        urlRequestBuyer = urlRequestRaw.format(page=currentPage + 1, count=pageCount)
        request = Request('http://fuwu.taobao.com' + urlRequestBuyer,
                          callback=self.parseBuyer)
        request.meta['urlRequestRaw'] = urlRequestRaw
        # The callback also reads 'fuwuISV' from meta, so pass it along
        # (the original omitted this, which would raise a KeyError on the
        # next page).
        request.meta['fuwuISV'] = fuwuISV
        yield request

    # Extract each buyer's data.
    for buyer in decodedResponse['data']:
        item = FuwuPurchaseItem()
        item['buyerNameMasked'] = buyer['nick'].strip()
        item['buyerRank'] = buyer['rateSum'].strip()
        item['purchaseTime'] = buyer['payTime']
        item['licLength'] = buyer['deadline'].strip()
        item['licVersion'] = buyer['version'].strip()
        item['fuwuISV'] = fuwuISV
        yield item
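# Illustration only (not part of the spider): how the two substitutions in
# parseBuyer normalize Taobao's pseudo-JSON. The sample payload is
# hypothetical.
import re

body = "{currentPage:1,pageCount:3,data:[{nick:'a***b'}]}"
j = re.sub(r"(,|{)(\w+):", r"\1'\2':", body)   # quote bare keys
j = re.sub(r"'", r'"', j)                      # single -> double quotes
print(j)  # {"currentPage":1,"pageCount":3,"data":[{"nick":"a***b"}]}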
def test_default_date_format(self):
    decoded = json.loads(self.test).get('born')
    self.assertIs(type(decoded), datetime.datetime)
    self.assertEqual(decoded, self.expected)
def test_nested_dicts(self):
    test = '{"updated": {"$gte": "Thu, 1 Mar 2012 10:00:49 UTC"}}'
    decoded = json.loads(test).get('updated').get('$gte')
    self.assertIs(type(decoded), datetime.datetime)
    self.assertEqual(decoded, self.expected)
def test_float_value(self):
    decoded = json.loads('{"key": "2.5"}')
    self.assertEqual(decoded.get('key'), "2.5")
def test_no_dates(self):
    test = '{"name": "John Doe"}'
    try:
        json.loads(test)
    except Exception as e:
        self.fail("Unexpected failure: %s" % e)
def test_numeric_value(self):
    decoded = json.loads('{"key": "2"}')
    self.assertEqual(decoded.get('key'), "2")
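# A minimal sketch, not the library's actual implementation, of the
# datetime-aware json.loads these tests exercise. Helper names and the
# hook wiring are assumptions; the behavior (date-looking strings become
# datetimes, numeric strings pass through untouched, a user object_hook
# still runs) follows the assertions above. Assumes python-dateutil.
import datetime
import json as _json

from dateutil import parser as dateparser


def loads(s, datetime_format=None, object_hook=None, **kwargs):
    def hook(dct):
        for key, value in dct.items():
            if isinstance(value, str):
                try:
                    float(value)          # "2" / "2.5" stay strings
                    continue
                except ValueError:
                    pass
                try:
                    if datetime_format:
                        dct[key] = datetime.datetime.strptime(value, datetime_format)
                    else:
                        dct[key] = dateparser.parse(value)
                except (ValueError, OverflowError):
                    pass                  # not a date: leave it alone
        return object_hook(dct) if object_hook else dct
    return _json.loads(s, object_hook=hook, **kwargs)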
def json_renderer(**data):
    return jsonify(loads(dumps(data)))
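# Note on json_renderer: the dumps/loads round trip presumably normalizes
# values that plain Flask jsonify cannot serialize directly (for example
# datetimes or ObjectIds handled by the custom encoder behind dumps), so
# the view always emits JSON-native types. This reading is an inference
# from the code above, not documented behavior.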
def train_model(db, cluster, test_date):
    # Getting the clusters data
    collection = db[settings.get('Potential_CFD', 'proj_cluster')]
    cursor = collection.find({})
    clusters = pd.DataFrame(list(cursor))
    project_clusters = []
    cluster_status = True
    groups = clusters.groupby('Cluster')
    for name, group in groups:
        project_clusters.append(list(group['Project']))
    # Equivalent mongo shell aggregation:
    # db.Project_Clusters.aggregate([{$group: {"_id": {"Cluster": "$Cluster"},
    #                                          "Projects": {$push: "$Project"}}}])

    p = 0
    if cluster != "all":
        cluster_status = False
        clusters = cluster.split(',')
        req_cluster = []
        for i in range(0, len(clusters)):
            req_cluster.append(project_clusters[int(clusters[i]) - 1])
        project_clusters = req_cluster
        print(project_clusters)

    # TODO: manage cluster IDs explicitly; the running counter p is not
    # enough once specific clusters are requested.
    # Running the 3 models at cluster level
    for cluster in project_clusters:
        if not cluster_status:
            p = int(clusters[0])
            clusters = clusters[1:]
        else:
            p = p + 1
        print('Running on cluster ', p)
        print(cluster)

        # Fetching the data for each project in the cluster (capped at
        # 300k documents per project)
        df = pd.DataFrame()
        cluster = cluster[:4]
        for proj in cluster:
            collection = db[settings.get('Potential_CFD', 'trainPrefix') + proj.replace('.', '_')]
            cursor = collection.find({})
            if cursor.count() > 300000:
                cursor = collection.find().limit(300000)
            print(proj)
            df2 = pd.DataFrame(list(cursor))
            df = df.append(df2)

        df = df[df['LIFECYCLE_STATE_CODE'].isin(['C', 'J', 'U', 'D', 'M', 'R', 'V'])]
        print(df['PROJECT'].unique())

        # Test and train split on the submission date
        test_df = df[df['SUBMITTED_DATE'] >= str(test_date)]
        train_df = df

        # Downsample the majority class to a 3:1 ratio against the minority
        majority = train_df[train_df["IFD_CFD_INDIC"] == 0]
        minority = train_df[train_df["IFD_CFD_INDIC"] == 1]
        majority = majority.sample(n=len(minority) * 3)
        train_df = majority
        train_df = train_df.append(minority)
        print(train_df.shape)
        print(test_df.shape)
        del [[df, majority, minority]]

        # The set of columns from which the model has to choose the best features
        feature_columns_to_use = [
            'DE_MANAGER_USERID', 'SEVERITY_CODE', 'PROJECT', 'PRODUCT',
            'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'AGE', 'FEATURE',
            'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC', 'CR_ATTACHMENT_INDIC',
            'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE',
            'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC',
            'SS_INDIC', 'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC',
            'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC',
            'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC',
            'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC',
            'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC',
            'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC',
            'S12RD_INDIC', 'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC',
            'S123_INDIC', 'S12_INDIC', 'RNE_INDIC', 'UPDATED_BY',
            'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY',
            'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC',
            'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC',
            'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC',
            'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC',
            'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC',
            'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC',
            'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC',
            'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC',
            'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC',
            'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG',
            'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC',
            'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG'
        ]
        nonnumeric_columns = [
            'DE_MANAGER_USERID', 'PROJECT', 'PRODUCT', 'COMPONENT',
            'ENGINEER', 'SUBMITTER_ID', 'FEATURE', 'RELEASE_NOTE', 'IMPACT',
            'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'INCOMING_INDIC',
            'BACKLOG_INDIC', 'DISPOSED_INDIC', 'UPDATED_BY',
            'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY',
            'TEST_EDP_PHASE', 'BADCODEFLAG', 'RISK_OWNER', 'SIR',
            'PSIRT_FLAG', 'REGRESSION_BUG_FLAG'
        ]

        # Data imputation and label-encoding of the non-numeric columns
        big_X = train_df[feature_columns_to_use].append(test_df[feature_columns_to_use])
        big_X = big_X.replace(np.nan, '', regex=True)
        big_X_imputed = DataFrameImputer().fit_transform(big_X.iloc[:, :])
        le = LabelEncoder()
        for feature in nonnumeric_columns:
            big_X_imputed[feature] = big_X_imputed[feature].astype(str)
        for feature in nonnumeric_columns:
            big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])
        train_X = big_X_imputed[0:train_df.shape[0]].as_matrix()
        test_X = big_X_imputed[train_df.shape[0]:].as_matrix()
        train_y = train_df['IFD_CFD_INDIC']
        train_X[train_X == ''] = 0
        test_X[test_X == ''] = 0

        # Running the model and hypertuning to find the best features
        feature_indices = xgb_training(train_X, train_y, test_X, True, p)
        new_features_list = []
        for i in feature_indices:
            new_features_list.append(feature_columns_to_use[i])

        # Dumping the selected feature indices to a file
        thefile = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + '/' + \
            settings.get('Potential_CFD', 'potCFD_features') + str(p) + '.txt'
        with open(thefile, 'wb') as fp:
            pickle.dump(feature_indices, fp, protocol=2)

        # Dumping the selected feature names into a collection
        features_df = pd.DataFrame(columns=['clusterId', 'features_list', 'date'])
        features_df.loc[0] = [
            p, ",".join(new_features_list),
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        ]
        coll = 'PotCFD_Features'
        records = json2.loads(features_df.T.to_json(date_format='iso')).values()
        db[coll].insert(records)

        # Restrict to the selected features and rebuild the matrices
        big_X_imputed = big_X_imputed.iloc[:, feature_indices]
        train_X = big_X_imputed[0:train_df.shape[0]].as_matrix()
        test_X = big_X_imputed[train_df.shape[0]:].as_matrix()
        train_y = train_df['IFD_CFD_INDIC']
        train_X[train_X == ''] = 0
        test_X[test_X == ''] = 0

        # Running the model
        test_probs, train_probs = xgb_training(train_X, train_y, test_X, False, p)
        print("Model 1 ran")
        test_df["Prediction"] = test_probs
        train_df["Prediction"] = train_probs

        ############################ SECOND MODEL - TEXT ############################
        train_data = train_df[["ENCL-Description", "Headline", "ATTRIBUTE", "IFD_CFD_INDIC"]]
        test_data = test_df[["ENCL-Description", "Headline", "ATTRIBUTE", "IFD_CFD_INDIC"]]
        train_data['ATTRIBUTE'] = train_data["ATTRIBUTE"].replace(np.nan, ' ')
        test_data['ATTRIBUTE'] = test_data["ATTRIBUTE"].replace(np.nan, ' ')
        train_data['Headline'] = train_data["Headline"].replace(np.nan, ' ')
        test_data['Headline'] = test_data["Headline"].replace(np.nan, ' ')
        train_data['ENCL-Description'] = train_data["ENCL-Description"].replace(np.nan, ' ')
        test_data['ENCL-Description'] = test_data["ENCL-Description"].replace(np.nan, ' ')

        # Compiling all the text data into a single column
        train_data["complete"] = train_data["ENCL-Description"].astype(str) + " " + \
            train_data["Headline"].astype(str) + " " + train_data["ATTRIBUTE"].astype(str)
        test_data["complete"] = test_data["ENCL-Description"].astype(str) + " " + \
            test_data["Headline"].astype(str) + " " + test_data["ATTRIBUTE"].astype(str)

        # Building the vocabulary: stem every kept token and count frequencies
        corpus_words = []
        unique_words = []
        stemmer = SnowballStemmer("english")
        for text in train_data["complete"]:
            for word in nltk.word_tokenize(text):
                case = get_word(word.lower())  # filters punctuation, markup, pure digits
                if case:
                    stemmed_word = stemmer.stem(word.lower())
                    corpus_words.append(stemmed_word)
                    if stemmed_word not in unique_words:
                        unique_words.append(stemmed_word)

        # Keep the num_words most frequent stems as the vocabulary
        fdist1 = FreqDist(corpus_words)
        num_words = 5000
        top_words_freq = fdist1.most_common(num_words)
        top_words = []
        for word in top_words_freq:
            top_words.append(word[0])
        thefile = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + \
            '/top_words_cluster_' + str(p) + '.txt'
        with open(thefile, 'wb') as fp:
            pickle.dump(top_words, fp, protocol=2)

        # Assign each vocabulary word a 1-based index (0 is reserved for
        # out-of-vocabulary tokens) and persist the mapping
        indexes = {}
        i = 1
        for word in top_words:
            indexes[word] = i
            i = i + 1
        f = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + \
            '/indexes_cluster_' + str(p) + '.json'
        f = open(f, 'w')
        json1 = json.dumps(indexes, indent=4)
        f.write(json1)
        f.close()

        # Mapping the tokens to vocabulary indices for the train dataset
        training_data = []
        for text in train_data["complete"]:
            text_list = []
            for word in nltk.word_tokenize(text):
                case = get_word(word)
                if case:
                    stemmed_word = stemmer.stem(word.lower())
                    if stemmed_word not in top_words:
                        text_list.append(0)
                    else:
                        text_list.append(indexes[stemmed_word])
            training_data.append(text_list)
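# A sketch of how the index sequences built in train_model are presumably
# consumed by "model 2": padded to a fixed length and fed to a CNN-LSTM.
# The architecture mirrors the one rebuilt in bugDuplicate_view_set below;
# the toy data and hyperparameters here are illustrative only, not the
# trained configuration.
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

vocab_size = 5000                   # num_words in train_model
max_text_length = 150               # matches pad_sequences(..., maxlen=150) at test time
toy_sequences = [[12, 7, 0, 431], [5, 5000, 2]]   # word-index lists, 0 = OOV
toy_labels = np.array([0, 1])

X = sequence.pad_sequences(toy_sequences, maxlen=max_text_length)
model = Sequential()
model.add(Embedding(vocab_size + 1, 32, input_length=max_text_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, toy_labels, epochs=1, batch_size=2)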
def main():
    options = parse_options()
    if options.env == "Prod":
        key = "csap_prod_database"
    else:
        key = "csap_stage_database"
    db = get_db(settings, key)

    if options.train == "True":
        coll_name = "BugDupsTrainSet_" + str(options.cluster)
        collection = db[coll_name]
        print(collection)
        df = load_data(db, collection, True)
        print(df)
        vocabulary, w2vmodel, train_df = build_data_text(df, int(options.cluster), db)
        model = build_model_text(vocabulary, w2vmodel, 500, train_df, int(options.cluster))
    else:
        coll_name = "BugDupsTrainSet_" + str(options.cluster)
        collection = db[coll_name]
        print(collection)
        df = load_data(db, collection, False)
        cluster_id = int(options.cluster)
        print(cluster_id)

        X_test, Y_test = build_data_cat(df, False)
        cat_predictions = test_model_cat(X_test, Y_test, cluster_id)
        vocabulary, w2vmodel, words, test_df = build_test_data_text(df, cluster_id, db)
        text_predictions = test_model_text(vocabulary, w2vmodel, words, test_df, cluster_id)
        print(text_predictions, cat_predictions)

        # Collect both models' scores alongside the ground truth
        d = pd.DataFrame()
        d['IDENTIFIER'] = df['IDENTIFIER']
        d['DUPLICATE_OF'] = df['DUPLICATE_OF']
        d['pred_cat'] = [i[0] for i in cat_predictions]
        d['pred_text'] = [i[0] for i in text_predictions]
        d['actual'] = df['is_duplicate']
        print(d)

        res_coll_name = 'BugDupsTestSet_' + str(options.cluster) + '_description_results'
        collection = db[res_coll_name]
        records = json2.loads(d.T.to_json(date_format='iso')).values()
        collection.create_index([("IDENTIFIER", pymongo.ASCENDING),
                                 ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
        print(collection.index_information())
        collection.insert(records)
def test_view(db, view_id, query_id, bu_id, view, options):
    # Getting the clusters data
    collection = db[settings.get('Potential_CFD', 'proj_cluster')]
    cursor = collection.find({})
    clusters = pd.DataFrame(list(cursor))
    project_clusters = []
    groups = clusters.groupby('Cluster')
    for name, group in groups:
        project_clusters.append(list(group['Project']))
    print(project_clusters)

    # Fetch the data from the respective collection
    if view:
        vi_col_name = settings.get('Potential_CFD', 'viewPrefix') + str(view_id) + '_' + str(query_id)
        tr_col_name = settings.get('Potential_CFD', 'trainPrefix')
    else:
        vi_col_name = settings.get('Potential_CFD', 'viewPrefix') + str(bu_id) + '_' + str(query_id)
        tr_col_name = settings.get('Potential_CFD', 'trainPrefix')
    collection = db[vi_col_name]
    print(vi_col_name)
    cursor = collection.find({})
    test_df = pd.DataFrame(list(cursor))
    print(test_df['PROJECT'].value_counts())

    # Get the cluster number if it exists, else create a new cluster
    req_cluster = list(test_df['PROJECT'].unique())
    status = False
    print(req_cluster)
    for a in ['CSC.sys-doc', 'CSC.autons', 'CSC.asics', 'CSC.hw',
              'CSC.general', 'CSC.voice']:
        if a in req_cluster:
            req_cluster.remove(a)
    print(req_cluster)
    if req_cluster in project_clusters:
        status = True
    p = 0
    cluster_id = 0
    f_c = []
    for cluster in project_clusters:
        p = p + 1
        if set(req_cluster).issubset(cluster):
            cluster_id = p
            f_c = cluster
            status = True
    te_col_name = settings.get('Potential_CFD', 'testPrefix') + str(cluster_id)
    print(cluster_id)
    print(status)

    if status:
        # Fetching the cut-off: pick the threshold where tpr is closest to 1 - fpr
        collection = db[settings.get('Potential_CFD', 'testPrefix') + str(cluster_id)]
        cursor = collection.find({})
        df = pd.DataFrame(list(cursor))
        fpr, tpr, thresholds = roc_curve(df['IFD_CFD_INDIC'], df['Final_prediction'])
        roc_auc = auc(fpr, tpr)
        i = np.arange(len(tpr))
        roc = pd.DataFrame({'fpr': pd.Series(fpr, index=i),
                            'tpr': pd.Series(tpr, index=i),
                            '1-fpr': pd.Series(1 - fpr, index=i),
                            'tf': pd.Series(tpr - (1 - fpr), index=i),
                            'thresholds': pd.Series(thresholds, index=i)})
        r = roc.ix[(roc.tf - 0).abs().argsort()[:1]]
        cut_off = list(r['thresholds'])[0] / 100
        print(cut_off)
        if options.cutOff != "":
            cut_off = int(options.cutOff) / 100
        print(cut_off)
        del [df]

        # Get all the saved model paths
        model1 = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + \
            '/cluster' + str(cluster_id) + '_' + str(settings.get("Potential_CFD", "xgboost_model"))
        model2 = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + \
            '/cluster' + str(cluster_id) + '_' + str(settings.get("Potential_CFD", "cnn_lstm_model"))
        model3 = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + \
            '/' + str(settings.get("Potential_CFD", "dnn_model")) + '_cluster' + str(cluster_id)

        feature_columns_to_use = [
            'DE_MANAGER_USERID', 'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE',
            'PROJECT', 'PRODUCT', 'COMPONENT', 'ENGINEER', 'SUBMITTER_ID',
            'AGE', 'FEATURE', 'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC',
            'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN',
            'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC',
            'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC',
            'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC',
            'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC',
            'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC',
            'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC',
            'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC',
            'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC',
            'S12RD_INDIC', 'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC',
            'S123_INDIC', 'S12_INDIC', 'RNE_INDIC', 'UPDATED_BY',
            'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY',
            'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC',
            'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC',
            'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC',
            'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC',
            'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC',
            'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC',
            'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC',
            'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC',
            'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC',
            'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG',
            'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC',
            'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG'
        ]
        nonnumeric_columns = [
            'DE_MANAGER_USERID', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT',
            'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'FEATURE',
            'RELEASE_NOTE', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE',
            'TICKETS_COUNT', 'INCOMING_INDIC', 'BACKLOG_INDIC',
            'DISPOSED_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY',
            'RELEASED_CODE', 'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE',
            'BADCODEFLAG', 'RISK_OWNER', 'SIR', 'PSIRT_FLAG',
            'REGRESSION_BUG_FLAG'
        ]

        # Imputation and label encoding, then restrict to the saved features
        big_X = test_df[feature_columns_to_use]
        big_X = big_X.replace(np.nan, '', regex=True)
        big_X_imputed = DataFrameImputer().fit_transform(big_X)
        le = LabelEncoder()
        big_X_imputed["COMPONENT"] = big_X_imputed["COMPONENT"].astype(str)
        big_X_imputed["PRODUCT"] = big_X_imputed["PRODUCT"].astype(str)
        for feature in nonnumeric_columns:
            big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])
        thefile = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + '/' + \
            settings.get('Potential_CFD', 'potCFD_features') + str(cluster_id) + '.txt'
        with open(thefile, 'rb') as fp:
            feature_indices = pickle.load(fp)
        big_X_imputed = big_X_imputed.iloc[:, feature_indices]
        test_X = big_X_imputed.as_matrix()

        with open(model1, 'rb') as f:
            clf = pickle.load(f)
        test_probs = clf.predict_proba(test_X)[:, 1]
        print("Model 1 ran")
        test_df["Prediction"] = test_probs

        ############################ SECOND MODEL ############################
        test_data = test_df[["ENCL-Description", "Headline", "ATTRIBUTE"]]
        stemmer = LancasterStemmer()
        test_data['ATTRIBUTE'] = test_data["ATTRIBUTE"].replace(np.nan, ' ')
        test_data['Headline'] = test_data["Headline"].replace(np.nan, ' ')
        test_data["complete"] = test_data["ENCL-Description"].astype(str) + \
            test_data["Headline"].astype(str) + " " + test_data["ATTRIBUTE"].astype(str)

        thefile = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + \
            '/top_words_cluster_' + str(cluster_id) + '.txt'
        with open(thefile, 'rb') as fp:
            top_words = pickle.load(fp)
        f = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + \
            '/indexes_cluster_' + str(cluster_id) + '.json'
        indexes = json.load(open(f, 'r'))

        # Tokens to discard before stemming
        chars = ["?", "'s", ">", "<", ",", ":", "'", "''", "--", "`", "``",
                 "...", "", "!", "#", '"', '$', '%', '&', '(', ')', '*', '+',
                 '-', '.', '/', ';', '=', '@', '[', '\\', ']', '^', '_',
                 '{', '}', '|', '~', '\t', '\n']
        testing_data = []
        for text in test_data["complete"]:
            text_list = []
            if not pd.isnull(text):
                for word in nltk.word_tokenize(text):
                    w = word.lower()
                    if (w not in chars and '*' not in w and '=' not in w
                            and '++' not in w and '___' not in w
                            and not word.isdigit() and w not in stop_words
                            and len(word) > 1):
                        stemmed_word = stemmer.stem(w)
                        if stemmed_word not in top_words:
                            text_list.append(0)
                        else:
                            text_list.append(indexes[stemmed_word])
            testing_data.append(text_list)

        max_text_length = 150
        X_test = sequence.pad_sequences(testing_data, maxlen=max_text_length)
        model = load_model(model2)
        prediction = model.predict(X_test)
        print("Model 2 ran")
        test_df["test_pred"] = prediction
        test_df["Final_prediction"] = stacking_test(test_df, cluster_id)

        ############################## Model 3 ##############################
        print("Starting model 3")
        test_df1 = test_df[test_df['Final_prediction'] >= cut_off]
        print(test_df1)
        if test_df1.shape[0] > 0:
            test_df1['month_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.month
            test_df1['year_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.year
            test_df1['COMPONENT'] = test_df1['COMPONENT'].astype(str)
            test_df1['PRODUCT'] = test_df1['PRODUCT'].astype(str)
            test_df1['SEVERITY_CODE'] = test_df1['SEVERITY_CODE'].astype(str)
            test_df1['SS_INDIC'] = test_df1['SS_INDIC'].astype(str)
            test_df1['TS_INDIC'] = test_df1['TS_INDIC'].astype(str)

            thefile = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")) + '/' + \
                settings.get('Potential_CFD', 'potCFD_features') + 'dnn_' + str(cluster_id) + '.txt'
            with open(thefile, 'rb') as fp:
                new_feature_columns_to_use = pickle.load(fp)
            feature_columns_to_use = new_feature_columns_to_use + ['month_created', 'year_created']
            categorical_features = new_feature_columns_to_use
            continuous_features = ['month_created', 'year_created']
            for feature in categorical_features:
                test_df1[feature] = test_df1[feature].astype(str)
            new_test_df = test_df1[feature_columns_to_use]

            # Wrap the continuous columns and the hashed, embedded categoricals
            engineered_features = []
            for continuous_feature in continuous_features:
                engineered_features.append(
                    tf.contrib.layers.real_valued_column(continuous_feature))
            for categorical_feature in categorical_features:
                sparse_column = tf.contrib.layers.sparse_column_with_hash_bucket(
                    categorical_feature, hash_bucket_size=1000)
                engineered_features.append(
                    tf.contrib.layers.embedding_column(
                        sparse_id_column=sparse_column, dimension=16, combiner="sum"))
            regressor2 = tf.contrib.learn.DNNRegressor(
                feature_columns=engineered_features,
                hidden_units=[64, 32, 10],
                model_dir=model3)

            # TensorFlow input functions
            def input_fn(df, training=True):
                continuous_cols = {k: tf.constant(df[k].values)
                                   for k in continuous_features}
                categorical_cols = {
                    k: tf.SparseTensor(
                        indices=[[i, 0] for i in range(df[k].size)],
                        values=df[k].values,
                        dense_shape=[df[k].size, 1])
                    for k in categorical_features}
                feature_cols = dict(list(continuous_cols.items()) +
                                    list(categorical_cols.items()))
                if training:
                    label = tf.constant(df[LABEL_COLUMN].values)
                    return feature_cols, label
                return feature_cols

            def train_input_fn():
                return input_fn(train_df1)

            def eval_input_fn():
                return input_fn(evaluate_df)

            def test_input_fn():
                return input_fn(new_test_df, False)

            predicted_output = regressor2.predict(input_fn=test_input_fn)
            test_df1['Days_Predictions'] = list(predicted_output)
            now = datetime.datetime.now()
            # Clamp negative day predictions to zero (the original chained
            # assignment here had no effect)
            test_df1.loc[test_df1['Days_Predictions'] < 0, 'Days_Predictions'] = 0
            test_df1['days_ahead'] = (pd.to_datetime(test_df1['SUBMITTED_DATE']) - now) / \
                np.timedelta64(1, 'D') + test_df1['Days_Predictions']

            # Assemble the final result set
            test_df2 = test_df[['IDENTIFIER', 'LIFECYCLE_STATE_CODE', 'DISPOSED_INDIC',
                                'CFD_INDIC', 'AGE', 'ATTRIBUTE', 'COMPONENT',
                                'DE_MANAGER_USERID', 'ENCL-Description', 'ENGINEER',
                                'Headline', 'IMPACT', 'PRIORITY_CODE', 'PRODUCT',
                                'PROJECT', 'SS_INDIC', 'TS_INDIC', 'SEVERITY_CODE',
                                'SUBMITTED_DATE', 'SUBMITTER_ID', 'TICKETS_COUNT',
                                'VERSION_TEXT', 'IFD_CFD_INDIC', 'Prediction',
                                'test_pred', 'Final_prediction']]
            test_df3 = test_df1[['IDENTIFIER', 'Days_Predictions', 'days_ahead']]
            final_test_df = test_df2.join(test_df3.set_index('IDENTIFIER'), on='IDENTIFIER')
            final_test_df = final_test_df.drop_duplicates('IDENTIFIER')
            final_test_df['Prediction'] = final_test_df['Prediction'] * 100
            final_test_df['test_pred'] = final_test_df['test_pred'] * 100
            final_test_df['Final_prediction'] = final_test_df['Final_prediction'] * 100
            final_test_df['Cluster'] = cluster_id
            final_test_df['last_run_date'] = now.strftime("%Y-%m-%d")
            final_test_df = final_test_df[final_test_df['Final_prediction'] >= (cut_off * 100)]
            print(final_test_df.shape)
            print(test_df1.shape)

            # Inserting data into the view results collection
            if view:
                vi_col_name_results = settings.get('Potential_CFD', 'viewPrefix') + \
                    str(view_id) + '_' + str(query_id) + '_results'
            else:
                vi_col_name_results = settings.get('Potential_CFD', 'viewPrefix') + \
                    str(bu_id) + '_' + str(query_id) + '_results'
            collection = db[vi_col_name_results]
            records = json2.loads(final_test_df.T.to_json(date_format='iso')).values()
            collection.create_index([("IDENTIFIER", pymongo.ASCENDING),
                                     ("last_run_date", pymongo.ASCENDING)], unique=True)
            print(collection.index_information())
            collection.insert(records)
            print("Inserted data to results collection")

            # Inserting data into the view mapper collection
            collection = db[settings.get('Potential_CFD', 'Pot_cfd_viewCluster')]
            df = pd.DataFrame(columns=['viewSetCollectionName', 'trainedOnCollectionName',
                                       'testCollectionName', 'clusterId', 'viewId',
                                       'queryId', 'BU', 'projectList',
                                       'csap_last_run_date', 'cutoff'])
            proj_list = ",".join(f_c)
            dat = now.strftime("%Y-%m-%d")
            if view:
                df.loc[0] = [vi_col_name_results, tr_col_name, te_col_name,
                             int(cluster_id), int(view_id), int(query_id), bu_id,
                             proj_list, dat, float(cut_off * 100)]
            else:
                df.loc[0] = [vi_col_name_results, tr_col_name, te_col_name,
                             int(cluster_id), view_id, int(query_id), str(bu_id),
                             proj_list, dat, float(cut_off * 100)]
            records = json2.loads(df.T.to_json(date_format='iso')).values()
            collection.insert(records)
            print("Inserted data to View mapper collection")
        else:
            print("No predicted CFDs in this ViewSet")
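# Self-contained illustration of the cut-off rule used in test_view:
# choose the threshold where tpr is closest to 1 - fpr (the crossover of
# sensitivity and specificity). The data here is synthetic.
import numpy as np
from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
y_true = np.concatenate([np.zeros(500), np.ones(500)])
scores = np.concatenate([rng.normal(40, 15, 500), rng.normal(60, 15, 500)])

fpr, tpr, thresholds = roc_curve(y_true, scores)
best = np.argmin(np.abs(tpr - (1 - fpr)))   # index of the crossover point
cut_off = thresholds[best] / 100            # same /100 scaling as test_view
print(cut_off)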
def test_object_hook(self):
    decoded = json.loads(self.test, object_hook=self.hook)
    self.assertEqual(decoded.get('born'), self.expected)
    self.assertIn("hookjob", decoded)
def build_data_text(train_df, cluster, db):
    # Prepare the embedding vocabulary.
    vocabulary = dict()
    # '<unk>' is never used; it only reserves index 0 for the all-zeros embedding.
    inverse_vocabulary = ['<unk>']

    print("Loading the W2V")
    # Train our own word2vec model on the bug texts rather than loading a
    # pre-trained embedding file.
    sentences = []
    for dataset in [train_df]:
        for index, row in dataset.iterrows():
            if type(row['complete1']) != float:
                sentences.append(text_to_word_list(row["complete1"]))
            if type(row['complete2']) != float:
                sentences.append(text_to_word_list(row["complete2"]))
    model = Word2Vec(sentences, min_count=1, size=150)
    words = list(model.wv.vocab)

    # Save the model
    filename = '/data/ingestion/bugDuplicates/w2vmodel_' + str(cluster) + '.bin'
    model.save(filename)

    c = 0
    q1 = [0] * (train_df.shape[0])
    q2 = [0] * (train_df.shape[0])
    # Iterate over the question pairs and replace each text with its
    # word-index representation
    for dataset in [train_df]:
        for index, row in dataset.iterrows():
            for question in text_cols:
                c = c + 1
                q2n = []  # q2n -> question numbers representation
                for word in text_to_word_list(row[question]):
                    # Skip stopwords that have no embedding
                    if word in stops and word not in words:
                        continue
                    if word not in vocabulary:
                        vocabulary[word] = len(inverse_vocabulary)
                        q2n.append(len(inverse_vocabulary))
                        inverse_vocabulary.append(word)
                    else:
                        q2n.append(vocabulary[word])
                # Alternate between the two text slots of the pair
                if c % 2 != 0:
                    q1[index] = q2n
                else:
                    q2[index] = q2n
    train_df['complete1'] = q1
    train_df['complete2'] = q2

    # Write the vocabulary and inverse vocabulary to disk
    f = '/data/ingestion/bugDuplicates/vocab_model_' + str(cluster) + '.json'
    f = open(f, 'w')
    json1 = json.dumps(vocabulary, indent=4)
    f.write(json1)
    f.close()
    thefile = '/data/ingestion/bugDuplicates/inv_vocab_model_' + str(cluster) + '.json'
    with open(thefile, 'wb') as fp:
        pickle.dump(inverse_vocabulary, fp, protocol=2)

    # Store train_df in a collection
    collection = db['BugDupsTrainSet_3_complete']
    train_df.reset_index(drop=True, inplace=True)
    records = json2.loads(train_df.T.to_json(date_format='iso')).values()
    collection.create_index([("IDENTIFIER", pymongo.ASCENDING),
                             ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
    collection.insert(records)
    return vocabulary, model, train_df
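# Hypothetical helper: text_to_word_list is referenced in build_data_text
# but not defined in this excerpt. A minimal stand-in that lower-cases,
# strips punctuation, and splits on whitespace; the real preprocessing
# may differ.
import re

def text_to_word_list(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)  # drop punctuation
    return text.split()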
def build_collection(df, duplicates, org_dup_ids, non_req_list, collection):
    print("req_df starting")
    '''
    # Chunked alternative kept for reference:
    req_df = pd.DataFrame()
    j = 0
    for i in range(0, int(df.shape[0] / 100000) + 1):
        print(i)
        a = df.iloc[j: j + 1000000, ]
        b = a[a['IDENTIFIER'].isin(org_dup_ids)]
        req_df = req_df.append(b)
    '''
    req_df = df[df['IDENTIFIER'].isin(org_dup_ids)]
    print("req_df done")

    feature_columns_to_use = [
        'IDENTIFIER', 'Headline', 'ENCL-Description', 'DE_MANAGER_USERID',
        'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT',
        'COMPONENT', 'AGE', 'FEATURE', 'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC',
        'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN',
        'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC',
        'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC',
        'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC',
        'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC',
        'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC',
        'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC',
        'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC',
        'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC', 'S12RD_INDIC',
        'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC', 'S123_INDIC', 'S12_INDIC',
        'RNE_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE',
        'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC',
        'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC',
        'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC',
        'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC',
        'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC',
        'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC',
        'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC',
        'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC',
        'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC',
        'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG',
        'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC',
        'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG', 'SUBMITTED_DATE'
    ]

    ids = list(req_df['IDENTIFIER'].unique())
    waste = []
    c = 0
    print("new_duplicates starting")
    new_duplicates = duplicates[duplicates['DUPLICATE_OF'].isin(ids)]
    print("new_duplicates done")

    # Look up the feature row of each DUPLICATE_OF target, batching appends
    # to keep the intermediate frame small
    a = pd.DataFrame()
    b = pd.DataFrame()
    for i, row in new_duplicates.iterrows():
        c = c + 1
        identifier = row['DUPLICATE_OF']
        if identifier in ids:
            b = b.append(req_df[req_df['IDENTIFIER'] == identifier][feature_columns_to_use])
            if b.shape[0] >= 1000:
                a = a.append(b)
                del b
                b = pd.DataFrame()
    a = a.append(b)
    print(a.shape, new_duplicates.shape)

    new_feature_columns = [
        'DUPLICATE_OF', 'DUP_Headline', 'DUP_ENCL-Description',
        'DUP_DE_MANAGER_USERID', 'DUP_SEVERITY_CODE',
        'DUP_LIFECYCLE_STATE_CODE', 'DUP_PROJECT', 'DUP_PRODUCT',
        'DUP_COMPONENT', 'DUP_AGE', 'DUP_FEATURE', 'DUP_RELEASE_NOTE',
        'DUP_SA_ATTACHMENT_INDIC', 'DUP_CR_ATTACHMENT_INDIC',
        'DUP_UT_ATTACHMENT_INDIC', 'DUP_IMPACT', 'DUP_ORIGIN',
        'DUP_IS_CUSTOMER_VISIBLE', 'DUP_TICKETS_COUNT', 'DUP_INCOMING_INDIC',
        'DUP_BACKLOG_INDIC', 'DUP_DISPOSED_INDIC', 'DUP_TS_INDIC',
        'DUP_SS_INDIC', 'DUP_OIB_INDIC', 'DUP_STATE_ASSIGN_INDIC',
        'DUP_STATE_CLOSE_INDIC', 'DUP_STATE_DUPLICATE_INDIC',
        'DUP_STATE_FORWARD_INDIC', 'DUP_STATE_HELD_INDIC',
        'DUP_STATE_INFO_INDIC', 'DUP_STATE_JUNK_INDIC',
        'DUP_STATE_MORE_INDIC', 'DUP_STATE_NEW_INDIC',
        'DUP_STATE_OPEN_INDIC', 'DUP_STATE_POSTPONE_INDIC',
        'DUP_STATE_RESOLVE_INDIC', 'DUP_STATE_SUBMIT_INDIC',
        'DUP_STATE_UNREP_INDIC', 'DUP_STATE_VERIFY_INDIC',
        'DUP_STATE_WAIT_INDIC', 'DUP_CFR_INDIC', 'DUP_S12RD_INDIC',
        'DUP_S123RD_INDIC', 'DUP_MISSING_SS_EVAL_INDIC', 'DUP_S123_INDIC',
        'DUP_S12_INDIC', 'DUP_RNE_INDIC', 'DUP_UPDATED_BY',
        'DUP_DEV_ESCAPE_ACTIVITY', 'DUP_RELEASED_CODE',
        'DUP_TEST_EDP_ACTIVITY', 'DUP_TEST_EDP_PHASE',
        'DUP_RESOLVER_ANALYSIS_INDIC', 'DUP_SUBMITTER_ANALYSIS_INDIC',
        'DUP_EDP_ANALYSIS_INDIC', 'DUP_RETI_ANALYSIS_INDIC',
        'DUP_DESIGN_REVIEW_ESCAPE_INDIC', 'DUP_STATIC_ANALYSIS_ESCAPE_INDIC',
        'DUP_FUNC_TEST_ESCAPE_INDIC', 'DUP_SELECT_REG_ESCAPE_INDIC',
        'DUP_CODE_REVIEW_ESCAPE_INDIC', 'DUP_UNIT_TEST_ESCAPE_INDIC',
        'DUP_DEV_ESCAPE_INDIC', 'DUP_FEATURE_TEST_ESCAPE_INDIC',
        'DUP_REG_TEST_ESCAPE_INDIC', 'DUP_SYSTEM_TEST_ESCAPE_INDIC',
        'DUP_SOLUTION_TEST_ESCAPE_INDIC', 'DUP_INT_TEST_ESCAPE_INDIC',
        'DUP_GO_TEST_ESCAPE_INDIC', 'DUP_COMPLETE_ESCAPE_INDIC',
        'DUP_SR_CNT', 'DUP_PSIRT_INDIC', 'DUP_BADCODEFLAG', 'DUP_RISK_OWNER',
        'DUP_SIR', 'DUP_PSIRT_FLAG', 'DUP_URC_DISPOSED_INDIC',
        'DUP_CLOSED_DISPOSED_INDIC', 'DUP_REGRESSION_BUG_FLAG',
        'DUP_SUBMITTED_DATE'
    ]
    for i in range(0, len(new_feature_columns)):
        new_duplicates[new_feature_columns[i]] = list(a[feature_columns_to_use[i]])
    new_duplicates['is_duplicate'] = 1

    # Build the non-duplicate pairs: every fourth pair refreshes the
    # left-hand bug, the rest reuse it against new right-hand bugs;
    # appends are batched every 1000 pairs
    new_non_duplicates = pd.DataFrame()
    non_a_1 = pd.DataFrame()
    non_a_2 = pd.DataFrame()
    non_a_1_sample = pd.DataFrame()
    non_a_2_sample = pd.DataFrame()
    a = pd.DataFrame()
    for c in range(0, len(non_req_list)):
        tup_id = non_req_list[c]
        if c % 1000 == 0:
            non_a_1 = non_a_1.append(non_a_1_sample)
            non_a_2 = non_a_2.append(non_a_2_sample)
            del non_a_1_sample
            del non_a_2_sample
            non_a_1_sample = pd.DataFrame()
            non_a_2_sample = pd.DataFrame()
        if c % 4 == 0:
            a = df[df['IDENTIFIER'] == tup_id[0]][feature_columns_to_use]
            non_a_1_sample = non_a_1_sample.append(a)
            non_a_2_sample = non_a_2_sample.append(
                df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
        else:
            non_a_1_sample = non_a_1_sample.append(a)
            non_a_2_sample = non_a_2_sample.append(
                df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
    # Flush the final partial batch (the original loop dropped it)
    non_a_1 = non_a_1.append(non_a_1_sample)
    non_a_2 = non_a_2.append(non_a_2_sample)
    new_non_duplicates = non_a_1

    print(new_non_duplicates.columns)
    for i in range(0, len(new_feature_columns)):
        new_non_duplicates[new_feature_columns[i]] = list(non_a_2[feature_columns_to_use[i]])
    new_non_duplicates['is_duplicate'] = 0

    # Stack duplicates and non-duplicates and write them out
    new_duplicates_1 = new_duplicates[list(new_non_duplicates.columns)]
    new_duplicates_1 = new_duplicates_1.append(new_non_duplicates)
    new_duplicates_1.reset_index(drop=True, inplace=True)
    records = json2.loads(new_duplicates_1.T.to_json(date_format='iso')).values()
    print(new_duplicates_1)
    collection.create_index([("IDENTIFIER", pymongo.ASCENDING),
                             ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
    print(collection.index_information())
    collection.drop()
    collection.insert(records)
    print("Inserted data to results collection")
    return 0
    ]]
    test_df3 = test_df1[['IDENTIFIER', 'Ticket_Predictions', 'TICKETS_COUNT',
                         'Days_Predictions', 'days_taken']]
    final_test_df = test_df2.join(test_df3.set_index('IDENTIFIER'), on='IDENTIFIER')
    final_test_df = final_test_df.drop_duplicates('IDENTIFIER')
    final_test_df['Final_prediction'] = final_test_df['Final_prediction'] * 100
    print("Predictions completed...")

    # Writing the test data into a collection
    final_test_df.reset_index(drop=True, inplace=True)
    records = json2.loads(final_test_df.T.to_json(date_format='iso')).values()
    test_collection = settings.get('Potential_CFD', 'testPrefix') + str(p) + '_ticketCNT'
    print(db[test_collection], len(records))
    print(final_test_df.iloc[3])
    db[test_collection].drop()
    db[test_collection].insert_many(records)
    print("Inserted to collection")

'''
def get_collection_details():
    # Setting the MongoDB configurations
    hostname = settings.get('csap_prod_database', 'host')
    port = settings.get('csap_prod_database', 'dbPort')
    username = settings.get('csap_prod_database', 'user')
        a = topics_matrix[topic]
        for i in range(0, 10):
            top.append(topic)
            top_words.append(a[1][i][0])
            top_probs.append(a[1][i][1])
    topics_df['Topic_number'] = top
    topics_df['keyword'] = top_words
    topics_df['probability'] = top_probs
    topics_df['PF'] = pf
    topics_df['underlying_cause_desc'] = desc
    topics_df['underlying_cause_code'] = ccode

    sample_1 = topics_df[['keyword', 'probability', 'PF',
                          'underlying_cause_desc', 'underlying_cause_code']]
    sample_1.reset_index(drop=True, inplace=True)
    records = json2.loads(sample_1.T.to_json(date_format='iso')).values()
    db.SR_topic_keywords_cause_desc.insert(records)
    print(desc)

    tk_asa_df = topics_df
    asa_df_1 = final_df
    list_of_keywords_asa = top_words
    keyword_srcases_df = pd.DataFrame()
    for word in list_of_keywords_asa:
        new_word = lemma.lemmatize(word, 'v')
        sr_cases = []
        temp = tk_asa_df[tk_asa_df['keyword'] == new_word]
        if temp.size != 0:
            topics = temp.Topic_number.unique()
            for topic in topics:
                # Combined into a single mask; the original chained filters
                # applied a boolean index from the full frame to a subset
                temp2 = asa_df_1[(asa_df_1['topic'] == topic) &
                                 (asa_df_1['topic_probability'] > 0.87)]
def bugDuplicate_view_set(e, cluster_id, stops, max_seq_length, text_cols,
                          embedding_dim, view_id, query_id, cut_off):
    # Categorical CNN-LSTM model (architecture must match the saved weights)
    embedding_vecor_length = 32
    model1 = Sequential()
    model1.add(Embedding(3458230 + 1, embedding_vecor_length, input_length=6))
    model1.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model1.add(MaxPooling1D(pool_size=2))
    model1.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model1.add(Dense(1, activation='sigmoid'))
    model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    filename = '/data/csap_models/bugDups/model3_lstm_cnn_cat_' + str(cluster_id) + '.h5'
    model1 = load_model(filename)

    # Word2vec model, vocabularies, and embedding matrix for this cluster
    filename = '/data/csap_models/bugDups/w2vmodel_' + str(cluster_id) + '.bin'
    w2vmodel = Word2Vec.load(filename)
    f = '/data/csap_models/bugDups/vocab_model_' + str(cluster_id) + '.json'
    vocabulary = json.load(open(f, 'r'))
    thefile = '/data/csap_models/bugDups/inv_vocab_model_' + str(cluster_id) + '.json'
    with open(thefile, 'rb') as fp:
        inverse_vocabulary = pickle.load(fp)
    words = list(w2vmodel.wv.vocab)
    thefile = "/data/csap_models/bugDups/embeddings_model_" + str(cluster_id) + '.json'
    with open(thefile, 'rb') as fp:
        embeddings = pickle.load(fp)

    n_hidden = 50
    gradient_clipping_norm = 1.25
    batch_size = 64
    n_epoch = 1

    def exponent_neg_manhattan_distance(left, right):
        return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))

    left_input = Input(shape=(max_seq_length,), dtype='int32')
    right_input = Input(shape=(max_seq_length,), dtype='int32')
    embedding_layer = Embedding(len(embeddings), embedding_dim,
                                weights=[embeddings],
                                input_length=max_seq_length, trainable=False)

    # Embedded version of the inputs
    encoded_left = embedding_layer(left_input)
    encoded_right = embedding_layer(right_input)

    # Since this is a siamese network, both sides share the same LSTM
    shared_lstm = LSTM(n_hidden)
    left_output = shared_lstm(encoded_left)
    right_output = shared_lstm(encoded_right)

    # Calculates the distance as defined by the MaLSTM model
    malstm_distance = Merge(
        mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
        output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

    # Pack it all up into a model
    malstm = Model([left_input, right_input], [malstm_distance])

    # Adadelta optimizer, with gradient clipping by norm
    optimizer = Adadelta(clipnorm=gradient_clipping_norm)
    malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

    # Load the trained weights into the siamese model
    filename = '/data/csap_models/bugDups/text_model_' + str(cluster_id) + '.h5'
    malstm.load_weights(filename)

    final_df = pd.DataFrame()
    duplicate_bugs_length = sum(e['is_duplicate'] == 1)
    for j in range(0, duplicate_bugs_length):
        b1 = e.iloc[j, ]
        duplicate_of = b1['DUPLICATE_OF']
        is_duplicates = b1['is_duplicate']
        # Restrict candidates to bugs filed against the same product
        df = e[(e['PRODUCT2'] == b1['PRODUCT'])]
        print(j, duplicate_bugs_length, df.shape[0])
        df1 = get_test_data(df, b1)

        X_test = {'left': df1.complete1, 'right': df1.complete2}
        for dataset, side in itertools.product([X_test], ['left', 'right']):
            dataset[side] = pad_sequences(dataset[side], maxlen=150)

        text_predictions = malstm.predict([X_test['left'], X_test['right']])

        result = pd.DataFrame()
        result['IDENTIFIER'] = df1['IDENTIFIER']
        result['DUPLICATE_OF'] = df1['DUPLICATE_OF']
        result['Headline'] = df1['Headline']
        result['ENCL-Description'] = df1['ENCL-Description']
        p = []
        for i in text_predictions:
            p.append(i[0] * 100)
        result['pred_text'] = p

        # Keep at most the ten most probable duplicates above the cut-off
        result = result.drop_duplicates(subset='DUPLICATE_OF', keep="last")
        result = result.sort_values(['pred_text'], ascending=[0])
        result = result[result['pred_text'] > cut_off]
        v = 10 if result.shape[0] > 10 else result.shape[0]
        if v != 0:
            df2 = pd.DataFrame()
            df2['IDENTIFIER'] = [result.iloc[0, ]['IDENTIFIER']]
            df2['Headline'] = df1['Headline'].iloc[0]
            df2['ENCL-Description'] = df1['ENCL-Description'].iloc[0]
            df2['DUPLICATE_LIST'] = ' '.join(list(result.iloc[0:v]['DUPLICATE_OF']))
            df2['PROBABILITIES'] = ' '.join(str(x) for x in list(result.iloc[0:v]['pred_text']))
            if is_duplicates == 1:
                df2['actual_duplicate'] = duplicate_of
            else:
                df2['actual_duplicate'] = ""
            final_df = final_df.append(df2)

    # Persist the results
    res_coll_name = "BugDupsTestSet_" + str(view_id) + "_" + str(query_id) + "_results"
    collection = db[res_coll_name]
    final_df = final_df.drop_duplicates(subset='IDENTIFIER', keep="last")
    final_df.reset_index(drop=True, inplace=True)
    filename = res_coll_name + '.csv'
    final_df.to_csv(filename, encoding='utf-8')
    records = json2.loads(final_df.T.to_json(date_format='iso')).values()
    collection.create_index([("IDENTIFIER", pymongo.ASCENDING)], unique=True)
    collection.insert(records)
    return final_df
def build_collection(df, duplicates, org_dup_ids, non_req_list):
    # non_req_list is used below but was missing from the original
    # signature; it is now passed in from fetch_bugs_list.
    req_df = df[df['IDENTIFIER'].isin(org_dup_ids)]
    feature_columns_to_use = [
        'IDENTIFIER', 'Headline', 'ENCL-Description', 'DE_MANAGER_USERID',
        'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT',
        'COMPONENT', 'AGE', 'FEATURE', 'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC',
        'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN',
        'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC',
        'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC',
        'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC',
        'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC',
        'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC',
        'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC',
        'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC',
        'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC', 'S12RD_INDIC',
        'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC', 'S123_INDIC', 'S12_INDIC',
        'RNE_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE',
        'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC',
        'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC',
        'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC',
        'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC',
        'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC',
        'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC',
        'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC',
        'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC',
        'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC',
        'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG',
        'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC',
        'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG'
    ]

    ids = list(req_df['IDENTIFIER'].unique())
    waste = []
    c = 0
    new_duplicates = duplicates[duplicates['DUPLICATE_OF'].isin(ids)]

    # Look up the feature row of each DUPLICATE_OF target
    a = pd.DataFrame()
    for i, row in new_duplicates.iterrows():
        c = c + 1
        print(c)
        identifier = row['DUPLICATE_OF']
        if identifier in ids:
            a = a.append(req_df[req_df['IDENTIFIER'] == identifier][feature_columns_to_use])
        else:
            waste.append(c)

    new_feature_columns = [
        'DUPLICATE_OF', 'Headline2', 'ENCL-Description2',
        'DE_MANAGER_USERID2', 'SEVERITY_CODE2', 'LIFECYCLE_STATE_CODE2',
        'PROJECT2', 'PRODUCT2', 'COMPONENT2', 'AGE2', 'FEATURE2',
        'RELEASE_NOTE2', 'SA_ATTACHMENT_INDIC2', 'CR_ATTACHMENT_INDIC2',
        'UT_ATTACHMENT_INDIC2', 'IMPACT2', 'ORIGIN2', 'IS_CUSTOMER_VISIBLE2',
        'TICKETS_COUNT2', 'INCOMING_INDIC2', 'BACKLOG_INDIC2',
        'DISPOSED_INDIC2', 'TS_INDIC2', 'SS_INDIC2', 'OIB_INDIC2',
        'STATE_ASSIGN_INDIC2', 'STATE_CLOSE_INDIC2',
        'STATE_DUPLICATE_INDIC2', 'STATE_FORWARD_INDIC2',
        'STATE_HELD_INDIC2', 'STATE_INFO_INDIC2', 'STATE_JUNK_INDIC2',
        'STATE_MORE_INDIC2', 'STATE_NEW_INDIC2', 'STATE_OPEN_INDIC2',
        'STATE_POSTPONE_INDIC2', 'STATE_RESOLVE_INDIC2',
        'STATE_SUBMIT_INDIC2', 'STATE_UNREP_INDIC2', 'STATE_VERIFY_INDIC2',
        'STATE_WAIT_INDIC2', 'CFR_INDIC2', 'S12RD_INDIC2', 'S123RD_INDIC2',
        'MISSING_SS_EVAL_INDIC2', 'S123_INDIC2', 'S12_INDIC2', 'RNE_INDIC2',
        'UPDATED_BY2', 'DEV_ESCAPE_ACTIVITY2', 'RELEASED_CODE2',
        'TEST_EDP_ACTIVITY2', 'TEST_EDP_PHASE2', 'RESOLVER_ANALYSIS_INDIC2',
        'SUBMITTER_ANALYSIS_INDIC2', 'EDP_ANALYSIS_INDIC2',
        'RETI_ANALYSIS_INDIC2', 'DESIGN_REVIEW_ESCAPE_INDIC2',
        'STATIC_ANALYSIS_ESCAPE_INDIC2', 'FUNC_TEST_ESCAPE_INDIC2',
        'SELECT_REG_ESCAPE_INDIC2', 'CODE_REVIEW_ESCAPE_INDIC2',
        'UNIT_TEST_ESCAPE_INDIC2', 'DEV_ESCAPE_INDIC2',
        'FEATURE_TEST_ESCAPE_INDIC2', 'REG_TEST_ESCAPE_INDIC2',
        'SYSTEM_TEST_ESCAPE_INDIC2', 'SOLUTION_TEST_ESCAPE_INDIC2',
        'INT_TEST_ESCAPE_INDIC2', 'GO_TEST_ESCAPE_INDIC2',
        'COMPLETE_ESCAPE_INDIC2', 'SR_CNT2', 'PSIRT_INDIC2', 'BADCODEFLAG2',
        'RISK_OWNER2', 'SIR2', 'PSIRT_FLAG2', 'URC_DISPOSED_INDIC2',
        'CLOSED_DISPOSED_INDIC2', 'REGRESSION_BUG_FLAG2'
    ]
    for i in range(0, len(new_feature_columns)):
        print(i)
        new_duplicates[new_feature_columns[i]] = list(a[feature_columns_to_use[i]])
    # Headline2/ENCL-Description2 are filled by the mapping above; the
    # original also assigned them from never-populated lists, which would
    # have raised a length-mismatch error, so those assignments are gone.
    new_duplicates['is_duplicate'] = 1

    # Build the non-duplicate pairs: every fourth pair refreshes the
    # left-hand bug, the rest reuse it against new right-hand bugs
    new_non_duplicates = pd.DataFrame()
    non_a_1 = pd.DataFrame()
    non_a_2 = pd.DataFrame()
    a = pd.DataFrame()
    for c in range(0, len(non_req_list)):
        tup_id = non_req_list[c]
        if c % 4 == 0:
            a = df[df['IDENTIFIER'] == tup_id[0]][feature_columns_to_use]
            non_a_1 = non_a_1.append(a)
            non_a_2 = non_a_2.append(df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
        else:
            non_a_1 = non_a_1.append(a)
            non_a_2 = non_a_2.append(df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
        print(c)
    new_non_duplicates = non_a_1

    print(new_non_duplicates.columns)
    for i in range(0, len(new_feature_columns)):
        print(i)
        new_non_duplicates[new_feature_columns[i]] = list(non_a_2[feature_columns_to_use[i]])
    new_non_duplicates['is_duplicate'] = 0

    # Stack duplicates and non-duplicates and write them out
    new_duplicates_1 = new_duplicates[list(new_non_duplicates.columns)]
    # new_duplicates_1 = new_duplicates[['IDENTIFIER', 'DUPLICATE_OF',
    #     'Headline', 'Headline2', 'ENCL-Description', 'ENCL-Description2',
    #     'is_duplicate']]
    new_duplicates_1 = new_duplicates_1.append(new_non_duplicates)
    new_duplicates_1.reset_index(drop=True, inplace=True)
    records = json2.loads(new_duplicates_1.T.to_json(date_format='iso')).values()
    collection = db['BugDupsTrainSet_436_1452_new']
    collection.create_index([("IDENTIFIER", pymongo.ASCENDING),
                             ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
    print(collection.index_information())
    collection.insert(records)
    print("Inserted data to results collection")


def fetch_bugs_list(db):
    cluster_id = options.cluster
    # Getting the clusters data
    collection = db[settings.get('Potential_CFD', 'proj_cluster')]
    cursor = collection.find({})
    clusters = pd.DataFrame(list(cursor))
    project_clusters = []
    cluster_status = True
    groups = clusters.groupby('Cluster')
    for name, group in groups:
        project_clusters.append(list(group['Project']))
    print(project_clusters)
    cluster = project_clusters[cluster_id - 1]

    # Fetching the data for each project in the cluster
    df = pd.DataFrame()
    for proj in cluster:
        collection = db[settings.get('Potential_CFD', 'trainPrefix') + proj.replace('.', '_')]
        cursor = collection.find({})
        print(proj)
        df2 = pd.DataFrame(list(cursor))
        df = df.append(df2)
    print(df['PROJECT'].unique())

    duplicates = df[df['DUPLICATE_OF'].isnull() == False]
    non_duplicates = df[df['DUPLICATE_OF'].isnull() == True]
    print(duplicates[['IDENTIFIER', 'DUPLICATE_OF']])

    # https://stackoverflow.com/questions/48220643/add-values-to-an-existing-dataframe-from-list-of-tuples
    req_list = list(zip(duplicates['IDENTIFIER'], duplicates['DUPLICATE_OF']))
    org_dup_ids = list(duplicates['DUPLICATE_OF'].unique())
    dup_ids = list(duplicates['IDENTIFIER'])
    non_dup_ids = list(non_duplicates['IDENTIFIER'])

    # Pair every duplicate bug with two random non-duplicates as negatives
    non_req_list = []
    for id in dup_ids:
        rand_items = random.sample(non_dup_ids, 2)
        for i in rand_items:
            req_list.append((id, i))
            non_req_list.append((id, i))
    org_non_dups_ids = [j for i, j in req_list]
    build_collection(df, duplicates, org_dup_ids, non_req_list)


def main():
    options = parse_options()
    if options.env == "Prod":
        key = "csap_prod_database"
    else:
        key = "csap_stage_database"
    db = get_db(settings, key)
    fetch_bugs_list(db)