Code example #1
File: tests.py Project: CIR2000/json-datetime
 def test_date_format(self):
     test = '{"born": "Thu, 1 Mar 2012"}'
     expected = datetime.datetime(2012, 3, 1)
     format = '%a, %d %b %Y'
     decoded = json.loads(test, datetime_format=format).get('born')
     self.assertIs(type(decoded), datetime.datetime)
     self.assertEqual(decoded, expected)
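Note that the standard-library json.loads does not accept a datetime_format keyword, so json in these tests is presumably the json-datetime package itself (imported as something like import json_datetime as json). A minimal standalone sketch under that assumption:

# Hedged sketch: assumes the json-datetime package is importable as
# json_datetime and aliased to `json`, as the tests above appear to do.
import datetime
import json_datetime as json  # assumption, not shown in the snippet

decoded = json.loads('{"born": "Thu, 1 Mar 2012"}',
                     datetime_format='%a, %d %b %Y')
assert decoded['born'] == datetime.datetime(2012, 3, 1)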
Code example #3
 def test_equivalence(self, json_str, python):
     try:
         decoded = json.loads(json_str)
     except Exception as e:
         self.fail("Unexpected failire: %s" % e)
     self.assertEqual(decoded, python)
     back_again = json.dumps(python, sort_keys=True)
     self.assertEqual(back_again, json_str)
Code example #4
File: mongo.py Project: marchon/eve
    def find(self, resource, req):
        """Retrieves a set of documents matching a given request. Queries can
        be expressed in two different formats: the mongo query syntax, and the
        python syntax. The first kind of query would look like: ::

            ?where={"name": "john doe}

        while the second would look like: ::

            ?where=name=="john doe"

        The resultset is paginated.

        :param resource: resource name.
        :param req: a :class:`ParsedRequest` instance.
        """
        args = dict()

        args['limit'] = req.max_results

        if req.page > 1:
            args['skip'] = (req.page - 1) * req.max_results

        # TODO sort syntax should probably be coherent with 'where': either
        # mongo-like # or python-like. Currently accepts only mongo-like sort
        # syntax.

        # TODO should validate on unknown sort fields (mongo driver doesn't
        # return an error)
        if req.sort:
            args['sort'] = ast.literal_eval(req.sort)

        spec = dict()
        if req.where:
            try:
                spec = json.loads(req.where)
            except:
                try:
                    spec = parse(req.where)
                except ParseError:
                    abort(400)

        if req.if_modified_since:
            spec[config.LAST_UPDATED] = \
                {'$gt': req.if_modified_since}

        if len(spec) > 0:
            args['spec'] = spec

        return self.driver.db[resource].find(**args)
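The docstring above names two where formats; the body tries json.loads first and only falls back to the python-syntax parser when that fails. A self-contained sketch of that fallback, with a hypothetical parse callable standing in for the project's parser:

# Sketch of the where-clause fallback used in find() above.
# `parse` is a hypothetical stand-in for the project's python-syntax parser.
import json

def where_to_spec(where, parse):
    try:
        return json.loads(where)       # mongo syntax: ?where={"name": "john doe"}
    except ValueError:
        return parse(where)            # python syntax: ?where=name=="john doe"

spec = where_to_spec('{"name": "john doe"}', parse=lambda expr: {})
assert spec == {"name": "john doe"}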
Code example #5
    def test_object_hook(self):
        def hook(dct):
            dct["hookjob"] = "I'm hooked!"
            return dct

        decoded = json.loads(
            '{"name": "John Doe", "born": "Thu, 1 Mar 2012 10:00:49 UTC"}',
            object_hook=hook,
        )
        self.assertEqual(
            decoded.get("born"),
            datetime.datetime(2012, 3, 1, 10, 0, 49, tzinfo=tzutc()),
        )
        self.assertIn("hookjob", decoded)
Code example #6
File: TBfuwuSpider.py Project: daiqing2009/EC_fuwu
 def parseBuyer(self, response):
     #pprint.pprint(locals())             
     urlRequestRaw = response.meta['urlRequestRaw']
     fuwuISV = response.meta['fuwuISV']
     #fuwuISV['name'] = response.meta['fuwuISV.name']
     #fuwuISV['category'] = response.meta['fuwuISV.category']
     
     self.log(response.body,level = log.DEBUG)
     
     # convert the pseudo-JSON response body into formal JSON
     j = re.sub(r"(,|{)(\w+):", r"\1'\2':", response.body)
     j = re.sub(r"'", r'"', j)
     decodedResponse = json.loads(striphtml(j),datetime_format="%Y-%m-%d %H:%M:%S")
     
     currentPage = decodedResponse['currentPage']
     pageCount = decodedResponse['pageCount']
     
     if currentPage == pageCount:
         return 
     else:
        #form request, maybe try to use str.format
         urlRequestBuyer = urlRequestRaw.format(page = currentPage+1, count=pageCount)
         
         request = Request('http://fuwu.taobao.com' + urlRequestBuyer,
                      callback=self.parseBuyer) 
         request.meta['urlRequestRaw'] = urlRequestRaw
         yield request
         
         #extract buyer's data
         #items = []
         for buyer in decodedResponse['data']:
             item = FuwuPurchaseItem()                
             item['buyerNameMasked']=buyer['nick'].strip()
             item['buyerRank']=buyer['rateSum'].strip()
             item['purchaseTime'] = buyer['payTime']
             item['licLength'] = buyer['deadline'].strip()
             item['licVersion'] = buyer['version'].strip()
             item['fuwuISV']=fuwuISV
             yield item     
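The two re.sub calls above rewrite Taobao's pseudo JSON (unquoted keys, single quotes) into strict JSON before it is decoded. A small self-contained demonstration of the same rewrite on an invented payload:

# Demonstration of the pseudo-JSON cleanup used in parseBuyer above;
# the sample body is invented for illustration.
import json
import re

body = '{currentPage:1,pageCount:3,data:[{nick:"a***b"}]}'
j = re.sub(r"(,|{)(\w+):", r"\1'\2':", body)  # quote the bare keys
j = re.sub(r"'", r'"', j)                     # single -> double quotes
decoded = json.loads(j)
assert decoded['currentPage'] == 1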
Code example #7
File: tests.py Project: CIR2000/json-datetime
 def test_default_date_format(self):
     decoded = json.loads(self.test).get('born')
     self.assertIs(type(decoded), datetime.datetime)
     self.assertEqual(decoded, self.expected)
Code example #8
 def test_nested_dicts(self):
     test = '{"updated": {"$gte": "Thu, 1 Mar 2012 10:00:49 UTC"}}'
     decoded = json.loads(test).get('updated').get('$gte')
     self.assertIs(type(decoded), datetime.datetime)
     self.assertEqual(decoded, self.expected)
Code example #9
 def test_float_value(self):
     decoded = json.loads('{"key": "2.5"}')
     self.assertEqual(decoded.get('key'), "2.5")
Code example #10
File: tests.py Project: CIR2000/json-datetime
 def test_no_dates(self):
     test = '{"name": "John Doe"}'
     try:
         json.loads(test)
     except Exception as e:
         self.fail("Unexpected failure: %s" % e)
Code example #11
 def test_numeric_value(self):
     decoded = json.loads('{"key": "2"}')
     self.assertEqual(decoded.get('key'), "2")
Code example #12
File: generic.py Project: aprimadi/gsapi
def json_renderer(**data):
    return jsonify(loads(dumps(data)))
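json_renderer round-trips the payload through dumps and loads before handing it to Flask's jsonify, presumably so that non-JSON-native types are first serialized by a JSON-aware encoder (for example bson.json_util.dumps); that is an assumption about where dumps comes from, not something shown in the snippet. A hedged sketch of the idea:

# Hedged sketch: assumes `dumps` refers to a Mongo-aware encoder such as
# bson.json_util.dumps; the real gsapi import may differ.
from json import loads
from bson import ObjectId, json_util

data = {"_id": ObjectId(), "name": "john doe"}
plain = loads(json_util.dumps(data))  # ObjectId becomes {"$oid": "..."}
print(plain["name"])                  # plain dict, safe for flask.jsonify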
Code example #13
def train_model(db, cluster, test_date):
    #Getting the clusters data
    collection = db[settings.get('Potential_CFD', 'proj_cluster')]
    cursor = collection.find({})
    clusters = pd.DataFrame(list(cursor))
    project_clusters = []
    cluster_status = True
    groups = clusters.groupby('Cluster')

    for name, group in groups:
        project_clusters.append(list(group['Project']))

    #db.Project_Clusters.aggregate([{$group :{"_id": {"Cluster":"$Cluster"} ,"Projects" : {$push: "$Project"}}}])

    p = 0
    if (cluster != "all"):
        cluster_status = False
        clusters = cluster.split(',')
        req_cluster = []
        for i in range(0, len(clusters)):
            req_cluster.append(project_clusters[int(clusters[i]) - 1])
        project_clusters = req_cluster

    print(project_clusters)
    #Problem is how to manage cluster ID ?? p won't be needed now. Something else needs to be there
    #Running the 3 models at cluster level...
    for cluster in project_clusters:
        if (not cluster_status):
            p = int(clusters[0])
            clusters = clusters[1:]
        else:
            p = p + 1
        print('Running on cluster ', p)
        print(cluster)
        df = pd.DataFrame()
        #Fetching the data for each project in the cluster
        cluster = cluster[:4]  #['CSC.datacenter']
        for proj in cluster:
            df2 = pd.DataFrame()
            collection = db[settings.get('Potential_CFD', 'trainPrefix') +
                            proj.replace('.', '_')]
            cursor = collection.find({})
            if (cursor.count() > 300000):
                cursor = collection.find().limit(300000)
            print(proj)
            df2 = pd.DataFrame(list(cursor))
            df = df.append(df2)
            #df3 = pd.read_csv('/auto/vgapps-cstg02-vapps/analytics/csap/ingestion/opfiles/potCFD/Train/180505/CSC.labtrunk/BugFinal.csv')
            #print(df3['SUBMITTED_DATE'])
            #to_datetime_fmt = partial(pd.to_datetime, format='%Y-%m-%d %H:%M:%S')
            #df3['SUBMITTED_DATE'] = df3['SUBMITTED_DATE'].apply(to_datetime_fmt)
            #print(df3['SUBMITTED_DATE'])
            #print(df['SUBMITTED_DATE'])
            #df = df.append(df3)
            #cluster = ['CSC.sys','CSC.labtrunk', 'CSC.sibu.dev']
        df = df[df['LIFECYCLE_STATE_CODE'].isin(
            ['C', 'J', 'U', 'D', 'M', 'R', 'V'])]

        print(df['PROJECT'].unique())

        #Test and train split
        #test_date = options.testdate[:4] + '-' + options.testdate[4:]
        #print(test_date)
        test_df = df[df['SUBMITTED_DATE'] >= str(test_date)]
        train_df = df
        #print(list(test_df.columns))

        majority = train_df[train_df["IFD_CFD_INDIC"] == 0]
        minority = train_df[train_df["IFD_CFD_INDIC"] == 1]
        majority = majority.sample(n=len(minority) * 3)

        train_df = majority
        train_df = train_df.append(minority)

        print(train_df.shape)
        print(test_df.shape)
        del [[df, majority, minority]]

        #These are the columns from which the model needs to choose the best features
        feature_columns_to_use = [
            'DE_MANAGER_USERID', 'SEVERITY_CODE', 'PROJECT', 'PRODUCT',
            'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'AGE', 'FEATURE',
            'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC', 'CR_ATTACHMENT_INDIC',
            'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE',
            'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC',
            'SS_INDIC', 'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC',
            'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC',
            'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC',
            'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC',
            'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC',
            'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC',
            'S12RD_INDIC', 'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC',
            'S123_INDIC', 'S12_INDIC', 'RNE_INDIC', 'UPDATED_BY',
            'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY',
            'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC',
            'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC',
            'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC',
            'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC',
            'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC',
            'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC',
            'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC',
            'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC',
            'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC',
            'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG',
            'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC',
            'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG'
        ]
        nonnumeric_columns = [
            'DE_MANAGER_USERID', 'PROJECT', 'PRODUCT', 'COMPONENT', 'ENGINEER',
            'SUBMITTER_ID', 'FEATURE', 'RELEASE_NOTE', 'IMPACT', 'ORIGIN',
            'IS_CUSTOMER_VISIBLE', 'INCOMING_INDIC', 'BACKLOG_INDIC',
            'DISPOSED_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY',
            'RELEASED_CODE', 'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE',
            'BADCODEFLAG', 'RISK_OWNER', 'SIR', 'PSIRT_FLAG',
            'REGRESSION_BUG_FLAG'
        ]

        #Data imputation
        big_X = train_df[feature_columns_to_use].append(
            test_df[feature_columns_to_use])
        big_X = big_X.replace(np.nan, '', regex=True)
        big_X_imputed = DataFrameImputer().fit_transform(big_X.iloc[:, :])

        le = LabelEncoder()

        for feature in nonnumeric_columns:
            big_X_imputed[feature] = big_X_imputed[feature].astype(str)

        for feature in nonnumeric_columns:
            big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

        train_X = big_X_imputed[0:train_df.shape[0]].as_matrix()
        test_X = big_X_imputed[train_df.shape[0]::].as_matrix()
        train_y = train_df['IFD_CFD_INDIC']
        train_X[train_X == ''] = 0
        test_X[test_X == ''] = 0

        #Running the model and hypertuning to find the best features
        feature_indices = xgb_training(train_X, train_y, test_X, True, p)

        new_features_list = []
        for i in feature_indices:
            new_features_list.append(feature_columns_to_use[i])

        #Dumping the features to a file
        thefile = str(settings.get(
            "Potential_CFD", "temp_path_mod_potCFD")) + '/' + settings.get(
                'Potential_CFD', 'potCFD_features') + str(p) + '.txt'
        with open(thefile, 'wb') as fp:
            pickle.dump(feature_indices, fp, protocol=2)

        #Dumping the features into a collection
        features_df = pd.DataFrame(
            columns=['clusterId', 'features_list', 'date'])
        features_df.loc[0] = [
            p, ",".join(new_features_list),
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        ]
        coll = 'PotCFD_Features'
        records = json2.loads(
            features_df.T.to_json(date_format='iso')).values()
        db[coll].insert(records)

        #Data Imputation
        big_X_imputed = big_X_imputed.iloc[:, feature_indices]
        train_X = big_X_imputed[0:train_df.shape[0]].as_matrix()
        test_X = big_X_imputed[train_df.shape[0]::].as_matrix()
        train_y = train_df['IFD_CFD_INDIC']
        train_X[train_X == ''] = 0
        test_X[test_X == ''] = 0

        #Running the model
        test_probs, train_probs = xgb_training(train_X, train_y, test_X, False,
                                               p)
        print("Model 1 ran")
        test_df["Prediction"] = test_probs
        train_df["Prediction"] = train_probs

        ##################################SECOND MODEL - TEXT##################################
        train_data = train_df[[
            "ENCL-Description", "Headline", "ATTRIBUTE", "IFD_CFD_INDIC"
        ]]
        test_data = test_df[[
            "ENCL-Description", "Headline", "ATTRIBUTE", "IFD_CFD_INDIC"
        ]]

        train_data['ATTRIBUTE'] = train_data["ATTRIBUTE"].replace(np.nan, ' ')
        test_data['ATTRIBUTE'] = test_data["ATTRIBUTE"].replace(np.nan, ' ')
        train_data['Headline'] = train_data["Headline"].replace(np.nan, ' ')
        test_data['Headline'] = test_data["Headline"].replace(np.nan, ' ')
        train_data['ENCL-Description'] = train_data[
            "ENCL-Description"].replace(np.nan, ' ')
        test_data['ENCL-Description'] = test_data["ENCL-Description"].replace(
            np.nan, ' ')

        #Compiling all the text data into single column
        train_data["complete"] = train_data["ENCL-Description"].astype(
            str) + " " + train_data["Headline"].astype(
                str) + " " + train_data["ATTRIBUTE"].astype(str)
        test_data["complete"] = test_data["ENCL-Description"].astype(
            str) + " " + test_data["Headline"].astype(
                str) + " " + test_data["ATTRIBUTE"].astype(str)

        top_words = 10000

        corpus_words = []
        unique_words = []
        stemmer = SnowballStemmer("english")
        i = 0

        #Building the vocabulary set
        for text in train_data["complete"]:
            #print(i)
            i = i + 1
            for word in nltk.word_tokenize(text):
                case = get_word(word.lower())
                if case:
                    stemmed_word = stemmer.stem(word.lower())
                    if (stemmed_word == 'is'):
                        print(word)
                    corpus_words.append(stemmed_word)
                    if stemmed_word not in unique_words:
                        unique_words.append(stemmed_word)

        fdist1 = FreqDist(corpus_words)
        num_words = 5000
        top_words_freq = fdist1.most_common(num_words)

        top_words = []
        for word in top_words_freq:
            top_words.append(word[0])

        thefile = str(settings.get(
            "Potential_CFD",
            "temp_path_mod_potCFD")) + '/top_words_cluster_' + str(p) + '.txt'
        with open(thefile, 'wb') as fp:
            pickle.dump(top_words, fp, protocol=2)

        indexes = {}
        i = 1
        for word in top_words:
            indexes[word] = i
            i = i + 1

        f = str(settings.get(
            "Potential_CFD",
            "temp_path_mod_potCFD")) + '/indexes_cluster_' + str(p) + '.json'
        f = open(f, 'w')
        json1 = json.dumps(indexes, indent=4)
        f.write(json1)
        f.close()

        #Mapping the vocabulary and indices for both train and test datasets
        training_data = []
        i = 0
        for text in train_data["complete"]:
            #print(i)
            i = i + 1
            text_list = []
            for word in nltk.word_tokenize(text):
                case = get_word(word)
                if case:  #word not in chars and '*' not in word and '=' not in word and '++' not in word and '___' not in word and (not word.isdigit()):
                    stemmed_word = stemmer.stem(word.lower())
                    if stemmed_word not in top_words:
                        text_list.append(0)
                    else:
                        text_list.append(indexes[stemmed_word])
            training_data.append(text_list)
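DataFrameImputer is used above but not defined in the snippet; it is presumably the familiar scikit-learn TransformerMixin that fills object columns with their most frequent value and numeric columns with their mean. A minimal sketch under that assumption:

# Hedged sketch of the DataFrameImputer assumed above: mode for object
# columns, mean otherwise. The project's real class may differ.
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series(
            [X[c].value_counts().index[0] if X[c].dtype == np.dtype('O')
             else X[c].mean() for c in X],
            index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)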
Code example #14
def main():
	options = parse_options()
	if(options.env == "Prod"):
		key = "csap_prod_database"

	else:
		key = "csap_stage_database"

	db = get_db(settings, key)

	if(options.train == "True"):
		coll_name = "BugDupsTrainSet_" + str(options.cluster) #+ "_2"
		collection = db[coll_name] #db['BugDupsTrainSet_all_3']
		print(collection)

		df = load_data(db, collection, True)
		print(df)
		
		#X_train, Y_train, X_validation, Y_validation = build_data_cat(df, True)
		#cat_model = build_model_cat(X_train, Y_train, X_validation, Y_validation, int(options.cluster))
		vocabulary, w2vmodel, train_df = build_data_text(df, int(options.cluster), db)
		model = build_model_text(vocabulary, w2vmodel, 500, train_df, int(options.cluster))

	else:
		'''
		coll_name = "BugDupsTestSet_" + str(options.viewID) + "_" + str(options.queryID)
		collection = db[coll_name] #db['BugDupsTrainSet_all_639_1968_new']
		print(collection)
		
		df = load_data(db, collection)
		cluster_id = 3 #get_cluster(db, df)
		print(cluster_id)
		'''
		coll_name = "BugDupsTrainSet_" + str(options.cluster)
		collection = db[coll_name] #db['BugDupsTrainSet_all_3']
		print(collection)

		df = load_data(db, collection, False)
		cluster_id = int(options.cluster)
		print(cluster_id)

		X_test, Y_test = build_data_cat(df, False)
		cat_predictions = test_model_cat(X_test, Y_test, cluster_id)
		vocabulary, w2vmodel, words, test_df = build_test_data_text(df, cluster_id, db)
		text_predictions = test_model_text(vocabulary, w2vmodel, words, test_df, cluster_id)
		print(text_predictions, cat_predictions)

		d = pd.DataFrame()
		d['IDENTIFIER'] = df['IDENTIFIER']
		d['DUPLICATE_OF'] = df['DUPLICATE_OF']
		p = []
		for i in cat_predictions:
			p.append(i[0])
		d['pred_cat'] = p
		p = []
		for i in text_predictions:
			p.append(i[0])
		d['pred_text'] = p

		#ensemble_predictions = stacking_test(d, cluster_id)
		#d['pred_ensemble'] = list(ensemble_predictions)

		d['actual'] = df['is_duplicate']
		print(d)
		res_coll_name = 'BugDupsTestSet_' + str(options.cluster) + '_description_results'
		collection = db[res_coll_name]
		records = json2.loads(d.T.to_json(date_format='iso')).values()

		collection.create_index([("IDENTIFIER", pymongo.ASCENDING), ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
		print(collection.index_information())
		collection.insert(records)
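Several of these snippets turn a DataFrame into MongoDB documents with json2.loads(df.T.to_json(date_format='iso')).values(): transposing first makes each row a top-level JSON object keyed by its index, so loads returns one dict per row. A small illustration with the standard json module (json2 is presumably just a JSON module alias):

# Illustration of the DataFrame-to-records pattern used above;
# the row values are made up.
import json
import pandas as pd

df = pd.DataFrame({'IDENTIFIER': ['bug-1', 'bug-2'], 'pred_text': [0.91, 0.12]})
records = list(json.loads(df.T.to_json(date_format='iso')).values())
print(records)  # [{'IDENTIFIER': 'bug-1', 'pred_text': 0.91}, {'IDENTIFIER': 'bug-2', 'pred_text': 0.12}]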
Code example #15
def test_view(db, view_id, query_id, bu_id, view, options):
    #Getting the clusters data
    collection = db[settings.get('Potential_CFD', 'proj_cluster')]
    cursor = collection.find({})
    clusters =  pd.DataFrame(list(cursor))
    project_clusters = []
    groups = clusters.groupby('Cluster')
    
    for name, group in groups:
        project_clusters.append(list(group['Project']))

    print(project_clusters)

    #Fetch the data from the respective collection
    if(view):
        vi_col_name = settings.get('Potential_CFD', 'viewPrefix') + str(view_id) + '_' + str(query_id)
        tr_col_name = settings.get('Potential_CFD', 'trainPrefix')
    
    else:
        vi_col_name = settings.get('Potential_CFD', 'viewPrefix') + str(bu_id) + '_' + str(query_id)
        tr_col_name = settings.get('Potential_CFD', 'trainPrefix')

    collection = db[vi_col_name]
    print(vi_col_name)
    cursor = collection.find({})
    test_df =  pd.DataFrame(list(cursor))
    #test_df = test_df.loc[test_df['PROJECT'].isin(['CSC.sys', 'CSC.labtrunk', 'CSC.sibu.dev'])]
    print(test_df['PROJECT'].value_counts())

    #Get the cluster number if it exists, else create new cluster
    req_cluster = list(test_df['PROJECT'].unique())
    status = False
    print(req_cluster)
    for a in ['CSC.sys-doc', 'CSC.autons', 'CSC.asics', 'CSC.hw', 'CSC.general', 'CSC.voice']:
        if a in req_cluster:
            req_cluster.remove(a)

    print(req_cluster)
    if req_cluster in project_clusters:
        status = True

    p = 0
    cluster_id = 0
    f_c = []
    for cluster in project_clusters:
        p = p + 1
        if set(req_cluster).issubset(cluster):
            cluster_id = p
            f_c = cluster
            status = True

    te_col_name = settings.get('Potential_CFD', 'testPrefix') + str(cluster_id)
    #status = True
    #cluster_id = 4
    print(cluster_id)
    print(status)
    
    if(status == True):
        #Fetching the cut_off
        collection = db[settings.get('Potential_CFD', 'testPrefix') + str(cluster_id)]
        cursor = collection.find({})
        df =  pd.DataFrame(list(cursor))
        fpr, tpr, thresholds = roc_curve(df['IFD_CFD_INDIC'], df['Final_prediction'])
        roc_auc = auc(fpr, tpr)
        i = np.arange(len(tpr))
        roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(thresholds, index = i)})
        r = roc.ix[(roc.tf-0).abs().argsort()[:1]]
        cut_off = list(r['thresholds'])[0]/100
        print(cut_off)
        if(options.cutOff != ""):
            cut_off = int(options.cutOff)/100
        #cut_off = 0.5
        print(cut_off)
        del[df]

        #Get all the saved model paths
        model1 = str(settings.get("Potential_CFD","temp_path_mod_potCFD")) + '/cluster' +str(cluster_id) + '_' + str(settings.get("Potential_CFD","xgboost_model"))
        model2 = str(settings.get("Potential_CFD","temp_path_mod_potCFD")) +'/cluster' +str(cluster_id) + '_' + str(settings.get("Potential_CFD","cnn_lstm_model"))
        model3 = str(settings.get("Potential_CFD","temp_path_mod_potCFD")) + '/' + str(settings.get("Potential_CFD","dnn_model")) + '_cluster' + str(cluster_id)

        feature_columns_to_use = ['DE_MANAGER_USERID', 'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT', 'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'AGE',  'FEATURE', 'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC', 'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC', 'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC', 'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC', 'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC', 'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC', 'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC', 'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC', 'S12RD_INDIC', 'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC', 'S123_INDIC', 'S12_INDIC', 'RNE_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY',  'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC', 'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC', 'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC', 'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC', 'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC', 'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC', 'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC', 'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC', 'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC', 'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC',  'BADCODEFLAG',   'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC', 'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG']
        nonnumeric_columns = ['DE_MANAGER_USERID', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT', 'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'FEATURE', 'RELEASE_NOTE', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'BADCODEFLAG',  'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'REGRESSION_BUG_FLAG']

        big_X = test_df[feature_columns_to_use]
        big_X = big_X.replace(np.nan, '', regex=True)
        big_X_imputed = DataFrameImputer().fit_transform(big_X)

        le = LabelEncoder()
        big_X_imputed["COMPONENT"] = big_X_imputed["COMPONENT"].astype(str)
        big_X_imputed["PRODUCT"] = big_X_imputed["PRODUCT"].astype(str)
        
        for feature in nonnumeric_columns:
            big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

        thefile = str(settings.get("Potential_CFD","temp_path_mod_potCFD")) +'/'+ settings.get('Potential_CFD', 'potCFD_features')+str(cluster_id)+'.txt'
        with open (thefile, 'rb') as fp:
            feature_indices = pickle.load(fp)

        big_X_imputed = big_X_imputed.iloc[:, feature_indices]
        test_X = big_X_imputed.as_matrix()
        
        with open(model1, 'rb') as f:
            clf = pickle.load(f)

        test_probs = clf.predict_proba(test_X)[:,1]
        print("Model 1 ran")
        test_df["Prediction"]= test_probs
        
        ##################################SECOND MODEL################################
        
        top_words = 10000
        test_data = test_df[["ENCL-Description", "Headline", "ATTRIBUTE"]]
        stemmer = LancasterStemmer()
        i = 0
        
        test_data['ATTRIBUTE']=test_data["ATTRIBUTE"].replace(np.nan, ' ')
        test_data['Headline']=test_data["Headline"].replace(np.nan, ' ')
        test_data["complete"] = test_data["ENCL-Description"].astype(str) + test_data["Headline"].astype(str)+ " "+ test_data["ATTRIBUTE"].astype(str)

        thefile = str(settings.get("Potential_CFD","temp_path_mod_potCFD")) + '/top_words_cluster_' +str(cluster_id)+'.txt'
        with open (thefile, 'rb') as fp:
            top_words = pickle.load(fp)

        f = str(settings.get("Potential_CFD","temp_path_mod_potCFD")) + '/indexes_cluster_' +str(cluster_id)+'.json'
        indexes = json.load(open(f, 'r'))

        testing_data = []
        i = 0
        for text in test_data["complete"]:
            #print(i)
            i = i + 1
            text_list = []
            if(not(pd.isnull(text))):
                for word in nltk.word_tokenize(text):
                    if word.lower() not in ["?", "'s", ">", "<", ",", ":", "'", "''", "--", "`", "``", "...", "", "!", "#", '"', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', ';', '=', '@', '[', '\\', ']', '^', '_', '{', '}', '|', '~', '\t', '\n',''] and '*' not in word.lower() and '=' not in word.lower() and '++' not in word.lower() and '___' not in word.lower() and (not word.isdigit()) and word.lower() not in stop_words and (len(word) >1):
                        stemmed_word = stemmer.stem(word.lower())
                        if stemmed_word not in top_words:
                            text_list.append(0)
                        else:
                            text_list.append(indexes[stemmed_word])
                testing_data.append(text_list)

        max_text_length = 150
        X_test = sequence.pad_sequences(testing_data, maxlen=max_text_length)

        model = load_model(model2)
        prediction = model.predict(X_test)
        print("Model 2 ran")
        test_df["test_pred"] = prediction
        test_df["Final_prediction"] = stacking_test(test_df, cluster_id)
        
        ##############################Model3##############################
        
        print("Starting model 3")
        test_df1 = test_df[test_df['Final_prediction'] >= cut_off]
        print(test_df1)        
        if(test_df1.shape[0] > 0):
            test_df1['month_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.month
            test_df1['year_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.year

            test_df1['COMPONENT'] = test_df1['COMPONENT'].astype(str)
            test_df1['PRODUCT'] = test_df1['PRODUCT'].astype(str)
            test_df1['SEVERITY_CODE'] = test_df1['SEVERITY_CODE'].astype(str)
            test_df1['SS_INDIC'] = test_df1['SS_INDIC'].astype(str)
            test_df1['TS_INDIC'] = test_df1['TS_INDIC'].astype(str)

            thefile = str(settings.get("Potential_CFD","temp_path_mod_potCFD")) +'/'+ settings.get('Potential_CFD', 'potCFD_features')+'dnn_'+str(cluster_id)+'.txt'
            with open (thefile, 'rb') as fp:
                new_feature_columns_to_use = pickle.load(fp)

            feature_columns_to_use = new_feature_columns_to_use + ['month_created', 'year_created']
            categorical_features = new_feature_columns_to_use
            continuous_features = ['month_created', 'year_created']

            for feature in categorical_features:
                test_df1[feature] = test_df1[feature].astype(str)

            new_test_df = test_df1[feature_columns_to_use]
            
            engineered_features = []
            for continuous_feature in continuous_features:
                engineered_features.append(tf.contrib.layers.real_valued_column(continuous_feature))

            for categorical_feature in categorical_features:
                sparse_column = tf.contrib.layers.sparse_column_with_hash_bucket(categorical_feature, hash_bucket_size=1000)
                engineered_features.append(tf.contrib.layers.embedding_column(sparse_id_column=sparse_column, dimension=16,combiner="sum"))

            regressor2 = tf.contrib.learn.DNNRegressor(feature_columns=engineered_features, hidden_units=[64, 32, 10], model_dir=model3)

            #TensorFlow input functions for Text Analysis
            def input_fn(df, training = True):
                continuous_cols = {k: tf.constant(df[k].values) for k in continuous_features}
                categorical_cols = {k: tf.SparseTensor(indices=[[i, 0] for i in range(df[k].size)],values=df[k].values,dense_shape=[df[k].size, 1]) for k in categorical_features}
                feature_cols = dict(list(continuous_cols.items()) + list(categorical_cols.items()))
                if training:
                    label = tf.constant(df[LABEL_COLUMN].values)
                    return feature_cols, label
                
                return feature_cols

            def train_input_fn():
                return input_fn(train_df1)

            def eval_input_fn():
                return input_fn(evaluate_df)

            def test_input_fn():
                return input_fn(new_test_df, False)
            
            predicted_output = regressor2.predict(input_fn=test_input_fn)#input_fn(new_test_df, False))
            test_df1['Days_Predictions'] = list(predicted_output)

            now = datetime.datetime.now()
            test_df1[test_df1['Days_Predictions'] < 0]['Days_Predictions'] = 0
            test_df1['days_ahead'] = (pd.to_datetime(test_df1['SUBMITTED_DATE']) - now)/np.timedelta64(1, 'D') + test_df1['Days_Predictions']

            test_df2 = test_df[['IDENTIFIER', 'LIFECYCLE_STATE_CODE', 'DISPOSED_INDIC', 'CFD_INDIC', 'AGE', 'ATTRIBUTE', 'COMPONENT', 'DE_MANAGER_USERID', 'ENCL-Description', 'ENGINEER', 'Headline', 'IMPACT', 'PRIORITY_CODE', 'PRODUCT','PROJECT', 'SS_INDIC','TS_INDIC', 'SEVERITY_CODE', 'SUBMITTED_DATE', 'SUBMITTER_ID', 'TICKETS_COUNT', 'VERSION_TEXT', 'IFD_CFD_INDIC','Prediction', 'test_pred', 'Final_prediction']]
            #test_df2 = test_df[['IDENTIFIER', 'LIFECYCLE_STATE_CODE', 'DISPOSED_INDIC', 'CFD_INDIC', 'Prediction', 'test_pred', 'Final_prediction']]
            test_df3 = test_df1[['IDENTIFIER', 'Days_Predictions', 'days_ahead']]

            final_test_df = pd.DataFrame()
            final_test_df = test_df2.join(test_df3.set_index('IDENTIFIER'), on='IDENTIFIER')
            final_test_df = final_test_df.drop_duplicates('IDENTIFIER')
            final_test_df['Prediction'] = final_test_df['Prediction']*100
            final_test_df['test_pred'] = final_test_df['test_pred']*100
            final_test_df['Final_prediction'] = final_test_df['Final_prediction']*100
            final_test_df['Cluster'] = cluster_id
            final_test_df['last_run_date'] = now.strftime("%Y-%m-%d")

            final_test_df = final_test_df[final_test_df['Final_prediction'] >= (cut_off*100)]
            print(final_test_df.shape)
            print(test_df1.shape)
            #Inserting data to view results collection
            if(view):
                vi_col_name_results = settings.get('Potential_CFD', 'viewPrefix') + str(view_id) + '_' + str(query_id) + '_results'
                collection = db[vi_col_name_results]
            
            else:
                vi_col_name_results = settings.get('Potential_CFD', 'viewPrefix') + str(bu_id) + '_' + str(query_id) + '_results'
                collection = db[vi_col_name_results]

            records = json2.loads(final_test_df.T.to_json(date_format='iso')).values()
            collection.create_index([("IDENTIFIER", pymongo.ASCENDING), ("last_run_date", pymongo.ASCENDING)], unique=True)
            print(collection.index_information())
            collection.insert(records)
            print("Inserted data to results collection")

            #Inserting data to View Mapper collection
            collection = db[settings.get('Potential_CFD', 'Pot_cfd_viewCluster')]
            df = pd.DataFrame(columns = ['viewSetCollectionName', 'trainedOnCollectionName', 'testCollectionName', 'clusterId', 'viewId', 'queryId', 'BU', 'projectList', 'csap_last_run_date', 'cutoff'])
            proj_list = ",".join(f_c)
            dat = now.strftime("%Y-%m-%d")
            
            if(view):
                df.loc[0] = [vi_col_name_results, tr_col_name, te_col_name, int(cluster_id), int(view_id), int(query_id), bu_id, proj_list, dat, float(cut_off*100)]
            
            else:
                df.loc[0] = [vi_col_name_results, tr_col_name, te_col_name, int(cluster_id), view_id, int(query_id), str(bu_id), proj_list, dat, float(cut_off*100)]

            records = json2.loads(df.T.to_json(date_format='iso')).values()
            collection.insert(records)
            print("Inserted data to View mapper collection")
        else:
            print("No predicted CFDs in this ViewSet")
Code example #16
File: tests.py Project: CIR2000/json-datetime
 def test_object_hook(self):
     decoded = json.loads(self.test, object_hook=self.hook)
     self.assertEqual(decoded.get('born'), self.expected)
     self.assertIn("hookjob", decoded)
Code example #19
def build_data_text(train_df, cluster, db):
	# Prepare embedding
	vocabulary = dict()
	inverse_vocabulary = ['<unk>']  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding
	print("Loading the W2V")
	#word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True) #WE NEED TO HAVE OUR OWN W2V MODEL
	#Our own word2vec model
	sentences = []
	for dataset in [train_df]:
		for index, row in dataset.iterrows():
			#print(text_to_word_list(row["complete1"]))
			#print(index)
			if(type(row['complete1']) != float):
				sentences.append(text_to_word_list(row["complete1"]))
			if(type(row['complete2']) != float):
				sentences.append(text_to_word_list(row["complete2"]))
	# train model
	model = Word2Vec(sentences, min_count=1, size = 150)
	# summarize the loaded model
	#print(model)
	# summarize vocabulary
	words = list(model.wv.vocab)
	# save model
	filename = '/data/ingestion/bugDuplicates/w2vmodel_' + str(cluster) + '.bin'
	model.save(filename) #model.save('/users/sumreddi/model_3.bin')
	c=0
	q1=[0]*(train_df.shape[0])
	q2=[0]*(train_df.shape[0])
	# Iterate over the questions only of both training and test datasets
	for dataset in [train_df]:#[train_df, test_df]:
		for index, row in dataset.iterrows():
			#print(index)
			# Iterate through the text of both questions of the row
			for question in text_cols:
				c = c + 1
				q2n = []  # q2n -> question numbers representation
				for word in text_to_word_list(row[question]):
					# Check for unwanted words
					if word in stops and word not in words: #word2vec.vocab:
						continue
					if word not in vocabulary:
						vocabulary[word] = len(inverse_vocabulary)
						q2n.append(len(inverse_vocabulary))
						inverse_vocabulary.append(word)
					else:
						q2n.append(vocabulary[word])
				# Replace questions as word to question as number representation
				if(c%2 != 0):
					 q1[index] = q2n
				else:
					 q2[index] = q2n
				#dataset.set_value(index, question, q2n)
	train_df['complete1'] = q1
	train_df['complete2'] = q2
	#Write the vocabulary and inverse vocabulary into a file
	f = '/data/ingestion/bugDuplicates/vocab_model_' + str(cluster) + '.json'
	#f = "/users/sumreddi/vocab_model_3.json"
	f = open(f,'w')
	json1 = json.dumps(vocabulary, indent=4)
	f.write(json1)
	f.close()
	thefile = '/data/ingestion/bugDuplicates/inv_vocab_model_' + str(cluster) + '.json'
	#thefile = "/users/sumreddi/inv_vocab_model_3.json" #str(settings.get("Potential_CFD","temp_path_mod_bugDuplicates")) + '/top_words_cluster_' +str(p)+'.txt'
	with open(thefile, 'wb') as fp:
		pickle.dump(inverse_vocabulary, fp, protocol=2)
	#Store train_df in a collection
	collection = db['BugDupsTrainSet_3_complete']
	train_df.reset_index(drop = True, inplace = True)
	records = json2.loads(train_df.T.to_json(date_format='iso')).values()
	collection.create_index([("IDENTIFIER", pymongo.ASCENDING), ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
	collection.insert(records)
	return vocabulary, model, train_df
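build_data_text relies on a text_to_word_list helper and a stops set that are not included in the snippet; a plausible minimal tokenizer of that name is sketched below (hypothetical, the project's real helper may clean the text differently). Note also that the Word2Vec size keyword above is gensim 3.x API; in gensim 4 it was renamed to vector_size.

# Hypothetical stand-ins for helpers referenced above; the real
# text_to_word_list and stops in the project may differ.
import re

stops = set()  # the real code filters against a stop-word list

def text_to_word_list(text):
    """Lower-case, strip non-alphanumeric characters, split on whitespace."""
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return text.split()

print(text_to_word_list("Router crashes when BGP session flaps!"))
# -> ['router', 'crashes', 'when', 'bgp', 'session', 'flaps']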
Code example #20
File: tests.py Project: CIR2000/json-datetime
 def test_nested_dicts(self):
     test = '{"updated": {"$gte": "Thu, 1 Mar 2012 10:00:49 UTC"}}'
     decoded = json.loads(test).get('updated').get('$gte')
     self.assertIs(type(decoded), datetime.datetime)
     self.assertEqual(decoded, self.expected)
Code example #21
def build_collection(df, duplicates, org_dup_ids, non_req_list, collection):
	print("ref_df starting")
	'''
	req_df = pd.DataFrame()
	j=0
	for i in range(0, int(df.shape[0]/100000)+1):
		print(i)
		a = df.iloc[j: j+1000000, ]
		b = a[a['IDENTIFIER'].isin(org_dup_ids)]
		req_df = req_df.append(b)
	'''
	req_df = df[df['IDENTIFIER'].isin(org_dup_ids)]
	print("req_df done")
	feature_columns_to_use = ['IDENTIFIER', 'Headline', 'ENCL-Description', 'DE_MANAGER_USERID', 'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT', 'COMPONENT', 'AGE',  'FEATURE', 'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC', 'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC', 'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC', 'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC', 'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC', 'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC', 'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC', 'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC', 'S12RD_INDIC', 'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC', 'S123_INDIC', 'S12_INDIC', 'RNE_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY',  'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC', 'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC', 'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC', 'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC', 'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC', 'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC', 'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC', 'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC', 'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC', 'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC',  'BADCODEFLAG',   'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC', 'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG', 'SUBMITTED_DATE']
	ids = list(req_df['IDENTIFIER'].unique())
	waste = []
	c=0
	print("new_duplicates staring")
	new_duplicates = duplicates[duplicates['DUPLICATE_OF'].isin(ids)]
	print("new_duplicates done")
	a = pd.DataFrame()
	b = pd.DataFrame()
	for i, row in new_duplicates.iterrows():
		c = c + 1
		#print(c, len(new_duplicates), 1)
		identifier = row['DUPLICATE_OF']
		if identifier in ids:
			b = b.append(req_df[req_df['IDENTIFIER'] == identifier][feature_columns_to_use])
		if(b.shape[0] >= 1000):
			a = a.append(b)
			del b
			b = pd.DataFrame()
	a = a.append(b)
	print(a.shape, new_duplicates.shape)
	#new_feature_columns = ['DUPLICATE_OF','Headline2', 'ENCL-Description2', 'DE_MANAGER_USERID2', 'SEVERITY_CODE2', 'LIFECYCLE_STATE_CODE2', 'PROJECT2', 'PRODUCT2', 'COMPONENT2', 'AGE2', 'FEATURE2', 'RELEASE_NOTE2', 'SA_ATTACHMENT_INDIC2', 'CR_ATTACHMENT_INDIC2', 'UT_ATTACHMENT_INDIC2', 'IMPACT2', 'ORIGIN2', 'IS_CUSTOMER_VISIBLE2', 'TICKETS_COUNT2', 'INCOMING_INDIC2', 'BACKLOG_INDIC2', 'DISPOSED_INDIC2', 'TS_INDIC2', 'SS_INDIC2', 'OIB_INDIC2', 'STATE_ASSIGN_INDIC2', 'STATE_CLOSE_INDIC2', 'STATE_DUPLICATE_INDIC2', 'STATE_FORWARD_INDIC2', 'STATE_HELD_INDIC2', 'STATE_INFO_INDIC2', 'STATE_JUNK_INDIC2', 'STATE_MORE_INDIC2', 'STATE_NEW_INDIC2', 'STATE_OPEN_INDIC2', 'STATE_POSTPONE_INDIC2', 'STATE_RESOLVE_INDIC2', 'STATE_SUBMIT_INDIC2', 'STATE_UNREP_INDIC2', 'STATE_VERIFY_INDIC2', 'STATE_WAIT_INDIC2', 'CFR_INDIC2', 'S12RD_INDIC2', 'S123RD_INDIC2', 'MISSING_SS_EVAL_INDIC2', 'S123_INDIC2', 'S12_INDIC2', 'RNE_INDIC2', 'UPDATED_BY2', 'DEV_ESCAPE_ACTIVITY2', 'RELEASED_CODE2', 'TEST_EDP_ACTIVITY2', 'TEST_EDP_PHASE2', 'RESOLVER_ANALYSIS_INDIC2', 'SUBMITTER_ANALYSIS_INDIC2', 'EDP_ANALYSIS_INDIC2', 'RETI_ANALYSIS_INDIC2', 'DESIGN_REVIEW_ESCAPE_INDIC2', 'STATIC_ANALYSIS_ESCAPE_INDIC2', 'FUNC_TEST_ESCAPE_INDIC2', 'SELECT_REG_ESCAPE_INDIC2', 'CODE_REVIEW_ESCAPE_INDIC2', 'UNIT_TEST_ESCAPE_INDIC2', 'DEV_ESCAPE_INDIC2', 'FEATURE_TEST_ESCAPE_INDIC2', 'REG_TEST_ESCAPE_INDIC2', 'SYSTEM_TEST_ESCAPE_INDIC2', 'SOLUTION_TEST_ESCAPE_INDIC2', 'INT_TEST_ESCAPE_INDIC2', 'GO_TEST_ESCAPE_INDIC2', 'COMPLETE_ESCAPE_INDIC2', 'SR_CNT2', 'PSIRT_INDIC2', 'BADCODEFLAG2',  'RISK_OWNER2', 'SIR2', 'PSIRT_FLAG2', 'URC_DISPOSED_INDIC2', 'CLOSED_DISPOSED_INDIC2', 'REGRESSION_BUG_FLAG2', 'SUBMITTED_DATE2']
	new_feature_columns = ['DUPLICATE_OF','DUP_Headline','DUP_ENCL-Description','DUP_DE_MANAGER_USERID','DUP_SEVERITY_CODE','DUP_LIFECYCLE_STATE_CODE','DUP_PROJECT','DUP_PRODUCT','DUP_COMPONENT','DUP_AGE','DUP_FEATURE','DUP_RELEASE_NOTE','DUP_SA_ATTACHMENT_INDIC','DUP_CR_ATTACHMENT_INDIC','DUP_UT_ATTACHMENT_INDIC','DUP_IMPACT','DUP_ORIGIN','DUP_IS_CUSTOMER_VISIBLE','DUP_TICKETS_COUNT','DUP_INCOMING_INDIC','DUP_BACKLOG_INDIC','DUP_DISPOSED_INDIC','DUP_TS_INDIC','DUP_SS_INDIC','DUP_OIB_INDIC','DUP_STATE_ASSIGN_INDIC','DUP_STATE_CLOSE_INDIC','DUP_STATE_DUPLICATE_INDIC','DUP_STATE_FORWARD_INDIC','DUP_STATE_HELD_INDIC','DUP_STATE_INFO_INDIC','DUP_STATE_JUNK_INDIC','DUP_STATE_MORE_INDIC','DUP_STATE_NEW_INDIC','DUP_STATE_OPEN_INDIC','DUP_STATE_POSTPONE_INDIC','DUP_STATE_RESOLVE_INDIC','DUP_STATE_SUBMIT_INDIC','DUP_STATE_UNREP_INDIC','DUP_STATE_VERIFY_INDIC','DUP_STATE_WAIT_INDIC','DUP_CFR_INDIC','DUP_S12RD_INDIC','DUP_S123RD_INDIC','DUP_MISSING_SS_EVAL_INDIC','DUP_S123_INDIC','DUP_S12_INDIC','DUP_RNE_INDIC','DUP_UPDATED_BY','DUP_DEV_ESCAPE_ACTIVITY','DUP_RELEASED_CODE','DUP_TEST_EDP_ACTIVITY','DUP_TEST_EDP_PHASE','DUP_RESOLVER_ANALYSIS_INDIC','DUP_SUBMITTER_ANALYSIS_INDIC','DUP_EDP_ANALYSIS_INDIC','DUP_RETI_ANALYSIS_INDIC','DUP_DESIGN_REVIEW_ESCAPE_INDIC','DUP_STATIC_ANALYSIS_ESCAPE_INDIC','DUP_FUNC_TEST_ESCAPE_INDIC','DUP_SELECT_REG_ESCAPE_INDIC','DUP_CODE_REVIEW_ESCAPE_INDIC','DUP_UNIT_TEST_ESCAPE_INDIC','DUP_DEV_ESCAPE_INDIC','DUP_FEATURE_TEST_ESCAPE_INDIC','DUP_REG_TEST_ESCAPE_INDIC','DUP_SYSTEM_TEST_ESCAPE_INDIC','DUP_SOLUTION_TEST_ESCAPE_INDIC','DUP_INT_TEST_ESCAPE_INDIC','DUP_GO_TEST_ESCAPE_INDIC','DUP_COMPLETE_ESCAPE_INDIC','DUP_SR_CNT','DUP_PSIRT_INDIC','DUP_BADCODEFLAG', 'DUP_RISK_OWNER','DUP_SIR','DUP_PSIRT_FLAG','DUP_URC_DISPOSED_INDIC','DUP_CLOSED_DISPOSED_INDIC','DUP_REGRESSION_BUG_FLAG','DUP_SUBMITTED_DATE']
	for i in range(0, len(new_feature_columns)):
		#print(i)
		new_duplicates[new_feature_columns[i]] = list(a[feature_columns_to_use[i]])
	new_duplicates['is_duplicate'] = 1
	new_non_duplicates = pd.DataFrame()
	non_a_1 = pd.DataFrame()
	non_a_2 = pd.DataFrame()
	non_a_1_sample = pd.DataFrame()
	non_a_2_sample = pd.DataFrame()
	a = pd.DataFrame()
	for c in range(0, len(non_req_list)):
		tup_id = non_req_list[c]
		if(c%1000 == 0):
			non_a_1 = non_a_1.append(non_a_1_sample)
			non_a_2 = non_a_2.append(non_a_2_sample)
			del non_a_1_sample 
			del non_a_2_sample 
			non_a_1_sample = pd.DataFrame()
			non_a_2_sample = pd.DataFrame()
		if(c%4 == 0):
			a = df[df['IDENTIFIER'] == tup_id[0]][feature_columns_to_use]
			non_a_1_sample = non_a_1_sample.append(a)
			non_a_2_sample = non_a_2_sample.append(df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
		else:
			non_a_1_sample = non_a_1_sample.append(a)
			non_a_2_sample = non_a_2_sample.append(df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
		#print(c, len(non_req_list))
	new_non_duplicates = non_a_1
	print(new_non_duplicates.columns)
	for i in range(0, len(new_feature_columns)):
		#print(i)
		new_non_duplicates[new_feature_columns[i]] = list(non_a_2[feature_columns_to_use[i]])
	new_non_duplicates['is_duplicate'] = 0
	new_duplicates_1 = new_duplicates[list(new_non_duplicates.columns)]
	new_duplicates_1 = new_duplicates_1.append(new_non_duplicates)
	new_duplicates_1.reset_index(drop = True, inplace = True)
	records = json2.loads(new_duplicates_1.T.to_json(date_format='iso')).values()
	print(new_duplicates_1)
	#collection = db['BugDupsTrainSet_all_639_1968_new']
	collection.create_index([("IDENTIFIER", pymongo.ASCENDING), ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
	print(collection.index_information())
	collection.drop()
	collection.insert(records)
	print("Inserted data to results collection")
	return 0
Code example #22
        ]]
        test_df3 = test_df1[[
            'IDENTIFIER', 'Ticket_Predictions', 'TICKETS_COUNT',
            'Days_Predictions', 'days_taken'
        ]]

        final_test_df = test_df2.join(test_df3.set_index('IDENTIFIER'),
                                      on='IDENTIFIER')
        final_test_df = final_test_df.drop_duplicates('IDENTIFIER')
        final_test_df[
            'Final_prediction'] = final_test_df['Final_prediction'] * 100
        print("Predictions completed...")

        #Writing the test data into a collection
        final_test_df.reset_index(drop=True, inplace=True)
        records = json2.loads(
            final_test_df.T.to_json(date_format='iso')).values()
        test_collection = settings.get('Potential_CFD',
                                       'testPrefix') + str(p) + '_ticketCNT'
        print(db[test_collection], len(records))
        print(final_test_df.iloc[3])
        db[test_collection].drop()
        db[test_collection].insert_many(records)
        print("Inserted to collection")


'''
def get_collection_details():
    #Setting the MongoDB configurations
    hostname = settings.get('csap_prod_database', 'host')
    port = settings.get('csap_prod_database', 'dbPort')
    username = settings.get('csap_prod_database', 'user')
				a = topics_matrix[topic]
				for i in range(0,10):
					top.append(topic)
					top_words.append(a[1][i][0])
					top_probs.append(a[1][i][1])
			topics_df['Topic_number'] = top
			topics_df['keyword'] = top_words
			topics_df['probability'] = top_probs
			topics_df['PF'] = pf
			topics_df['underlying_cause_desc'] = desc
			topics_df['underlying_cause_code'] = ccode
			#final_key_df = final_key_df.append(topics_df)

			sample_1 = topics_df[['keyword', 'probability', 'PF', 'underlying_cause_desc', 'underlying_cause_code']]
			sample_1.reset_index(drop = True, inplace = True)
			records = json2.loads(sample_1.T.to_json(date_format='iso')).values()
			db.SR_topic_keywords_cause_desc.insert(records)

			print(desc)
			tk_asa_df = topics_df
			asa_df_1 = final_df
			list_of_keywords_asa = top_words
			keyword_srcases_df = pd.DataFrame()
			for word in list_of_keywords_asa:
				new_word = lemma.lemmatize(word,'v')
				sr_cases = []
				temp = tk_asa_df[tk_asa_df['keyword'] == new_word]
				if(temp.size != 0):
					topics = temp.Topic_number.unique()
					for topic in topics:
						temp2 = asa_df_1[asa_df_1['topic']==topic][asa_df_1['topic_probability'] > 0.87]
Code example #24
def bugDuplicate_view_set(e, cluster_id, stops, max_seq_length, text_cols,
                          embedding_dim, view_id, query_id, cut_off):
    embedding_vecor_length = 32
    model1 = Sequential()
    model1.add(Embedding(3458230 + 1, embedding_vecor_length,
                         input_length=6))  #len(X_test[0])))
    model1.add(
        Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model1.add(MaxPooling1D(pool_size=2))
    model1.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model1.add(Dense(1, activation='sigmoid'))
    model1.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
    filename = '/data/csap_models/bugDups/model3_lstm_cnn_cat_' + str(
        cluster_id) + '.h5'
    model1 = load_model(
        filename
    )  #model = load_model('/data/csap_models/bugDups/model3_lstm_cnn_cat.h5')

    filename = '/data/csap_models/bugDups/w2vmodel_' + str(cluster_id) + '.bin'
    w2vmodel = Word2Vec.load(filename)
    f = '/data/csap_models/bugDups/vocab_model_' + str(cluster_id) + '.json'
    vocabulary = json.load(open(f, 'r'))
    thefile = '/data/csap_models/bugDups/inv_vocab_model_' + str(
        cluster_id) + '.json'
    with open(thefile, 'rb') as fp:
        inverse_vocabulary = pickle.load(fp)

    words = list(w2vmodel.wv.vocab)

    thefile = "/data/csap_models/bugDups/embeddings_model_" + str(
        cluster_id) + '.json'
    with open(thefile, 'rb') as fp:
        embeddings = pickle.load(fp)

    n_hidden = 50
    gradient_clipping_norm = 1.25
    batch_size = 64
    n_epoch = 1  #25

    def exponent_neg_manhattan_distance(left, right):
        return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))

    left_input = Input(shape=(max_seq_length, ), dtype='int32')
    right_input = Input(shape=(max_seq_length, ), dtype='int32')
    embedding_layer = Embedding(len(embeddings),
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_seq_length,
                                trainable=False)
    # Embedded version of the inputs
    encoded_left = embedding_layer(left_input)
    encoded_right = embedding_layer(right_input)
    # Since this is a siamese network, both sides share the same LSTM
    shared_lstm = LSTM(n_hidden)
    left_output = shared_lstm(encoded_left)
    right_output = shared_lstm(encoded_right)
    # Calculates the distance as defined by the MaLSTM model
    malstm_distance = Merge(
        mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
        output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
    # Pack it all up into a model
    malstm = Model([left_input, right_input], [malstm_distance])
    # Adadelta optimizer, with gradient clipping by norm
    optimizer = Adadelta(clipnorm=gradient_clipping_norm)
    malstm.compile(loss='mean_squared_error',
                   optimizer=optimizer,
                   metrics=['accuracy'])
    # load weights into new model
    filename = '/data/csap_models/bugDups/text_model_' + str(
        cluster_id) + '.h5'
    malstm.load_weights(filename)

    final_df = pd.DataFrame()
    duplicate_bugs_length = sum(e['is_duplicate'] == 1)

    for j in range(0, duplicate_bugs_length):
        b1 = e.iloc[j, ]
        duplicate_of = b1['DUPLICATE_OF']
        is_duplicates = b1['is_duplicate']
        df = e[(e['PRODUCT2'] == b1['PRODUCT']
                )]  # & (e['PROJECT2'] == b1['PROJECT'])
        print(j, duplicate_bugs_length, df.shape[0])
        df1 = get_test_data(df, b1)
        #X_test, Y_test = build_test_data_cat(df1)
        #cat_predictions = test_model_cat(model1, X_test, Y_test, cluster_id)
        #vocabulary, w2vmodel, words, test_df = build_test_data_text(w2vmodel, vocabulary, inverse_vocabulary, df1, cluster_id)
        #text_predictions = test_model_text(malstm, embeddings, vocabulary, w2vmodel, words, df1, cluster_id)
        X_test = {'left': df1.complete1, 'right': df1.complete2}
        for dataset, side in itertools.product([X_test], ['left', 'right']):
            dataset[side] = pad_sequences(dataset[side], maxlen=150)
        #print("predictions started")
        text_predictions = malstm.predict([X_test['left'], X_test['right']])
        #print("predictions done")
        result = pd.DataFrame()
        result['IDENTIFIER'] = df1['IDENTIFIER']
        result['DUPLICATE_OF'] = df1['DUPLICATE_OF']
        result['Headline'] = df1['Headline']
        result['ENCL-Description'] = df1['ENCL-Description']
        #p = []
        #for i in cat_predictions:
        #	p.append(i[0]*100)
        #d['pred_cat'] = p
        p = []
        for i in text_predictions:
            p.append(i[0] * 100)
        result['pred_text'] = p
        result = result.drop_duplicates(subset='DUPLICATE_OF', keep="last")
        result = result.sort_values(['pred_text'], ascending=[0])
        result = result[result['pred_text'] > cut_off]
        if (result.shape[0] > 10):
            v = 10
        else:
            v = result.shape[0]
        if (v != 0):
            df2 = pd.DataFrame()
            l = []
            l.append(result.iloc[0, ]['IDENTIFIER'])
            df2['IDENTIFIER'] = l
            df2['Headline'] = df1['Headline'].iloc[0]
            df2['ENCL-Description'] = df1['ENCL-Description'].iloc[0]
            #df2['PRODUCT'] = result.iloc[0,]['PRODUCT']
            #df2['PROJECT'] = b1['PROJECT']
            #df2['Headline'] = b1['Headline']
            df2['DUPLICATE_LIST'] = ' '.join(
                list(result.iloc[0:v]['DUPLICATE_OF']))
            df2['PROBABILITIES'] = ' '.join(
                str(x) for x in list(result.iloc[0:v]['pred_text']))
            if (is_duplicates == 1):
                df2['actual_duplicate'] = duplicate_of
            else:
                df2['actual_duplicate'] = ""
            final_df = final_df.append(df2)

    res_coll_name = "BugDupsTestSet_" + str(view_id) + "_" + str(
        query_id) + "_results"
    collection = db[res_coll_name]

    final_df = final_df.drop_duplicates(subset='IDENTIFIER', keep="last")
    final_df.reset_index(drop=True, inplace=True)
    final_df = final_df.drop_duplicates(subset='IDENTIFIER', keep="last")
    filename = res_coll_name + '.csv'
    final_df.to_csv(filename, encoding='utf-8')
    records = json2.loads(final_df.T.to_json(date_format='iso')).values()

    collection.create_index([("IDENTIFIER", pymongo.ASCENDING)], unique=True)
    collection.insert(records)
    return final_df
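The Merge(mode=...) call above is Keras 1 API; in Keras 2 the same MaLSTM distance is usually wired up with a Lambda layer. A self-contained hedged sketch of that equivalent (toy input size, not the shapes used above):

# Keras 2 style equivalent of the Merge(mode=...) distance above
# (sketch, with toy shapes).
import numpy as np
from keras import backend as K
from keras.layers import Input, Lambda
from keras.models import Model

left = Input(shape=(50,))
right = Input(shape=(50,))
distance = Lambda(
    lambda x: K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)),
    output_shape=lambda shapes: (shapes[0][0], 1),
)([left, right])
model = Model([left, right], distance)
print(model.predict([np.zeros((1, 50)), np.ones((1, 50))]))  # exp(-50)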
Code example #26
def build_collection(df, duplicates, org_dup_ids, non_req_list, db):
    # Pair every duplicate bug with the bug it duplicates, add randomly sampled
    # non-duplicate pairs, and store the combined training set in MongoDB.
    req_df = df[df['IDENTIFIER'].isin(org_dup_ids)]

    feature_columns_to_use = ['IDENTIFIER', 'Headline', 'ENCL-Description', 'DE_MANAGER_USERID', 'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT', 'COMPONENT', 'AGE', 'FEATURE', 'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC', 'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC', 'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC', 'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC', 'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC', 'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC', 'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC', 'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC', 'S12RD_INDIC', 'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC', 'S123_INDIC', 'S12_INDIC', 'RNE_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC', 'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC', 'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC', 'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC', 'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC', 'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC', 'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC', 'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC', 'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC', 'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG', 'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC', 'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG']
    #nonnumeric_columns = ['DE_MANAGER_USERID', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT', 'COMPONENT', 'FEATURE', 'RELEASE_NOTE', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'TICKETS_COUNT', 'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'BADCODEFLAG', 'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'REGRESSION_BUG_FLAG']

    #duplicates['Headline2'] = ""
    #duplicates['ENCL-Description2'] = ""

    ids = list(req_df['IDENTIFIER'].unique())

    waste = []
    headlines = []
    encl_descriptions = []
    c = 0
    new_duplicates = duplicates[duplicates['DUPLICATE_OF'].isin(ids)]

    # collect, in iteration order, the full feature row of each duplicated-of bug
    a = pd.DataFrame()
    for i, row in new_duplicates.iterrows():
        c = c + 1
        print(c)
        identifier = row['DUPLICATE_OF']
        if identifier in ids:
            #duplicates.ix[i, 'Headline2'] = list(req_df[req_df['IDENTIFIER'] == identifier]['Headline'])[0]
            #duplicates.ix[i, 'ENCL-Description2'] = list(req_df[req_df['IDENTIFIER'] == identifier]['ENCL-Description'])[0]
            a = a.append(req_df[req_df['IDENTIFIER'] == identifier][feature_columns_to_use])
            #headlines.append(list(req_df[req_df['IDENTIFIER'] == identifier]['Headline'])[0])
            #encl_descriptions.append(list(req_df[req_df['IDENTIFIER'] == identifier]['ENCL-Description'])[0])
        else:
            waste.append(c)

    #a['DUPLICATE_OF'] = a['IDENTIFIER']
    new_feature_columns = ['DUPLICATE_OF', 'Headline2', 'ENCL-Description2', 'DE_MANAGER_USERID2', 'SEVERITY_CODE2', 'LIFECYCLE_STATE_CODE2', 'PROJECT2', 'PRODUCT2', 'COMPONENT2', 'AGE2', 'FEATURE2', 'RELEASE_NOTE2', 'SA_ATTACHMENT_INDIC2', 'CR_ATTACHMENT_INDIC2', 'UT_ATTACHMENT_INDIC2', 'IMPACT2', 'ORIGIN2', 'IS_CUSTOMER_VISIBLE2', 'TICKETS_COUNT2', 'INCOMING_INDIC2', 'BACKLOG_INDIC2', 'DISPOSED_INDIC2', 'TS_INDIC2', 'SS_INDIC2', 'OIB_INDIC2', 'STATE_ASSIGN_INDIC2', 'STATE_CLOSE_INDIC2', 'STATE_DUPLICATE_INDIC2', 'STATE_FORWARD_INDIC2', 'STATE_HELD_INDIC2', 'STATE_INFO_INDIC2', 'STATE_JUNK_INDIC2', 'STATE_MORE_INDIC2', 'STATE_NEW_INDIC2', 'STATE_OPEN_INDIC2', 'STATE_POSTPONE_INDIC2', 'STATE_RESOLVE_INDIC2', 'STATE_SUBMIT_INDIC2', 'STATE_UNREP_INDIC2', 'STATE_VERIFY_INDIC2', 'STATE_WAIT_INDIC2', 'CFR_INDIC2', 'S12RD_INDIC2', 'S123RD_INDIC2', 'MISSING_SS_EVAL_INDIC2', 'S123_INDIC2', 'S12_INDIC2', 'RNE_INDIC2', 'UPDATED_BY2', 'DEV_ESCAPE_ACTIVITY2', 'RELEASED_CODE2', 'TEST_EDP_ACTIVITY2', 'TEST_EDP_PHASE2', 'RESOLVER_ANALYSIS_INDIC2', 'SUBMITTER_ANALYSIS_INDIC2', 'EDP_ANALYSIS_INDIC2', 'RETI_ANALYSIS_INDIC2', 'DESIGN_REVIEW_ESCAPE_INDIC2', 'STATIC_ANALYSIS_ESCAPE_INDIC2', 'FUNC_TEST_ESCAPE_INDIC2', 'SELECT_REG_ESCAPE_INDIC2', 'CODE_REVIEW_ESCAPE_INDIC2', 'UNIT_TEST_ESCAPE_INDIC2', 'DEV_ESCAPE_INDIC2', 'FEATURE_TEST_ESCAPE_INDIC2', 'REG_TEST_ESCAPE_INDIC2', 'SYSTEM_TEST_ESCAPE_INDIC2', 'SOLUTION_TEST_ESCAPE_INDIC2', 'INT_TEST_ESCAPE_INDIC2', 'GO_TEST_ESCAPE_INDIC2', 'COMPLETE_ESCAPE_INDIC2', 'SR_CNT2', 'PSIRT_INDIC2', 'BADCODEFLAG2', 'RISK_OWNER2', 'SIR2', 'PSIRT_FLAG2', 'URC_DISPOSED_INDIC2', 'CLOSED_DISPOSED_INDIC2', 'REGRESSION_BUG_FLAG2']
    # copy every feature of the duplicated-of bug onto the paired row under its
    # '<name>2' column (index 0 fills DUPLICATE_OF, indexes 1-2 fill Headline2
    # and ENCL-Description2)
    for i in range(0, len(new_feature_columns)):
        print(i)
        new_duplicates[new_feature_columns[i]] = list(a[feature_columns_to_use[i]])

    #new_duplicates[new_feature_columns] = a[feature_columns_to_use]

    new_duplicates['is_duplicate'] = 1
    #new_duplicates = duplicates.drop(duplicates.index[waste])

    #For non dups dataset...
    #Fetch for 1st row and copy for next 3 indexes
    #Then for Headline2 do the same as above...

    identifiers = []
    dup_identifiers = []
    headlines1 = []
    encl_descriptions1 = []
    headlines2 = []
    encl_descriptions2 = []

    # non_a_1 holds the left-hand bug of each sampled non-duplicate pair and
    # non_a_2 the right-hand bug; the left-hand row is only re-fetched every
    # fourth pair and reused in between
    new_non_duplicates = pd.DataFrame()
    non_a_1 = pd.DataFrame()
    non_a_2 = pd.DataFrame()
    a = pd.DataFrame()
    for c in range(0, len(non_req_list)):
        tup_id = non_req_list[c]
        if (c % 4 == 0):
            #a = df[df['IDENTIFIER'] == tup_id[0]]
            #h = list(a['Headline'])[0]
            #e = list(a['ENCL-Description'])[0]
            a = df[df['IDENTIFIER'] == tup_id[0]][feature_columns_to_use]
            non_a_1 = non_a_1.append(a)
            #for j in range(4):
            #identifiers.append(tup_id[0])
            #headlines1.append(h)
            #encl_descriptions1.append(e)
            #a1 = df[df['IDENTIFIER'] == tup_id[1]]
            non_a_2 = non_a_2.append(df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
            #dup_identifiers.append(tup_id[1])
            #headlines2.append(list(a1['Headline'])[0])
            #encl_descriptions2.append(list(a1['ENCL-Description'])[0])
        else:
            #identifiers.append(tup_id[0])
            #headlines1.append(h)
            #encl_descriptions1.append(e)
            non_a_1 = non_a_1.append(a)
            #a1 = df[df['IDENTIFIER'] == tup_id[1]]
            non_a_2 = non_a_2.append(df[df['IDENTIFIER'] == tup_id[1]][feature_columns_to_use])
            #dup_identifiers.append(tup_id[1])
            #headlines2.append(list(a1['Headline'])[0])
            #encl_descriptions2.append(list(a1['ENCL-Description'])[0])
        print(c)

    new_non_duplicates = non_a_1
    #non_a_2['DUPLICATE_OF'] = non_a_2['IDENTIFIER']
    print(new_non_duplicates.columns)
    # map the sampled partner's features onto the '<name>2' columns (index 0
    # maps its IDENTIFIER onto DUPLICATE_OF, indexes 1-2 fill Headline2 and
    # ENCL-Description2)
    for i in range(0, len(new_feature_columns)):
        print(i)
        new_non_duplicates[new_feature_columns[i]] = list(non_a_2[feature_columns_to_use[i]])

    new_non_duplicates['is_duplicate'] = 0

    # combine duplicate and non-duplicate pairs into one training DataFrame
    new_duplicates_1 = new_duplicates[list(new_non_duplicates.columns)]
    #new_duplicates_1 = new_duplicates[['IDENTIFIER', 'DUPLICATE_OF', 'Headline', 'Headline2', 'ENCL-Description', 'ENCL-Description2', 'is_duplicate']]
    new_duplicates_1 = new_duplicates_1.append(new_non_duplicates)

    new_duplicates_1.reset_index(drop=True, inplace=True)
    # serialise to JSON (ISO dates) and back so the records are plain dicts for MongoDB
    records = json2.loads(new_duplicates_1.T.to_json(date_format='iso')).values()

    collection = db['BugDupsTrainSet_436_1452_new']
    collection.create_index([("IDENTIFIER", pymongo.ASCENDING), ("DUPLICATE_OF", pymongo.ASCENDING)], unique=True)
    print(collection.index_information())
    collection.insert(records)
    print("Inserted data to results collection")



def fetch_bugs_list(db, options):
    cluster_id = options.cluster
    # Getting the clusters data
    collection = db[settings.get('Potential_CFD', 'proj_cluster')]
    cursor = collection.find({})
    clusters = pd.DataFrame(list(cursor))
    project_clusters = []
    cluster_status = True
    groups = clusters.groupby('Cluster')

    for name, group in groups:
        project_clusters.append(list(group['Project']))

    print(project_clusters)

    cluster = project_clusters[cluster_id - 1]

    df = pd.DataFrame()
    # Fetching the data for each project in the cluster
    for proj in cluster:
        collection = db[settings.get('Potential_CFD', 'trainPrefix') + proj.replace('.', '_')]
        cursor = collection.find({})
        print(proj)
        df2 = pd.DataFrame(list(cursor))
        df = df.append(df2)

    print(df['PROJECT'].unique())

    duplicates = df[df['DUPLICATE_OF'].notnull()]
    non_duplicates = df[df['DUPLICATE_OF'].isnull()]
    print(duplicates[['IDENTIFIER', 'DUPLICATE_OF']])
    # https://stackoverflow.com/questions/48220643/add-values-to-an-existing-dataframe-from-list-of-tuples
    req_list = list(zip(duplicates['IDENTIFIER'], duplicates['DUPLICATE_OF']))
    org_dup_ids = list(duplicates['DUPLICATE_OF'].unique())
    dup_ids = list(duplicates['IDENTIFIER'])
    non_dup_ids = list(non_duplicates['IDENTIFIER'])

    # pair every duplicate bug with two randomly chosen non-duplicate bugs
    non_req_list = []
    for id in dup_ids:
        rand_items = random.sample(non_dup_ids, 2)
        for i in rand_items:
            req_list.append((id, i))
            non_req_list.append((id, i))

    org_non_dups_ids = [j for i, j in req_list]
    build_collection(df, duplicates, org_dup_ids, non_req_list, db)


def main():
    options = parse_options()
    if (options.env == "Prod"):
        key = "csap_prod_database"
    else:
        key = "csap_stage_database"

    db = get_db(settings, key)
    fetch_bugs_list(db, options)
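
# Hypothetical entry point (not part of the original snippet): run the pipeline
# when this module is executed as a script.
if __name__ == "__main__":
    main()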
コード例 #27
0
ファイル: generic.py プロジェクト: aprimadi/gsapi
def json_renderer(**data):
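    # dumps/loads round-trip (presumably bson.json_util here) converts
    # non-JSON-native values into JSON-safe types before jsonify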
    return jsonify(loads(dumps(data)))