Exemple #1
0
    def __init__(self):
        """Load the feature table and register the named feature groups.

        Reads ``FinalFeatures.xlsx`` (relative to the working directory) into
        ``self.featuresDF`` and uses its ``IsEfficient`` column as the labels.
        ``self.group_dic`` maps the keys 0-4 to ``[column_names, group_name]``
        pairs, one per feature family declared below.
        """
        def stamp():
            # Current local time, formatted for the progress messages.
            return time.asctime(time.localtime(time.time()))

        self.X_train = None
        self.feature_names = None

        print('{}: Loading the data '.format(stamp()))
        self.featuresDF = pd.read_excel('FinalFeatures.xlsx')
        self.labels = self.featuresDF['IsEfficient']

        self.submission_author_features = [
            'submission_author_number_original_subreddit',
            'submission_author_number_recommend_subreddit',
            'submission_created_time_hour',
        ]
        self.sub_comment_author_relation_features = [
            'cosine_similarity_subreddits_list',
            'comment_submission_similarity',
            'comment_title_similarity',
        ]
        self.comment_author_features = [
            'comment_author_number_original_subreddit',
            'comment_author_number_recommend_subreddit',
            'percent_efficient_references_comment_author',
            'number_of_references_comment_author',
        ]
        # NOTE(review): 'submission_created_time_hour' is also listed in
        # submission_author_features above -- confirm the repeat is intended.
        self.comment_features = [
            'comment_created_time_hour',
            'submission_created_time_hour',
            'time_between_messages',
            'comment_len',
            'number_of_r',
            'number_of_references_to_submission',
        ]
        self.subreddit_features = [
            'number_of_references_to_recommended_subreddit',
            'subreddits_similarity',
        ]

        # Group key -> [columns of the group, printable group name].
        named_groups = (
            (self.submission_author_features, 'submission_author_features'),
            (self.sub_comment_author_relation_features,
             'sub_comment_author_relation_features'),
            (self.comment_author_features, 'comment_author_features'),
            (self.comment_features, 'comment_features'),
            (self.subreddit_features, 'subreddit_features'),
        )
        self.group_dic = {
            key: [columns, label]
            for key, (columns, label) in enumerate(named_groups)
        }

        print('{}: Data loaded '.format(stamp()))
    def create_data_no_feature_selection(self):
        """
        Train the models on every feature group at once (no feature selection).

        The column lists of all entries in ``self.group_dict`` are flattened
        into ``self.features``; ``self.X_train`` is built from those columns
        plus ``group_number``. Each row returned by ``self.models_iteration()``
        is extended with the group names and ``opts.k_fold`` before being
        tabulated.

        NOTE(review): the initialisers visible in this file store the mapping
        as ``group_dic``; confirm which spelling the owning class defines.

        :return: DataFrame with the columns classifier_name, score, auc,
                 train_time, features_list and k_fold
        """
        group_keys = list(self.group_dict.keys())
        column_lists = [self.group_dict[key][0] for key in group_keys]
        self.features = [
            column for columns in column_lists for column in columns
        ]

        # The training frame carries 'group_number' next to the feature
        # columns; self.features itself stays free of it.
        training_columns = list(self.features)
        training_columns.append('group_number')
        self.X_train = self.featuresDF[training_columns]

        features_names = [self.group_dict[key][1] for key in group_keys]
        start_template = '{}: Start training with the groups: {}'
        print(start_template.format(
            time.asctime(time.localtime(time.time())), features_names))
        logging.info(start_template.format(
            time.asctime(time.localtime(time.time())), features_names))

        group_results = self.models_iteration()
        for result_row in group_results:
            result_row.append(features_names)
            result_row.append(opts.k_fold)

        return pd.DataFrame(
            group_results,
            columns=[
                'classifier_name', 'score', 'auc', 'train_time',
                'features_list', 'k_fold'
            ])
Exemple #3
0
    def __init__(self):
        """Load the raw table and register the six named feature groups.

        ``FinalFeatures_with_comment_time.xlsx`` is read into
        ``self.original_data``; ``self.featuresDF`` and ``self.labels`` start
        as ``None``. ``self.group_dic`` maps the keys 0-5 to
        ``[column_names, group_name]`` pairs.
        """
        def stamp():
            # Current local time, formatted for the progress messages.
            return time.asctime(time.localtime(time.time()))

        self.X_train = None
        self.features = None
        self.feature_names = None

        print('{}: Loading the data: FinalFeatures_with_comment_time'.format(
            stamp()))
        self.original_data = pd.read_excel(
            'FinalFeatures_with_comment_time.xlsx')
        self.labels = None
        self.featuresDF = None

        self.submission_author_features = [
            'submission_author_number_original_subreddit',
            'submission_author_number_recommend_subreddit',
            'submission_created_time_hour',
        ]
        self.sub_comment_author_relation_features = [
            'cosine_similarity_subreddits_list',
            'comment_submission_similarity',
            'comment_title_similarity',
        ]
        # 'percent_efficient_references_comment_author' is deliberately not
        # part of this group.
        self.comment_author_features = [
            'comment_author_number_original_subreddit',
            'comment_author_number_recommend_subreddit',
            'number_of_references_comment_author',
        ]
        self.comment_features = [
            'comment_created_time_hour',
            'time_between_messages',
            'comment_len',
            'number_of_r',
            'number_of_references_to_submission',
        ]
        self.subreddit_features = [
            'number_of_references_to_recommended_subreddit',
            'subreddits_similarity',
        ]
        # Text columns are addressed by integer label: 100 of them for
        # Word2Vec / 100Doc2Vec (50Doc2Vec would use range(50)).
        self.text_features = range(100)

        # Group key -> [columns of the group, printable group name].
        named_groups = (
            (self.submission_author_features, 'submission_author_features'),
            (self.sub_comment_author_relation_features,
             'sub_comment_author_relation_features'),
            (self.comment_author_features, 'comment_author_features'),
            (self.comment_features, 'comment_features'),
            (self.subreddit_features, 'subreddit_features'),
            (self.text_features, 'text_features'),
        )
        self.group_dic = {
            key: [columns, label]
            for key, (columns, label) in enumerate(named_groups)
        }

        print('{}: Data loaded '.format(stamp()))
 def split_relevant_data(self, Peff_up_threshold, Peff_down_threshold):
     """Select the rows inside a Peff window and assign them to CV groups.

     Rows of ``self.original_data`` whose
     ``percent_efficient_references_comment_author`` lies in
     ``[Peff_down_threshold, Peff_up_threshold]`` are copied into
     ``self.featuresDF``. Walking the rows in order, a new group is opened
     once the current one holds at least ``rows / opts.k_fold`` samples AND
     the ``comment_author`` changes, so a run of rows by one author is never
     cut in two.

     NOTE(review): keeping every author in a single group relies on rows of
     the same author being adjacent in ``original_data`` -- confirm.

     Side effects: ``opts.k_fold`` is forced to 4 for upper thresholds of
     50.0, 60.0 and 100.0 and is finally overwritten with the number of
     groups actually produced; ``self.labels`` is set to the ``IsEfficient``
     and ``group_number`` columns.

     :param Peff_up_threshold: inclusive upper bound of the Peff window
     :param Peff_down_threshold: inclusive lower bound of the Peff window
     :return: None
     """
     def report(message, log=True):
         # Timestamped progress line on stdout and, unless disabled, the log.
         line = '{}: {}'.format(
             time.asctime(time.localtime(time.time())), message)
         print(line)
         if log:
             logging.info(line)

     peff = self.original_data['percent_efficient_references_comment_author']
     in_window = (peff <= Peff_up_threshold) & (peff >= Peff_down_threshold)
     # Copy so the column added below lands on an independent frame instead
     # of on a slice of original_data.
     self.featuresDF = self.original_data.loc[in_window].copy()

     if Peff_up_threshold in (50.0, 60.0, 100.0):
         opts.k_fold = 4
     sample_per_group = self.featuresDF.shape[0] / opts.k_fold

     group = 0
     in_group = 0
     last_comment_author = ''
     group_numbers = []
     for author in self.featuresDF['comment_author']:
         over_quota = in_group >= sample_per_group
         opens_new_group = over_quota and last_comment_author != author
         if opens_new_group:
             report(
                 'finish split samples for group number {} with {} samples'
                 .format(group, in_group))
             group += 1
             report('start split samples for group number {}'.format(group))
             in_group = 0
         # Stored as float: that is the dtype the removed
         # DataFrame.set_value produced when it created this column.
         group_numbers.append(float(group))
         in_group += 1
         last_comment_author = author
         if over_quota and not opens_new_group:
             # Same author as the previous row: the group grows past quota.
             report('{} group is larger, number of samples is: {}'.format(
                 group, in_group), log=False)

     # One positional column assignment replaces the per-row set_value calls
     # (removed in pandas 1.0) and also creates the column when no row
     # matched the window.
     self.featuresDF['group_number'] = group_numbers

     report('finish split samples for group number {} with {} samples'.format(
         group, in_group))
     opts.k_fold = group + 1  # groups are numbered from 0
     self.labels = self.featuresDF[['IsEfficient', 'group_number']]
     report('Finish split the data for Peff between: {} and {}'.format(
         Peff_down_threshold, Peff_up_threshold))
Exemple #5
0
    def iterate_over_features_groups(self, peff_up_threshold,
                                     peff_down_threshold):
        """Train the models on every combination of feature groups.

        All combinations of the group keys 0-5 (sizes 1 to 6) are enumerated,
        but only those containing key 5 are trained. For each one
        ``self.features`` / ``self.X_train`` are rebuilt and
        ``self.ModelsIteration()`` is run; each result row is extended with
        the group names, ``opts.k_fold`` and both thresholds. The accumulated
        table is rewritten to ``test_results.csv`` after every combination so
        partial results survive an interrupted run.

        NOTE(review): key 5 is presumably the text-feature group -- confirm
        against the ``group_dic`` of the owning class.

        :param peff_up_threshold: recorded in the ``Peff_up_threshold`` column
        :param peff_down_threshold: recorded in the ``Peff_down_threshold``
            column
        :return: DataFrame with one row per model and combination; empty when
            nothing was trained
        """
        columns_names = [
            'classifier_name', 'score', 'auc', 'train_time',
            'group_list', 'k_fold', 'Peff_up_threshold',
            'Peff_down_threshold'
        ]
        result_frames = []
        all_groups_results = pd.DataFrame()
        for number_of_groups in range(1, 7):
            for groups in combinations(range(0, 6), number_of_groups):
                if 5 not in groups:
                    continue
                features_group = [self.group_dic[group][0] for group in groups]
                self.features = [
                    item for sublist in features_group for item in sublist
                ]
                # The training frame additionally carries 'group_number';
                # self.features itself stays free of it.
                self.X_train = self.featuresDF[
                    self.features + ['group_number']]
                group_names = [self.group_dic[group][1] for group in groups]
                print('{}: Start training with the groups: {} '.format(
                    (time.asctime(time.localtime(time.time()))), group_names))
                logging.info('{}: Start training with the groups: {} '.format(
                    (time.asctime(time.localtime(time.time()))), group_names))
                group_results = self.ModelsIteration()
                print('{}: Finish training with the groups: {}'.format(
                    (time.asctime(time.localtime(time.time()))), group_names))
                logging.info('{}: Finish training with the groups: {}'.format(
                    (time.asctime(time.localtime(time.time()))), group_names))

                for model in group_results:
                    model.append(group_names)
                    model.append(opts.k_fold)
                    model.append(peff_up_threshold)
                    model.append(peff_down_threshold)
                result_frames.append(
                    pd.DataFrame(group_results, columns=columns_names))
                # DataFrame.append was removed in pandas 2.0; concatenating
                # the collected frames gives the same cumulative table.
                all_groups_results = pd.concat(result_frames,
                                               ignore_index=True)
                all_groups_results.to_csv('test_results.csv', encoding='utf-8')

        return all_groups_results
    def __init__(self):
        """Load the raw table and register one feature group per column.

        ``100w2v_scale_2_causality.xlsx`` is read into ``self.original_data``;
        ``self.featuresDF`` and ``self.labels`` start as ``None``.
        ``self.group_dic`` maps the keys 0-16 to ``[[column], column]`` pairs,
        i.e. every group holds exactly one column and is named after it.
        """
        def stamp():
            # Current local time, formatted for the progress messages.
            return time.asctime(time.localtime(time.time()))

        self.X_train = None
        self.features = None
        self.feature_names = None

        print('{}: Loading the data: 100w2v_scale_2_causality'.format(stamp()))
        self.original_data = pd.read_excel('100w2v_scale_2_causality.xlsx')
        self.labels = None
        self.featuresDF = None

        # Integer-labelled text columns: 100 for Word2Vec / 100Doc2Vec
        # (50Doc2Vec would use range(50)). They are not part of group_dic.
        self.text_features = range(100)

        # Position in this tuple is the group key.
        single_column_groups = (
            'submission_author_number_original_subreddit',
            'submission_author_number_recommend_subreddit',
            'submission_created_time_hour',
            'cosine_similarity_subreddits_list',
            'comment_submission_similarity',
            'comment_title_similarity',
            'comment_author_number_original_subreddit',
            'comment_author_number_recommend_subreddit',
            'number_of_references_comment_author',
            'comment_created_time_hour',
            'time_between_messages',
            'comment_len',
            'number_of_r',
            'number_of_references_to_submission',
            'number_of_references_to_recommended_subreddit',
            'subreddits_similarity',
            'treated',
        )
        self.group_dic = {
            key: [[column], column]
            for key, column in enumerate(single_column_groups)
        }

        print('{}: Data loaded '.format(stamp()))
def fillModels(cv, mname, fname, comment=None):
    """Insert a model file and its filesystem metadata into ``models``.

    One row of ten positional values is inserted: model name, raw file
    bytes, comment, uid, login name, gid, access time, modification time,
    change time and permission bits.

    :param cv: cursor object; only ``execute(sql, params)`` is called on it
    :param mname: model name
    :param fname: path of the file to store
    :param comment: free-text comment; a single space is stored when omitted
    :return: None
    """
    import os
    import stat
    import time

    if comment is None:
        comment = " "

    # The file is read client-side rather than with MySQL's LOAD_FILE, which
    # did not work in every deployment. Opening it first also keeps the
    # "cannot open" error as the first failure for a bad path.
    with open(fname, "rb") as load_file:
        file_content = load_file.read()

    # A single stat call gives one consistent snapshot of times and mode.
    info = os.stat(fname)
    atime = time.asctime(time.localtime(info[stat.ST_ATIME]))
    ctime = time.asctime(time.localtime(info[stat.ST_CTIME]))
    mtime = time.asctime(time.localtime(info[stat.ST_MTIME]))

    # Render the permission bits the way Python 2's oct() did ('0644'), so
    # the stored text does not depend on the interpreter version.
    permissions = info[stat.ST_MODE] & 0o777
    mode = '0%o' % permissions if permissions else '0'

    try:
        login = os.getlogin()
    except OSError:
        # No controlling terminal (cron, daemon, ...): store the placeholder.
        login = '******'

    sql = "INSERT INTO models VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    cv.execute(sql, (
        mname,
        file_content,
        comment,
        str(os.getuid()),
        login,
        str(os.getgid()),
        atime,
        mtime,
        ctime,
        mode,
    ))

    return
def fillModels(cv, mname, fname, comment=None):
    """Store a model file together with its filesystem metadata.

    Executes one ``INSERT INTO models`` with ten positional values: model
    name, raw file bytes, comment, uid, login name, gid, access time,
    modification time, change time and permission bits.

    :param cv: cursor object; only ``execute(sql, params)`` is called on it
    :param mname: model name
    :param fname: path of the file to store
    :param comment: free-text comment; a single space is stored when omitted
    :return: None
    """
    import os
    import stat
    import time

    def as_text(seconds):
        # Epoch seconds -> local-time string such as 'Mon Jan  1 00:00:00 2020'.
        return time.asctime(time.localtime(seconds))

    comment = " " if comment is None else comment

    # The content is loaded here instead of through MySQL's LOAD_FILE, which
    # was not usable everywhere. Reading before stat-ing keeps the open error
    # as the first failure for an unreadable path.
    with open(fname, "rb") as load_file:
        file_content = load_file.read()

    # One stat call instead of four: a consistent snapshot, fewer syscalls.
    file_stat = os.stat(fname)

    # Permission bits in Python 2 oct() form ('0644'), independent of the
    # interpreter running this code.
    permission_bits = file_stat[stat.ST_MODE] & 0o777
    mode = '0%o' % permission_bits if permission_bits else '0'

    try:
        login = os.getlogin()
    except OSError:
        # Raised when there is no controlling terminal; keep the placeholder.
        login = '******'

    values = (
        mname,
        file_content,
        comment,
        str(os.getuid()),
        login,
        str(os.getgid()),
        as_text(file_stat[stat.ST_ATIME]),
        as_text(file_stat[stat.ST_MTIME]),
        as_text(file_stat[stat.ST_CTIME]),
        mode,
    )
    cv.execute(
        "INSERT INTO models VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        values)

    return
Exemple #9
0
    def benchmark(self, clf, clf_name='default'):
        """Cross-validate ``clf`` on the stored data and report its scores.

        Predictions come from ``cross_val_predict`` with ``opts.k_fold``
        folds; accuracy and ROC AUC are computed from those predictions
        against ``self.labels``. Progress and results are written to stdout
        and to the log; the confusion matrix only when ``opts.print_cm`` is
        set.

        :param clf: classifier instance handed to ``cross_val_predict``
        :param clf_name: when equal to ``'GaussianNB'``, ``self.X_train`` is
            replaced by ``self.X_train.toarray()`` before cross validation
        :return: ``[clf_descr, score, auc, train_time]`` where ``clf_descr``
            is the text of ``str(clf)`` before its first ``'('`` and
            ``train_time`` is the cross-validation wall time in seconds
        """
        print('_' * 80)
        print('{}: Traininig: {}'.format((time.asctime(time.localtime(time.time()))), clf))
        logging.info('_' * 80)
        logging.info('{}: Traininig: {}'.format((time.asctime(time.localtime(time.time()))), clf))

        # Computed unconditionally: the return value needs it even when the
        # confusion matrix is not printed.
        clf_descr = str(clf).split('(')[0]

        t0 = time.time()
        # Cross validation part
        k = opts.k_fold
        if clf_name == 'GaussianNB':
            self.X_train = self.X_train.toarray()
        predicted = cross_val_predict(clf, self.X_train, self.labels, cv=k)
        score = metrics.accuracy_score(self.labels, predicted)
        train_time = time.time() - t0
        print("cross validation time: {}".format(train_time))
        logging.info("cross validation time: {}".format(train_time))

        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(self.labels, predicted, labels=[-1, 1]))
            logging.info("confusion matrix:")
            logging.info(metrics.confusion_matrix(self.labels, predicted, labels=[-1, 1]))

        # `score` is a single number, so its mean is the score itself and
        # its spread is zero; formatting it directly keeps the message and
        # also works when accuracy_score returns a plain Python float.
        accuracy_line = "Accuracy: {} (+/- {})".format(score, 0.0)
        print(accuracy_line)
        logging.info(accuracy_line)

        auc = metrics.roc_auc_score(self.labels, predicted, average='samples')
        print('AUC: {}'.format(auc))
        logging.info('AUC: {}'.format(auc))

        return [clf_descr, score, auc, train_time]
Exemple #10
0
 def get(self):
     """Write the rendered hello template, a greeting and a demo link.

     The template receives ``timeofday`` (current time string), ``filepath``
     (directory of this module) and ``somevalue`` (1.0).
     """
     template_vars = dict(
         timeofday=time.asctime(),
         filepath=os.path.dirname(__file__),
         somevalue=1.0,
     )
     template = jinja_environment.get_template("templates/hello.html")
     rendered_page = template.render(template_vars)

     # The rendered page goes out first, then the fixed trailer fragments.
     self.response.write(rendered_page)
     for fragment in (
             "Hello world",
             "<br>",
             '<a href="/add?firstNum=23&secondNum=7"> Add 23 and 7 </a>'):
         self.response.write(fragment)
Exemple #11
0
 def get(self):
     """Respond with the hello template followed by static HTML snippets.

     Template variables: ``timeofday`` is the current time string,
     ``filepath`` the directory containing this module, ``somevalue`` 1.0.
     """
     now_text = time.asctime()
     module_dir = os.path.dirname(__file__)
     template_vars = {
         "timeofday": now_text,
         "filepath": module_dir,
         "somevalue": 1.0
     }
     page = jinja_environment.get_template("templates/hello.html")

     # Body parts in the order they are written to the response.
     body_parts = [
         page.render(template_vars),
         "Hello world",
         "<br>",
         '<a href="/add?firstNum=23&secondNum=7"> Add 23 and 7 </a>',
     ]
     for part in body_parts:
         self.response.write(part)
Exemple #12
0
    def create_subreddit_data(self):
        """Concatenate the text of every row per subreddit and pickle it.

        For each row of ``self.all_data`` the ``title``, ``submission_body``
        and ``comment_body`` cells are joined with single spaces and appended
        (without any separator) to ``self.subreddit_dict[subreddit]``. A cell
        that is not a string, or that equals ``'[removed]'`` or
        ``'[deleted]'``, contributes a single space instead. The resulting
        dictionary is written to ``subreddit_dict.pickle`` in the working
        directory.

        :return: None
        """
        placeholder = ' '
        moderation_markers = ('[removed]', '[deleted]')
        text_columns = ('title', 'submission_body', 'comment_body')

        def usable_text(value):
            # Missing cells (non-strings) and moderation markers carry no text.
            if isinstance(value, str) and value not in moderation_markers:
                return value
            return placeholder

        print('{}: Start calculate subreddit dictionary'.format((time.asctime(time.localtime(time.time())))))
        for _, comment in self.all_data.iterrows():
            concat_text = ' '.join(
                usable_text(comment[column]) for column in text_columns)
            subreddit = comment['subreddit']
            if subreddit in self.subreddit_dict:
                self.subreddit_dict[subreddit] = self.subreddit_dict[subreddit] + concat_text
            else:
                self.subreddit_dict[subreddit] = concat_text
        with open('subreddit_dict.pickle', 'wb') as handle:
            pickle.dump(self.subreddit_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('{}: Finish calculate and save subreddit dictionary'.format((time.asctime(time.localtime(time.time())))))
        return
    def topic_model(self):
        """
        Fit an LDA topic model on all comments and score the units with it.

        The term dictionary and the model are built from ``self.all_data``;
        the fitted model is then applied to ``self.units``. Both frames are
        keyed by ``comment_id`` and use the cleaned, whitespace-split
        ``comment_body`` as the document.
        :return: pandas DF with a ``comment_id`` column followed by
                 ``topic_model_0`` ... ``topic_model_<number_of_topics - 1>``,
                 one row per unit, holding the transformer output per topic
        """
        def progress(step):
            # Timestamped progress line on stdout.
            print('{}: {}'.format(time.asctime(time.localtime(time.time())), step))

        def tokenised(frame):
            # comment_id -> token list of the cleaned comment body.
            return dict(
                (record['comment_id'], clean(record['comment_body']).split())
                for _, record in frame.iterrows())

        progress('Clean the data')
        units_clean = tokenised(self.units)
        all_data_clean = tokenised(self.all_data)

        # Every unique term of the full corpus gets an integer id.
        progress('Create the dictionary')
        dictionary = corpora.Dictionary(all_data_clean.values())

        # Bag-of-words form of each document, keyed by comment id.
        progress('Create units term matrix')
        units_term_matrix = dict(
            (comment_id, dictionary.doc2bow(tokens))
            for comment_id, tokens in units_clean.items())
        progress('Create all data term matrix')
        all_data_term_matrix = dict(
            (comment_id, dictionary.doc2bow(tokens))
            for comment_id, tokens in all_data_clean.items())

        progress('Create model')
        lda = ldamodel.LdaTransformer(num_topics=self.number_of_topics, id2word=dictionary, passes=50,
                                      minimum_probability=0)
        progress('Fit the model on all data')
        lda = lda.fit(list(all_data_term_matrix.values()))
        progress('Predict topics for units')
        unit_topics = lda.transform(list(units_term_matrix.values()))

        # Keys and values of units_term_matrix iterate in the same order, so
        # row k of the topic table belongs to the k-th comment id.
        progress('Create final topic model data')
        comment_ids_df = pd.DataFrame(list(units_term_matrix.keys()), columns=['comment_id'])
        topic_columns = []
        for topic_index in range(self.number_of_topics):
            topic_columns.append('topic_model_' + str(topic_index))
        topic_model_result_df = pd.DataFrame(unit_topics, columns=topic_columns)

        progress('Save final topic model data')
        return pd.concat([comment_ids_df, topic_model_result_df], axis=1)
 def split_relevant_data(self):
     """
     Shuffle the rows and split them into opts.k_fold folds.

     ``self.featuresDF`` is replaced by a shuffled copy with a fresh index
     and gains a ``group_number`` column numbered from 1. Every fold
     receives ``floor(rows / opts.k_fold)`` rows; the last fold also takes
     the remainder. ``self.labels`` is set to the label column together
     with ``group_number``.

     Side effect: ``opts.k_fold`` is overwritten with the last group number
     plus one.
     NOTE(review): groups are numbered from 1, so the last group number
     already equals the number of groups -- confirm the ``+ 1`` is what the
     consumers of ``opts.k_fold`` expect.
     :return:
     """
     def report(message):
         # Timestamped progress line on stdout and in the log.
         line = '{}: {}'.format(
             time.asctime(time.localtime(time.time())), message)
         print(line)
         logging.info(line)

     sample_per_group = math.floor(self.featuresDF.shape[0] / opts.k_fold)
     self.featuresDF = self.featuresDF.sample(frac=1).reset_index(drop=True)

     group = 1
     in_group = 0
     group_numbers = []
     for _ in range(self.featuresDF.shape[0]):
         # Open the next fold once the current one is full, except for the
         # last fold, which absorbs whatever remains.
         if in_group >= sample_per_group and group != opts.k_fold:
             report(
                 'finish split samples for group number {} with {} samples'
                 .format(group, in_group))
             group += 1
             report('start split samples for group number {}'.format(group))
             in_group = 0
         # Stored as float: that is the dtype the removed
         # DataFrame.set_value produced when it created this column.
         group_numbers.append(float(group))
         in_group += 1

     # One positional column assignment replaces the per-row set_value calls
     # (removed in pandas 1.0).
     self.featuresDF['group_number'] = group_numbers

     opts.k_fold = group + 1
     # report for the last group
     report('finish split samples for group number {} with {} samples'.format(
         group, in_group))
     self.labels = self.featuresDF[[self.label_column_name, 'group_number']]
     report('Finish split the data')
Exemple #15
0
def index(request):
    """Render ``index.html`` with the addition list for a number.

    A POST request that carries a ``number`` field uses that value. Every
    other request (GET, or POST without the field) falls back to
    ``random_number()``.

    :param request: the incoming request; ``method`` and ``POST`` are read
    :return: the result of ``render`` for ``index.html`` with the context
        keys ``add_list``, ``number`` and ``time``
    """
    times = time.asctime(time.localtime(time.time()))

    number = None
    if request.method == "POST":
        number = request.POST.get("number")
        # Debug trace of the submitted value. The call form works on both
        # Python 2 and Python 3, unlike the bare ``print`` statement.
        print(number)

    if number is None:
        number = random_number()

    add_list = get_list(number)
    return render(request, 'index.html',
                  {'add_list': add_list, 'number': number, 'time': times})
Exemple #16
0
    def save_results(self):
        """Assemble the text of the self-test measurement file.

        Builds a header (title, date, operator, one column name per
        measurement and a units row) and a body in which every value of
        every sample is left-justified to a fixed column width.

        NOTE(review): the part of the method visible here only builds the
        strings; it neither writes them to disk nor returns them.
        """
        padding = 24  # Column width used for every data point

        operator = self.main.framework["Configs"]["config"]["settings"].get(
            "Current_operator", "None")
        header = "SQC self test measurement file \n Date: {} \n Operator: {} \n\n".format(
            time.asctime(), operator)

        # Column order: all "Empty" measurements first, then "TestCard".
        empty_names = list(self.data["Empty"].keys())
        card_names = list(self.data["TestCard"].keys())
        measurements = empty_names + card_names

        # Column titles followed by the units row; measurements without a
        # registered unit are labelled "arb. units".
        unit_cells = ["#".ljust(padding)]
        for name in measurements:
            header += name.ljust(padding)
            unit_cells.append(
                self.data["units"].get(name, "arb. units").ljust(padding))
        header += "\n" + "".join(unit_cells)

        # One row per sample, one column per measurement.
        finalarray = np.ones(shape=(self.samples, len(measurements)))
        columns = [self.data["Empty"][name] for name in empty_names]
        columns += [self.data["TestCard"][name] for name in card_names]
        for column, values in enumerate(columns):
            finalarray[:, column] = values

        filecontent = "\n" + "".join(
            "".join(str(entry).ljust(padding) for entry in row) + "\n"
            for row in finalarray)
    def benchmark(self, model, model_name='default'):
        """Cross-validate ``model`` on ``self.data`` / ``self.labels``.

        Predictions come from ``cross_val_predict`` with 100 folds. The
        confusion matrix (label order ``[-1, 1]``), accuracy and AUC are
        printed to stdout.

        :param model: the estimator handed to ``cross_val_predict``
        :param str model_name: name shown in the progress line
        :return: ``[model_descr, score, auc, train_time]`` where
            ``model_descr`` is ``str(model)`` up to the first ``'('``
        :rtype: list
        """
        print('_' * 80)
        started_at = time.asctime(time.localtime(time.time()))
        print('{}: Traininig: {}'.format(started_at, model_name))
        print(model)

        # Cross validation part
        n_folds = 100
        start = time.time()
        predictions = cross_val_predict(model, self.data, self.labels,
                                        cv=n_folds)
        accuracy = metrics.accuracy_score(self.labels, predictions)
        elapsed = time.time() - start
        print("train and test time: {}".format(elapsed))

        print("confusion matrix:")
        confusion = metrics.confusion_matrix(self.labels, predictions,
                                             labels=[-1, 1])
        print(confusion)

        description = str(model).partition('(')[0]
        # NOTE(review): accuracy is one number for the pooled predictions,
        # so the printed "+/-" spread looks like it is always 0 - confirm.
        spread = accuracy.std() * 2
        print("Accuracy: {} (+/- {})".format(accuracy.mean(), spread))

        area_under_curve = metrics.roc_auc_score(self.labels, predictions,
                                                 average='samples')
        print('AUC: {}'.format(area_under_curve))

        return [description, accuracy, area_under_curve, elapsed]
    def __init__(self):
        """Build Doc2Vec features for every comment and export them to CSV.

        Everything happens at construction time:

        1. Read ``FinalFeatures.xlsx`` and tokenize each ``comment_body``:
           lower-case it, replace punctuation with spaces, split on single
           spaces, then drop English stop words, empty tokens and tokens
           containing a double carriage return.
        2. Separate the comments by ``IsEfficient`` (1 versus anything else)
           and draw 110 positive and 740 negative comments at random; the
           remaining comments of each class are placed before the drawn
           ones when the documents are labelled.
        3. Train a 100-dimensional Doc2Vec model and save it as
           ``d2v100.d2v``.
        4. Fill ``self.data`` (min-max scaled vectors) and ``self.labels``
           (1 for positive, -1 for negative), positives first.
        5. Merge the vectors with the input table on ``comment_id`` and write
           ``100_d2v_scale.csv``.

        :raises ValueError: from ``random.sample`` when there are fewer than
            110 positive or 740 negative comments
        """
        print('{}: Loading the data'.format(
            (time.asctime(time.localtime(time.time())))))
        stop = stopwords.words('english')
        data = pd.read_excel('FinalFeatures.xlsx')

        sentences = []
        true_index = []
        false_index = []
        for comment_index, (_, comment) in enumerate(data.iterrows()):
            text = comment['comment_body'].strip().lower()
            for char in ';:"-,./()':
                text = text.replace(char, ' ')
            sentences.append([
                word for word in text.split(" ")
                if word not in stop and word != '' and '\r\r' not in word
            ])
            # Positions (not DataFrame index labels) of each class.
            if comment['IsEfficient'] == 1:
                true_index.append(comment_index)
            else:
                false_index.append(comment_index)

        # choose random index for test set
        true_test_index = random.sample(true_index, 110)
        false_test_index = random.sample(false_index, 740)

        # create test and train sets
        true_test = [sentences[i] for i in true_test_index]
        false_test = [sentences[i] for i in false_test_index]

        # Sets keep the membership tests below O(1); order still follows
        # true_index / false_index.
        true_test_set = set(true_test_index)
        false_test_set = set(false_test_index)
        true_train_index = [i for i in true_index if i not in true_test_set]
        false_train_index = [
            i for i in false_index if i not in false_test_set
        ]
        true_train = [sentences[i] for i in true_train_index]
        false_train = [sentences[i] for i in false_train_index]

        # The docvecs lookups further down expect the documents to be
        # tagged 'POS_<n>' / 'NEG_<n>' by labelizeComments.
        X_POS = self.labelizeComments(
            list(itertools.chain(true_train, true_test)), 'POS')
        X_NEG = self.labelizeComments(
            list(itertools.chain(false_train, false_test)), 'NEG')

        final_sentences = list(itertools.chain(X_POS, X_NEG))

        print('{}: Start calculating Doc2Vec'.format(
            (time.asctime(time.localtime(time.time())))))
        number_of_features = 100
        model = Doc2Vec(min_count=2,
                        window=10,
                        size=number_of_features,
                        negative=5,
                        workers=7,
                        iter=55)  # documents=final_sentences,
        model.build_vocab(final_sentences)

        print('{}: Start train Doc2Vec'.format(
            (time.asctime(time.localtime(time.time())))))
        # NOTE(review): the keyword names used here (size, iter, word_count)
        # are tied to the gensim version the file was written for - confirm
        # before upgrading gensim.
        for _ in range(50):
            model.train(final_sentences,
                        total_examples=model.corpus_count,
                        word_count=2)

        model.save('d2v100.d2v')

        print('{}: Finish calculating Doc2Vec'.format(
            (time.asctime(time.localtime(time.time())))))
        # Create train numpy
        data_size = len(sentences)
        true_size = len(true_train_index) + len(true_test_index)
        false_size = len(false_train_index) + len(false_test_index)
        self.data = np.zeros((data_size, number_of_features))
        self.labels = np.zeros(data_size)

        # Positive documents occupy the first true_size rows...
        for i in range(true_size):
            self.data[i] = model.docvecs['POS_' + str(i)]
            self.labels[i] = 1

        # ...followed by the negative documents.
        for j in range(false_size):
            row = true_size + j
            self.data[row] = model.docvecs['NEG_' + str(j)]
            self.labels[row] = -1

        print(self.labels)

        # for Non-Negative values - if we want to train Multinumial NB
        min_max_scale = MinMaxScaler()
        self.data = min_max_scale.fit_transform(self.data)

        # NOTE(review): self.data is ordered positives-then-negatives while
        # comments_id follows the spreadsheet's row order, so pairing them
        # by position may attach vectors to the wrong comment_id - confirm.
        comments_id = data['comment_id'].values
        w2v_id = [
            sample.tolist() + [comment_id]
            for sample, comment_id in zip(self.data, comments_id)
        ]

        # range() must be turned into a list before it can be extended;
        # on Python 3 a range object has no append().
        columns = list(range(number_of_features))
        columns.append('comment_id')
        train_vecs_d2vPD = pd.DataFrame.from_records(w2v_id, columns=columns)
        final_features = pd.merge(data, train_vecs_d2vPD, on='comment_id')
        final_features.to_csv('100_d2v_scale.csv', encoding='utf-8')
Exemple #19
0
    def iterateOverFeaturesGroups(self):
        """Train all models on every ordered selection of feature groups.

        For each selection of 1 to 5 groups out of groups 0-4,
        ``self.X_train`` is set to the matching columns of
        ``self.featuresDF`` and ``self.ModelsIteration()`` is run. Each
        result row is extended with the group names and ``opts.k_fold``.
        The accumulated table is written to ``pythonResultsTemp.csv`` after
        every selection and to ``pythonResultsFinal.csv`` at the end.

        NOTE(review): ``itertools.permutations`` yields the same set of
        groups in every possible order; confirm whether combinations were
        intended.
        """
        def timestamp():
            return time.asctime(time.localtime(time.time()))

        columns_names = ['classifier_name', 'score', 'auc', 'train_time', 'group_list', 'k_fold']
        all_groups_results = pd.DataFrame()

        for number_of_groups in range(1, 6):
            for groups in itertools.permutations(range(5), number_of_groups):
                # Flatten the column lists of the chosen groups, in order.
                features = list(itertools.chain.from_iterable(
                    self.group_dic[group][0] for group in groups))
                self.X_train = self.featuresDF[features]
                group_names = [self.group_dic[group][1] for group in groups]

                print('{}: Start training with the groups: {} '.format(timestamp(), group_names))
                logging.info('{}: Start training with the groups: {} '.format(timestamp(), group_names))
                group_results = self.ModelsIteration()
                print('{}: Finish training with the groups: {}'.format(timestamp(), group_names))
                logging.info('{}: Finish training with the groups: {}'.format(timestamp(), group_names))

                # Tag every result row with the groups and fold count used.
                for model in group_results:
                    model.extend([group_names, opts.k_fold])
                group_resultsDF = pd.DataFrame(group_results, columns=columns_names)

                # Persist progress after each selection so that partial
                # results are on disk while the run is still going.
                all_groups_results = all_groups_results.append(group_resultsDF, ignore_index=True)
                all_groups_results.to_csv('pythonResultsTemp.csv', encoding='utf-8')

        all_groups_results.to_csv('pythonResultsFinal.csv', encoding='utf-8')

        return
    def benchmark(self, clf, clf_name='default'):
        """Train and test ``clf`` once per fold and average the metrics.

        For every ``out_group`` in ``range(opts.k_fold)`` the rows of
        ``self.X_train`` whose ``group_number`` equals ``out_group`` form
        the test set and all other rows the training set. Only the columns
        in ``self.features`` are given to the model; the labels are read
        from the ``IsEfficient`` column of ``self.labels``.

        :param clf: the model to train and test (must offer ``fit`` and
            ``predict``)
        :param str clf_name: the name of the model, returned as-is
        :return: ``[clf_name, average_acc, average_auc, train_time]``
        :rtype: list
        :raises ZeroDivisionError: when ``opts.k_fold`` yields no fold
        """
        print('_' * 80)
        print('{}: Traininig: {}'.format(
            (time.asctime(time.localtime(time.time()))), clf))
        logging.info('_' * 80)
        logging.info('{}: Traininig: {}'.format(
            (time.asctime(time.localtime(time.time()))), clf))
        # Cross validation part
        # Only densify containers that actually offer toarray(); the
        # DataFrame indexed with .loc below does not, and calling it
        # unconditionally raised AttributeError for GaussianNB.
        if clf_name == 'GaussianNB' and hasattr(self.X_train, 'toarray'):
            self.X_train = self.X_train.toarray()
        t1 = time.time()
        score = []
        auc = []
        for out_group in range(opts.k_fold):
            t0 = time.time()
            # create train and test data
            in_test = self.X_train['group_number'] == out_group
            in_train = self.X_train['group_number'] != out_group
            test_data = self.X_train.loc[in_test][self.features]
            test_label = self.labels.loc[in_test]['IsEfficient']
            train_data = self.X_train.loc[in_train][self.features]
            train_label = self.labels.loc[in_train]['IsEfficient']

            # train the model
            clf.fit(train_data, train_label)
            predicted = clf.predict(test_data)

            # Each metric is computed once and reused for the log lines.
            fold_acc = metrics.accuracy_score(test_label, predicted)
            fold_auc = metrics.roc_auc_score(test_label, predicted,
                                             average='samples')
            score.append(fold_acc)
            auc.append(fold_auc)

            logging.info("Fold number:")
            logging.info(out_group)
            logging.info("accuracy:")
            logging.info(fold_acc)
            logging.info("AUC:")
            logging.info(fold_auc)
            if opts.print_cm:
                logging.info("confusion matrix:")
                logging.info(
                    metrics.confusion_matrix(test_label,
                                             predicted,
                                             labels=[-1, 1]))
            train_time = time.time() - t0
            logging.info("cross validation time: {}".format(train_time))

        average_acc = sum(score) / len(score)
        print("Average Accuracy: {}".format(average_acc))
        logging.info("Average Accuracy: {}".format(average_acc))

        average_auc = sum(auc) / len(auc)
        print("Average AUC: {}".format(average_auc))
        logging.info('Average AUC: {}'.format(average_auc))

        # Total wall-clock time over all folds.
        train_time = time.time() - t1

        return [clf_name, average_acc, average_auc, train_time]
    def iterateOverFeaturesGroups(self, Peff_up_threshold,
                                  Peff_down_threshold):
        """Stepwise selection of feature groups driven by the best AUC.

        With ``opts.is_backward`` every group starts selected and each
        round tries removing one candidate; otherwise nothing starts
        selected and each round tries adding one candidate. For each
        candidate, ``self.features`` and ``self.X_train`` are set,
        ``self.ModelsIteration()`` is run, and the highest AUC among its
        results is recorded. The candidate with the highest AUC is applied
        when that AUC is at least the current one. All result rows are
        accumulated and written to ``test_results_stepwise.csv`` after
        every candidate.

        :param Peff_up_threshold: stored in every result row and reported
            in the log messages
        :param Peff_down_threshold: stored in every result row and reported
            in the log messages
        :return: the accumulated results of all trained candidates
        :rtype: pandas.DataFrame
        """
        def timestamp():
            return time.asctime(time.localtime(time.time()))

        columns_names = [
            'classifier_name', 'score', 'auc', 'train_time',
            'features_list', 'k_fold', 'Peff_up_threshold',
            'Peff_down_threshold'
        ]
        all_groups_results = pd.DataFrame()
        remaining_features = list(self.group_dic.keys())
        # Backward elimination starts from all groups, forward selection
        # from none.
        selected_features = list(self.group_dic.keys()) if opts.is_backward else []

        current_auc, best_new_auc = 0.0, 0.0
        remain_number_of_candidate = len(remaining_features)
        while (remaining_features and current_auc == best_new_auc
               and remain_number_of_candidate > 0):
            auc_with_candidates = []
            for candidate in remaining_features:
                candidate_columns = self.group_dic[candidate][0]
                candidate_name = self.group_dic[candidate][1]
                features_group = [
                    self.group_dic[group][0] for group in selected_features
                ]
                features_names = [
                    self.group_dic[feature][1] for feature in selected_features
                ]
                if opts.is_backward:  # try the selection without the candidate
                    features_group.remove(candidate_columns)
                    features_names.remove(candidate_name)
                else:  # try the selection plus the candidate
                    features_group.append(candidate_columns)
                    features_names.append(candidate_name)

                flat_features = []
                for sublist in features_group:
                    flat_features.extend(sublist)
                self.features = flat_features
                # X_train also carries the fold id column.
                self.X_train = self.featuresDF[flat_features + ['group_number']]

                print('{}: Start training with the groups: {} '.format(
                    timestamp(), features_names))
                logging.info('{}: Start training with the groups: {} '.format(
                    timestamp(), features_names))
                group_results = self.ModelsIteration()
                best_auc = max(result[2] for result in group_results)
                auc_with_candidates.append((best_auc, candidate))

                print('{}: Finish training with the groups: {}'.format(
                    timestamp(), features_names))
                logging.info('{}: Finish training with the groups: {}'.format(
                    timestamp(), features_names))

                for model in group_results:
                    model.extend([features_names, opts.k_fold,
                                  Peff_up_threshold, Peff_down_threshold])
                group_resultsDF = pd.DataFrame(group_results,
                                               columns=columns_names)
                all_groups_results = all_groups_results.append(
                    group_resultsDF, ignore_index=True)
                all_groups_results.to_csv('test_results_stepwise.csv',
                                          encoding='utf-8')

            # Highest (auc, candidate) pair of this round.
            best_new_auc, best_candidate = sorted(auc_with_candidates)[-1]
            if best_new_auc >= current_auc:
                if opts.is_backward:
                    selected_features.remove(best_candidate)
                else:
                    selected_features.append(best_candidate)
                remaining_features.remove(best_candidate)
                current_auc = best_new_auc
            else:
                no_candidate_message = (
                    '{}: No candidate was chosen for threshold: {} and {}, number of selected features is {}.'
                )
                logging.info(no_candidate_message.format(
                    timestamp(), Peff_down_threshold, Peff_up_threshold,
                    len(selected_features)))
                print(no_candidate_message.format(
                    timestamp(), Peff_down_threshold, Peff_up_threshold,
                    len(selected_features)))

            # one candidate can be chosen, if not- we go forward to the next step.
            remain_number_of_candidate -= 1

        selected_features_names = [
            self.group_dic[feature][1] for feature in selected_features
        ]
        logging.info(
            '{}: Selected features for threshold: {} and {} are: {} and the best AUC is: {}'
            .format(timestamp(), Peff_down_threshold, Peff_up_threshold,
                    selected_features_names, best_new_auc))
        print(
            '{}: Selected features for threshold: {} and {} are: {} and the best AUC is: {}.'
            .format(timestamp(), Peff_down_threshold, Peff_up_threshold,
                    selected_features_names, best_new_auc))

        return all_groups_results
    def benchmark(self, clf, clf_name='default'):
        """
        This function train and test the model (clf) once per fold with CV.

        Fold ids run from 1 to ``opts.k_fold - 1``. For each id the rows of
        ``self.X_train`` whose ``group_number`` equals it form the test set
        and all other rows the training set. Only the columns in
        ``self.features`` are given to the model; the labels are read from
        the ``self.label_column_name`` column of ``self.labels``.

        :param clf: the model to train and test
        :param str clf_name: the name of the model
        :return: clf_name, average_acc, average_auc, train_time of the model
        :rtype: list
        :raises ZeroDivisionError: when ``opts.k_fold`` yields no fold
        """
        print('_' * 80)
        print('{}: Traininig: {}'.format(
            (time.asctime(time.localtime(time.time()))), clf))
        logging.info('_' * 80)
        logging.info('{}: Traininig: {}'.format(
            (time.asctime(time.localtime(time.time()))), clf))
        # Cross validation part
        # Only densify containers that actually offer toarray(); the
        # DataFrame indexed with .loc below does not, and calling it
        # unconditionally raised AttributeError for GaussianNB.
        if clf_name == 'GaussianNB' and hasattr(self.X_train, 'toarray'):
            self.X_train = self.X_train.toarray()
        t1 = time.time()
        score = []
        auc = []
        for out_group in range(1, opts.k_fold):
            t0 = time.time()
            # create train and test data
            in_test = self.X_train['group_number'] == out_group
            in_train = self.X_train['group_number'] != out_group
            test_data = self.X_train.loc[in_test, self.features]
            test_label = self.labels.loc[in_test, self.label_column_name]
            train_data = self.X_train.loc[in_train, self.features]
            train_label = self.labels.loc[in_train, self.label_column_name]

            # train the model
            clf.fit(train_data, train_label)
            predicted = clf.predict(test_data)

            # Each metric is computed once and reused for the log lines.
            fold_acc = metrics.accuracy_score(test_label, predicted)
            fold_auc = metrics.roc_auc_score(test_label, predicted,
                                             average='samples')
            score.append(fold_acc)
            auc.append(fold_auc)

            logging.info("Fold number:")
            logging.info(out_group)
            logging.info("accuracy:")
            logging.info(fold_acc)
            logging.info("AUC:")
            logging.info(fold_auc)
            if opts.print_cm:
                confusion = metrics.confusion_matrix(test_label,
                                                     predicted,
                                                     labels=[0, 1])
                print("confusion matrix:")
                print(confusion)
                logging.info("confusion matrix:")
                logging.info(confusion)
            train_time = time.time() - t0
            logging.info("cross validation time: {}".format(train_time))

        average_acc = sum(score) / len(score)
        print("Average Accuracy: {}".format(average_acc))
        logging.info("Average Accuracy: {}".format(average_acc))

        average_auc = sum(auc) / len(auc)
        print("Average AUC: {}".format(average_auc))
        logging.info('Average AUC: {}'.format(average_auc))

        # Total wall-clock time over all folds.
        train_time = time.time() - t1

        return [clf_name, average_acc, average_auc, train_time]
Exemple #23
0
def main(only_subreddit_similarity=False, only_percent=False):
    """Compute features for every classified reference and export them to CSV.

    Every row of ``create_features.classify_ref`` is extended with the
    computed features.  The accumulated table is written to
    ``Features_with_commnent_time.csv`` after each row and to
    ``FinalFeatures_with_comment_time2.csv`` once all rows are processed.

    :param only_subreddit_similarity: when true, compute only the
        ``subreddits_similarity`` feature.
    :param only_percent: when true (and ``only_subreddit_similarity`` is
        false), compute only ``percent_efficient_references_comment_author``.
    """
    print('{}: Loading the data'.format((time.asctime(time.localtime(time.time())))))
    create_features = CreateFeatures()
    print('{}: Finish loading the data'.format((time.asctime(time.localtime(time.time())))))
    print('data sizes: all data: {}, ref data: {}, classify ref data: {} '.format(create_features.all_data.shape,
                                                                                  create_features.references.shape,
                                                                                  create_features.classify_ref.shape))
    if opts.pickel_not_saved:
        create_features.create_subreddit_data()
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file; only load a trusted pickle.
        with open('subreddit_dict.pickle', 'rb') as handle:
            create_features.subreddit_dict = pickle.load(handle)

    # None until the first row has been processed.  The original keyed this on
    # ``index == 0``, which only works when the frame's first index label is 0.
    all_comments_features = None
    for index, comment in create_features.classify_ref.iterrows():
        if index % 100 == 0:
            print('{}: Finish calculate {} samples'.format((time.asctime(time.localtime(time.time()))), index))
        comment_author = comment['comment_author']
        original_subreddit = comment['subreddit']
        recommend_subreddit = comment['recommend_subreddit']
        if opts.use_date_threshold:  # if we use the data threshold - use the comment time, else use the current time.
            comment_time = comment['comment_created_time']
            submission_time = comment['submission_created_time']
        else:
            comment_time = datetime.utcnow()
            submission_time = datetime.utcnow()

        if only_subreddit_similarity:
            subreddits_similarity = create_features.tfifd_similarity(original_subreddit, recommend_subreddit)
            # Label the value at construction time: Series.rename has no
            # ``columns`` argument, so the previous post-hoc rename never worked.
            new_features = pd.Series([subreddits_similarity], index=['subreddits_similarity'])
        elif only_percent:
            number_of_efficient_references_comment_author = \
                create_features.number_of_efficient_references(comment_author, comment_time)
            number_of_checked_references = create_features.number_of_checked_references(comment_author, comment_time)
            if number_of_checked_references > 0:
                percent_efficient_references_comment_author = (100.0 * number_of_efficient_references_comment_author) / \
                                                              number_of_checked_references
            else:
                percent_efficient_references_comment_author = 0
            new_features = pd.Series([percent_efficient_references_comment_author],
                                     index=['percent_efficient_references_comment_author'])
        else:
            # Calculate similarity between the original and recommended subreddits:
            subreddits_similarity = create_features.tfifd_similarity(original_subreddit, recommend_subreddit)
            # Get comment author features:
            comment_author_number_original_subreddit, comment_author_number_recommend_subreddit, \
                comment_author_subreddit_list = create_features.number_list_of_message(original_subreddit,
                                                                                       recommend_subreddit,
                                                                                       comment_author, comment_time)
            number_of_references_comment_author = create_features.number_of_references(comment_author, comment_time)
            number_of_efficient_references_comment_author = \
                create_features.number_of_efficient_references(comment_author, comment_time, is_efficient=1)
            number_of_inefficient_references_comment_author = \
                create_features.number_of_efficient_references(comment_author, comment_time, is_efficient=-1)
            number_of_checked_references = create_features.number_of_checked_references(comment_author, comment_time)
            if number_of_checked_references > 0:
                percent_efficient_references_comment_author = (100.0 * number_of_efficient_references_comment_author) / \
                                                              number_of_checked_references
            else:
                percent_efficient_references_comment_author = 0
            # Get submission author features:
            submission_author = comment['submission_author']
            submission_author_number_original_subreddit, submission_author_number_recommend_subreddit, \
                submission_author_subreddit_list = create_features.number_list_of_message(original_subreddit,
                                                                                          recommend_subreddit,
                                                                                          submission_author,
                                                                                          submission_time)
            # Similarity between comment and submission authors subreddits lists:
            cosine_similarity_subreddits_list = get_cosine(Counter(comment_author_subreddit_list),
                                                           Counter(submission_author_subreddit_list))
            # Get the hour of the comment and the submission:
            comment_created_time_hour = convert_utc(comment['comment_created_time']).hour
            submission_created_time_hour = convert_utc(comment['submission_created_time']).hour
            # Time between submission and comment, encoded as <whole hours>.<minutes as two decimals>:
            time_to_comment = comment['time_to_comment']
            time_between_messages_hour = math.floor(time_to_comment/3600.0)
            time_between_messages_min = math.floor((time_to_comment - 3600*time_between_messages_hour)/60.0)/100.0
            time_between_messages = time_between_messages_hour + time_between_messages_min
            # Comment features:
            comment_body = comment['comment_body']
            submission_body = comment['submission_body']
            submission_title = comment['title']
            comment_len, number_of_r = number_of_subreddits(comment_body, '/r/')
            # Non-string bodies/titles (e.g. missing values) get a similarity of 0.0.
            if isinstance(submission_body, str) and isinstance(comment_body, str):
                comment_submission_similarity = create_features.tfifd_similarity([comment_body, submission_body])
            else:
                comment_submission_similarity = 0.0
            if isinstance(submission_title, str) and isinstance(comment_body, str):
                comment_title_similarity = create_features.tfifd_similarity([comment_body, submission_title])
            else:
                comment_title_similarity = 0.0
            number_of_references_to_submission = comment['num_comments']
            # subreddit features:
            number_of_references_to_recommended_subreddit = create_features.popular_subreddit(recommend_subreddit,
                                                                                              comment_time)

            features = [comment_author_number_original_subreddit,
                        comment_author_number_recommend_subreddit, percent_efficient_references_comment_author,
                        number_of_references_comment_author, number_of_efficient_references_comment_author,
                        number_of_inefficient_references_comment_author,
                        submission_author_number_original_subreddit, submission_author_number_recommend_subreddit,
                        cosine_similarity_subreddits_list, comment_created_time_hour, submission_created_time_hour,
                        time_between_messages, comment_len, number_of_r, comment_submission_similarity,
                        comment_title_similarity, number_of_references_to_submission,
                        number_of_references_to_recommended_subreddit, subreddits_similarity]
            labels = ('comment_author_number_original_subreddit', 'comment_author_number_recommend_subreddit',
                      'percent_efficient_references_comment_author', 'number_of_references_comment_author',
                      'number_of_efficient_references_comment_author', 'number_of_inefficient_references_comment_author',
                      'submission_author_number_original_subreddit',
                      'submission_author_number_recommend_subreddit', 'cosine_similarity_subreddits_list',
                      'comment_created_time_hour', 'submission_created_time_hour', 'time_between_messages',
                      'comment_len', 'number_of_r', 'comment_submission_similarity', 'comment_title_similarity',
                      'number_of_references_to_submission', 'number_of_references_to_recommended_subreddit',
                      'subreddits_similarity')

            new_features = pd.Series(features, index=labels)

        # Series.append was removed in pandas 2.0; pd.concat is the equivalent call.
        comment_features = pd.concat([comment, new_features])

        if all_comments_features is None:
            all_comments_features = comment_features
        else:
            # Newest sample is prepended, preserving the original column order of the export.
            all_comments_features = pd.concat([comment_features, all_comments_features], axis=1)

        # Checkpoint after every sample so partial results survive an interrupted run.
        all_comments_features.T.to_csv('Features_with_commnent_time.csv', encoding='utf-8')

    if all_comments_features is None:
        # No rows were iterated, so there is nothing to export.
        print('{}: No samples to export'.format((time.asctime(time.localtime(time.time())))))
        return

    # export the data to csv file
    all_comments_features.T.to_csv('FinalFeatures_with_comment_time2.csv', encoding='utf-8')
    def __init__(self, label_column_name):
        """Load the matched-pairs feature CSV and declare the feature groups.

        :param label_column_name: stored on the instance as
            ``self.label_column_name``.
        """
        self.X_train = self.features = self.feature_names = None
        started_at = time.asctime(time.localtime(time.time()))
        print('{}: Loading the data: final_features_causality'.format(started_at))
        self.labels = None
        csv_path = os.path.join(
            features_directory,
            'matches_data_frame_treated_propensity_score_treated_logistic_all_deltas.csv')
        self.featuresDF = pd.read_csv(csv_path)
        self.label_column_name = label_column_name

        # Ordered (feature columns, group name) pairs; the position in this
        # list becomes the integer key of the group in self.group_dict.
        feature_groups = [
            (['commenter_number_submission', 'commenter_number_comment',
              'number_of_comments_in_tree_by_comment_user',
              'commenter_seniority_days'],
             'commenter_features'),
            (['submitter_number_submission', 'submitter_seniority_days',
              'submitter_number_comment',
              'number_of_comments_in_tree_from_submitter',
              'number_of_respond_by_submitter_total',
              'number_of_respond_by_submitter'],
             'submitter_features'),
            (['is_first_comment_in_tree', 'comment_len', 'comment_depth'],
             'comment_features'),
            (['time_ratio', 'time_between_messages',
              'time_until_first_comment',
              'time_between_comment_first_comment'],
             'time_features'),
            (['submission_len', 'title_len'], 'submission_features'),
            (['respond_to_comment_user_responses_ratio',
              'respond_to_comment_user_all_ratio', 'respond_total_ratio'],
             'ratio_features'),
            (['treated'], 'trated'),
            (['nltk_com_sen_pos', 'nltk_com_sen_neg', 'nltk_com_sen_neutral',
              'nltk_sub_sen_pos', 'nltk_sub_sen_neg', 'nltk_sub_sen_neutral',
              'nltk_title_sen_pos', 'nltk_title_sen_neg',
              'nltk_title_sen_neutral', 'nltk_sim_sen'],
             'sentiment features'),
            (['percent_adj'], 'percent_adj'),
            (['submmiter_commenter_tfidf_cos_sim'],
             'submitted_commenter_similarity'),
            # topic_model_0 ... topic_model_14
            (['topic_model_{}'.format(topic) for topic in range(15)],
             'topic_model'),
        ]
        # group_dict format: {index: [features list of this group, group name]}
        self.group_dict = {
            group_id: [columns, group_name]
            for group_id, (columns, group_name) in enumerate(feature_groups)
        }

        finished_at = time.asctime(time.localtime(time.time()))
        print('{}: Data loaded '.format(finished_at))
Exemple #25
0
 def __init__(self, message='', devID=0xFF, tstamp=None):
     """Store a message together with its device ID and timestamp.

     :param message: message text.
     :param devID: device identifier; defaults to 0xFF.
     :param tstamp: timestamp string; when omitted, the current
         ``time.asctime()`` value is taken at construction time.
     """
     # A ``tstamp=time.asctime()`` default is evaluated once, when the
     # function is defined, so every instance would share that stale time.
     if tstamp is None:
         tstamp = time.asctime()
     self.tstamp = tstamp
     self.devID = devID
     self.message = message
Exemple #26
0
def main():
    """Render ``MAIN_HTML`` with the current local time passed as ``dt_obj``."""
    return render_template(
        MAIN_HTML,
        dt_obj=time.asctime(time.localtime(time.time())),
    )
Exemple #27
0
 def _tweet(self, message):
     """Write a timestamped line for *message* to ``self.logger`` and flush it.

     Does nothing when ``self.logger`` is None.
     """
     # ``<>`` is a syntax error on Python 3; identity is the right None check.
     if self.logger is not None:
         import time
         self.logger.write("%s - [ %s ]\n" % (time.asctime(), message))
         self.logger.flush()
Exemple #28
0
    def mavlink_packet(self, m):
        '''Handle one incoming MAVLink packet, dispatching on its type.

        ``m`` is the received message; ``m.get_type()`` selects which of the
        branches below runs.  Most branches delegate to an ``*_update``
        method on ``self``; the waypoint branches drive the download of a
        mission through ``self.wploader``.  Unhandled types are ignored.
        '''
        mtype = m.get_type()

        if mtype == 'HEARTBEAT':
            # NOTE(review): ``time`` is called directly here but used as a
            # module (time.asctime / time.localtime) further down; confirm
            # which binding of ``time`` this file actually imports.
            self.time_at_last_heartbeat = time()

        elif mtype == 'GLOBAL_POSITION_INT':
            # Accept positions from any system when target_system is 0,
            # otherwise only from the configured target system.
            if self.settings.target_system == 0 or self.settings.target_system == m.get_srcSystem(
            ):
                self.gps_update(m)

        elif mtype == 'SCALED_PRESSURE3':
            self.psensor_update(m)

        elif mtype == 'SCALED_PRESSURE':
            self.dsensor_update(m)

        elif mtype == "SYS_STATUS":
            # NOTE(review): because SYS_STATUS is consumed here, the later
            # ``m.get_type() in ['SYS_STATUS']`` fence branch looks unreachable.
            self.battery_update(m)

        elif mtype == 'RC_CHANNELS_RAW':
            self.rc_update(m)

        elif mtype == 'SERVO_OUTPUT_RAW':
            self.servo_update(m)

        elif mtype == 'MAV_STATE_CRITICAL':
            self.mav_state_critical = True

        elif mtype == 'MAV_STATE_EMERGENCY':
            self.mav_state_emergency = True

        elif mtype in ['WAYPOINT_COUNT', 'MISSION_COUNT']:
            # Start of a waypoint download: reset the loader, remember how
            # many items to expect and request them.
            if self.wp_op is None:
                self.console.error("No waypoint load started")
            else:
                self.wploader.clear()
                self.wploader.expected_count = m.count
                self.console.writeln(
                    "Requesting %u waypoints t=%s now=%s" %
                    (m.count, time.asctime(time.localtime(
                        m._timestamp)), time.asctime()))
                self.send_wp_requests()

        elif mtype in ['WAYPOINT', 'MISSION_ITEM'] and self.wp_op is not None:
            # Items whose sequence number is below the loader's count are dropped.
            if m.seq < self.wploader.count():
                # print("DUPLICATE %u" % m.seq)
                return
            # NOTE(review): an out-of-range sequence number is only reported;
            # processing continues and the item is still buffered below.
            if m.seq + 1 > self.wploader.expected_count:
                self.console.writeln(
                    "Unexpected waypoint number %u - expected %u" %
                    (m.seq, self.wploader.count()))
            # Buffer the item, then move every contiguous buffered item
            # (starting at the loader's current count) into the loader.
            self.wp_received[m.seq] = m
            next_seq = self.wploader.count()
            while next_seq in self.wp_received:
                m = self.wp_received.pop(next_seq)
                self.wploader.add(m)
                next_seq += 1
            # Still incomplete: ask for the missing items and wait for more.
            if self.wploader.count() != self.wploader.expected_count:
                # print("m.seq=%u expected_count=%u" % (m.seq, self.wploader.expected_count))
                self.send_wp_requests()
                return
            # Download complete: finish the pending operation.
            if self.wp_op == 'list':
                for i in range(self.wploader.count()):
                    w = self.wploader.wp(i)
                    print(
                        "%u %u %.10f %.10f %f p1=%.1f p2=%.1f p3=%.1f p4=%.1f cur=%u auto=%u"
                        %
                        (w.command, w.frame, w.x, w.y, w.z, w.param1, w.param2,
                         w.param3, w.param4, w.current, w.autocontinue))
                if self.logdir is not None:
                    waytxt = os.path.join(self.logdir, 'way.txt')
                    self.save_waypoints(waytxt)
                    print("Saved waypoints to %s" % waytxt)
            elif self.wp_op == "save":
                self.save_waypoints(self.wp_save_filename)
            # Reset the download state for the next operation.
            self.wp_op = None
            self.wp_requested = {}
            self.wp_received = {}

        elif mtype in ["WAYPOINT_REQUEST", "MISSION_REQUEST"]:
            self.process_waypoint_request(m, self.master)

        elif mtype in ["WAYPOINT_CURRENT", "MISSION_CURRENT"]:
            # Announce the waypoint only when it changes and wpupdates is set.
            if m.seq != self.last_waypoint:
                self.last_waypoint = m.seq
                if self.settings.wpupdates:
                    self.say("waypoint %u" % m.seq, priority='message')

        elif mtype == "MISSION_ITEM_REACHED":
            wp = self.module('wp').wploader.wp(m.seq)
            if wp is None:
                # should we spit out a warning?!
                # self.say("No waypoints")
                self.next_wp = None
                pass
            else:
                # Warn when passing DO_LAND_START with an ALT_OFFSET above 0.005.
                if wp.command == mavutil.mavlink.MAV_CMD_DO_LAND_START:
                    alt_offset = self.get_mav_param('ALT_OFFSET', 0)
                    if alt_offset > 0.005:
                        self.say(
                            "ALT OFFSET IS NOT ZERO passing DO_LAND_START")
                self.next_wp = wp

        elif m.get_type() == "FENCE_STATUS":
            self.module('fence').last_fence_breach = m.breach_time
            self.module('fence').last_fence_status = m.breach_status
        elif m.get_type() in ['SYS_STATUS']:
            # NOTE(review): appears unreachable, since SYS_STATUS is already
            # handled by the battery branch above.
            bits = mavutil.mavlink.MAV_SYS_STATUS_GEOFENCE

            # Each flag is true only when all geofence bits are set in the field.
            present = ((m.onboard_control_sensors_present & bits) == bits)
            if self.module('fence').present is False and present is True:
                self.say("fence present")
            elif self.module('fence').present is True and present is False:
                self.say("fence removed")
            # NOTE(review): stored on ``self`` while ``enabled`` and ``healthy``
            # below are stored on the fence module; confirm this is intended.
            self.present = present

            enabled = ((m.onboard_control_sensors_enabled & bits) == bits)
            if self.module('fence').enabled is False and enabled is True:
                self.say("fence enabled")
            elif self.module('fence').enabled is True and enabled is False:
                self.say("fence disabled")
            self.module('fence').enabled = enabled

            healthy = ((m.onboard_control_sensors_health & bits) == bits)
            if self.module('fence').healthy is False and healthy is True:
                self.say("fence OK")
            elif self.module('fence').healthy is True and healthy is False:
                self.say("fence breach")
            self.module('fence').healthy = healthy

            # console output for fence:
            # grey = disabled, green = enabled and healthy, red = enabled and unhealthy.
            # NOTE(review): the grey case writes to the fence module's console,
            # the other two to self.console.
            if self.module('fence').enabled is False:
                self.module('fence').console.set_status('Fence',
                                                        'FEN',
                                                        row=0,
                                                        fg='grey')
            elif self.module('fence').enabled is True and self.module(
                    'fence').healthy is True:
                self.console.set_status('Fence', 'FEN', row=0, fg='green')
            elif self.module('fence').enabled is True and self.module(
                    'fence').healthy is False:
                self.console.set_status('Fence', 'FEN', row=0, fg='red')

            return
Exemple #29
0
def command_log(*args):
    """Report recent resource accesses found in the tail of the log file.

    The last 100 lines of ``logfile_path`` are parsed as syslog lines; for
    every message containing ``requests <IPv4>`` the most recent timestamp
    per address is kept.

    :param args: up to two optional positionals: ``ip`` (substring an address
        must contain; default matches all) and ``max_ago`` (maximum age of an
        access in seconds; default unlimited).
    :returns: ``EXIT_FAILURE`` when at least one matching address was accessed
        within the period (the accesses are printed), else ``EXIT_SUCCESS``.
    :raises Exception: on more than two arguments, or when a log timestamp is
        still in the future after assuming the previous year.
    """
    if len(args) > 2:
        raise Exception(
            "Too many arguments: [ip] [time period in s] (optional parameter)")
    ip = ''
    max_ago = float('inf')
    if len(args) >= 1:
        ip = args[0]
    if len(args) == 2:
        max_ago = float(args[1])

    from pyparsing import Word, alphas, Suppress, Combine, nums, string, Optional, Regex, ParseException
    # define line in (sys)log
    # ascii_uppercase/ascii_lowercase exist on Python 2 and 3 (uppercase/lowercase are Python 2 only).
    month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
    integer = Word(nums)
    serverDateTime = Combine(month + " " + integer + " " + integer + ":" +
                             integer + ":" + integer)
    hostname = Word(alphas + nums + "_" + "-")
    daemon = Word(alphas + nums + "/" + "-" +
                  "_") + Optional(Suppress("[") + integer +
                                  Suppress("]")) + Suppress(":")
    message = Regex(".*")
    bnf = serverDateTime + hostname + daemon + message

    from collections import deque
    import re
    import time

    # Raw string with escaped dots so that only dotted quads match.
    request_re = re.compile(r'requests (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')
    tail_n = 100
    # Read the tail inside a context manager so the file handle is closed.
    with open(logfile_path) as logfile:
        tail = deque(logfile, tail_n)

    cur = time.localtime()
    last_access = {}
    for line in tail:
        try:
            fields = bnf.parseString(line)
        except ParseException:
            continue
        m = request_re.search(fields[-1])
        if not m:
            continue
        # Syslog timestamps carry no year: guess the current one first ...
        st = time.strptime(fields[0] + " %s" % cur.tm_year,
                           "%b %d %H:%M:%S %Y")
        if st > cur:  # ... and fall back to the previous year if that is in the future.
            st = time.strptime(fields[0] + " %s" % (cur.tm_year - 1),
                               "%b %d %H:%M:%S %Y")
        if st > cur:
            raise Exception("HMF logfile seems too old!?!")
        last_access[m.group(1)] = st

    # Decide on the addresses that match the filter only; previously an
    # in-period access by any other address produced EXIT_FAILURE with an
    # empty report.
    now = time.mktime(cur)
    recent = []
    for resource, timestamp in last_access.items():
        age = now - time.mktime(timestamp)
        if ip in resource and age <= max_ago:
            recent.append((resource, timestamp, age))

    if recent:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Previous accesses:")
        for resource, timestamp, age in recent:
            print("\t%s was accessed on %s (%.1fs ago)" % (
                resource, time.asctime(timestamp), age))
        return EXIT_FAILURE
    return EXIT_SUCCESS
def main():
    """Compute the per-unit features and export them to ``features_CMV.csv``.

    Frame-wide columns (topic model merge and text lengths) are added first;
    the remaining features are then filled in row by row.  Rows for which the
    treatment lookup returns -1 are counted and left without the per-row
    features.  The transposed frame is written under ``data_directory``.
    """
    def timestamp():
        # Current local time formatted for the progress messages.
        return time.asctime(time.localtime(time.time()))

    def store(column, value):
        # Write one feature value for the row currently being processed.
        create_features.units.loc[index, column] = value

    def store_scores(columns, scores):
        # Read the three scores first, then write them in column order.
        first, second, third = scores[0], scores[1], scores[2]
        for column, value in zip(columns, (first, second, third)):
            store(column, value)

    topics_number = 15
    print('{}: Loading the data'.format(timestamp()))
    create_features = CreateFeatures(topics_number)
    print('{}: Finish loading the data'.format(timestamp()))
    print('data sizes: all data: {}, units data: {}'.format(create_features.all_data.shape,
                                                            create_features.units.shape))

    # Features calculated for the whole data frame at once:
    topics = create_features.topic_model()
    create_features.units = create_features.units.merge(topics, on='comment_id')
    for length_column, text_column in (('comment_len', 'comment_body'),
                                       ('submission_len', 'submission_body'),
                                       ('title_len', 'submission_title')):
        create_features.units[length_column] = create_features.units[text_column].str.len()

    processed = 0  # rows that received the full feature set
    number_of_treatment_minus_1 = 0
    for index, comment in create_features.units.iterrows():
        if processed % 100 == 0:
            print('{}: Finish calculate {} samples'.format(timestamp(), processed))
        comment_author = copy(comment['comment_author'])
        comment_time = copy(comment['comment_created_utc'])
        submission_time = copy(comment['submission_created_utc'])
        submission_id = copy(comment['submission_id'])
        submission_num_comments = copy(comment['submission_num_comments'])
        comment_body = copy(comment['comment_body'])
        submission_body = copy(comment['submission_body'])
        title = copy(comment['submission_title'])

        # Treatment: a result of -1 means the row is skipped.
        is_quote = create_features.loop_over_comment_for_quote(comment, comment_body)
        if is_quote == -1:
            print('{}: treatment = -1'.format(timestamp()))
            number_of_treatment_minus_1 += 1
            continue
        store('treated', is_quote)

        # Comment author features:
        store('commenter_number_submission',
              create_features.number_of_message(comment_author, comment_time, 'submission'))
        store('commenter_number_comment',
              create_features.number_of_message(comment_author, comment_time, 'comment'))
        store('commenter_seniority_days',
              create_features.calculate_user_seniority(comment_author))

        # Submission author features:
        submission_author = comment['submission_author']
        store('submitter_number_submission',
              create_features.number_of_message(submission_author, comment_time, 'submission'))
        store('submitter_number_comment',
              create_features.number_of_message(submission_author, comment_time, 'comment'))
        store('submitter_seniority_days',
              create_features.calculate_user_seniority(submission_author))
        is_first_in_tree, comments_by_commenter, _, _ = \
            create_features.comment_in_tree(comment_author, comment_time, submission_id)
        store('is_first_comment_in_tree', is_first_in_tree)
        store('number_of_comments_in_tree_by_comment_user', comments_by_commenter)

        # Time between submission and comment, encoded as
        # <whole hours>.<minutes as two decimals>, and its ratio to the
        # time until the first comment:
        time_to_comment = comment['time_between']
        whole_hours = math.floor(time_to_comment/3600.0)
        minutes_fraction = math.floor((time_to_comment - 3600*whole_hours)/60.0)/100.0
        store('time_between_messages', whole_hours + minutes_fraction)
        time_until_first_comment, time_between_comment_first_comment = \
            create_features.time_to_first_comment(submission_id, submission_time, comment_time)
        store('time_ratio',
              time_until_first_comment/time_to_comment if time_to_comment > 0 else 0)
        store('time_until_first_comment', time_until_first_comment)
        store('time_between_comment_first_comment', time_between_comment_first_comment)

        # Numbers of comments by the submitter:
        _, comments_from_submitter, number_of_respond_by_submitter, number_of_respond_by_submitter_total = \
            create_features.comment_in_tree(submission_author, comment_time, submission_id, comment_author, True)
        store('number_of_comments_in_tree_from_submitter', comments_from_submitter)
        store('number_of_respond_by_submitter', number_of_respond_by_submitter)
        store('number_of_respond_by_submitter_total', number_of_respond_by_submitter_total)

        # Ratios of comment counts (0 whenever the denominator is 0):
        has_comments = submission_num_comments != 0
        store('respond_to_comment_user_all_ratio',
              number_of_respond_by_submitter / submission_num_comments if has_comments else 0)
        store('respond_total_ratio',
              number_of_respond_by_submitter_total / submission_num_comments if has_comments else 0)
        has_responses = number_of_respond_by_submitter_total != 0
        store('respond_to_comment_user_responses_ratio',
              number_of_respond_by_submitter / number_of_respond_by_submitter_total if has_responses else 0)

        # Sentiment analysis of the comment, the submission and the title:
        print('{}: Sentiment analysis'.format(timestamp()))
        comment_scores = sentiment_analysis(comment_body)
        store_scores(('nltk_com_sen_pos', 'nltk_com_sen_neg', 'nltk_com_sen_neutral'), comment_scores)
        submission_scores = sentiment_analysis(submission_body)
        store_scores(('nltk_sub_sen_pos', 'nltk_sub_sen_neg', 'nltk_sub_sen_neutral'), submission_scores)
        title_scores = sentiment_analysis(title)
        store_scores(('nltk_title_sen_pos', 'nltk_title_sen_neg', 'nltk_title_sen_neutral'), title_scores)
        # Cosine similarity between the submission and comment sentiment vectors:
        submission_vector = np.array(submission_scores).reshape(1, -1)
        comment_vector = np.array(comment_scores).reshape(1, -1)
        store('nltk_sim_sen', cosine_similarity(submission_vector, comment_vector)[0][0])

        # Percent of adjectives in the comment:
        store('percent_adj', percent_of_adj(comment_body))

        processed += 1

    # Export the (transposed) data to a csv file
    output_path = os.path.join(data_directory, 'features_CMV.csv')
    create_features.units.T.to_csv(output_path, encoding='utf-8')
    print('number_of_treatment_minus_1: ', number_of_treatment_minus_1)
Exemple #31
0
def RunJob(job, joblist, source):
    """Run ``job.cmd`` in ``job.workdir`` and append its output to the shared log.

    The child's stdout/stderr are captured in a uniquely named scratch file
    inside ``job.workdir``.  While the child runs, ``joblist.killMe()`` is
    polled roughly every ten seconds; a kill request (or any error raised
    while polling) terminates the child's process tree with ``taskkill``.
    When the child is done, the scratch file is appended to
    ``LOGGERNAME`` under ``LOGGERLOCK`` and then removed on a best-effort
    basis.

    Args:
        job: object providing ``workdir``, ``env`` (mapping of environment
            overrides; the key ``PATH_PREFIX`` is prepended to ``PATH``
            instead of being set) and ``cmd`` (shell command line).
        joblist: object providing ``killMe()``.
        source: unused by this function; kept for interface compatibility.

    Returns:
        ``(3, None)`` if the scratch log could not be created, otherwise
        ``(rtncode, logname)`` where ``rtncode`` is the child's exit code,
        or 0 when the job was killed, and ``logname`` is the scratch log
        path (the file itself has normally been deleted by then).
    """
    # Open the scratch logfile that captures the child's output.
    try:
        logname = os.path.join(job.workdir,
            "dispatch-"+str(random.randint(100000000,999999999))+".log")
        log = open(logname, "w")
    except Exception:
        # Couldn't build the path, or couldn't open the logfile.  Die.
        return 3, None

    # Set up the environment
    envvars = os.environ.copy()
    for key, val in job.env.items():
        if key == "PATH_PREFIX":
            envvars['PATH'] = val + envvars['PATH']
        else:
            envvars[key] = val
    print("\n#   Environment PATH: %s" % envvars['PATH'])

    # Spawn the process
    try:
        child = subprocess.Popen(job.cmd, cwd=job.workdir, env=envvars,
            shell=True, stdout=log, stderr=log)
    except Exception:
        # Don't leak the scratch log handle if the child cannot be started.
        log.close()
        raise

    kill_cmd = "taskkill /F /T /PID %i" % child.pid
    wait = 0
    rtncode = None

    while rtncode is None:
        try:
            time.sleep(1)
            rtncode = child.poll()

            # Check for kill request (on the first pass, then every 10th)
            if wait % 10 == 0:
                wait = 0

                if joblist.killMe() == True:
                    print("Got Kill Request")
                    subprocess.Popen(kill_cmd, shell=True)
                    rtncode = 0
                    break
            wait += 1
        except:
            # Deliberately broad: whatever goes wrong while polling, the
            # child must not be left running unattended.
            print("Lost connection:  Killing job!")
            subprocess.Popen(kill_cmd, shell=True)
            rtncode = 0
            break

    # Done!  Close things out.
    log.close()

    # Using a threadsafe lock so that multiple threads can append output
    # to the logfile without tripping on each other.  The try/finally
    # guarantees the lock is released even if the merge below fails, so one
    # failing job cannot block every other worker forever.
    LOGGERLOCK.acquire()
    try:
        # Concatenate logfiles.  The with-blocks close both handles before
        # the delete below, which Windows requires.
        with open(logname, "r") as dispatch_log:
            with open(os.path.join(job.workdir, LOGGERNAME), "a") as logfile:
                for line in dispatch_log:
                    logfile.write(line)
                logfile.write("======= Finished "+time.asctime()+" ==============================\n")

        try:
            os.remove(logname)
        except OSError:
            pass  # sometimes Windows doesn't release the logfile... :-(
    finally:
        LOGGERLOCK.release()

    return rtncode, logname
    def diagbtn2_fun(self):
        """Classify the records in the selected file and log the results.

        Reads the file named in ``self.le_url``, turns it into feature rows
        with ``self.extract_data``, predicts a label per row with
        ``svm_breast`` and then, for every row, writes the nine feature
        values and the verdict to a log file under ``logs/`` while echoing
        the verdict to ``self.restb``.  A prediction equal to ``'4'`` is
        reported as malignant, anything else as benign.  Finally the log
        location is stored in ``self.logurl`` and ``self.logbtn`` is shown.
        """
        # Read the whole input; the context manager closes it even on error.
        with open(self.le_url.text(), 'r') as input_file:
            text1 = input_file.read()

        features = self.extract_data(text1)

        self.pb.show()
        self.pb.setValue(0)

        self.completed = 0

        res = svm_breast.predict(features)

        # Cosmetic progress animation; the prediction above is already done.
        while self.completed <= 100:

            self.completed += 0.0075
            self.pb.setValue(self.completed)

        self.pb.hide()

        self.restb.setText("")

        localtime = time.asctime(time.localtime(time.time()))
        doctor = self.le1.text()

        url = "logs/" + doctor + localtime + ".txt"

        # Feature labels in column order of a feature row.
        feature_labels = (
            "Clump Thickness",
            "Uniformity of Cell Size",
            "Uniformity of Cell Shape",
            "Marginal Adhesion",
            "Single Epithelial Cell Size",
            "Bare Nuclei",
            "Bland Chromatin",
            "Normal Nucleoli",
            "Mitoses",
        )

        # The with-block closes (and therefore flushes) the log, so it is
        # complete on disk before the log button is shown below.
        with open(url, 'w+') as log_file:

            log_file.write("---------Diagnosed By :Dr." + doctor + " On " +
                           localtime + "----------------\n")

            for record_id, ans in enumerate(res, start=1):

                row = features[record_id - 1]

                log_file.write("\nId:" + str(record_id))
                for column, label in enumerate(feature_labels):
                    log_file.write("\n" + label + ": " + str(row[column]))

                if ans == '4':
                    self.restb.append("Id:   " + str(record_id) +
                                      "\t\t Result: Malignant ")
                    log_file.write("\nResult: Malignant\n")

                else:

                    self.restb.append("Id:   " + str(record_id) +
                                      "\t\t Result: Benign ")
                    log_file.write("\nResult: Benign\n")

        self.restb.append(
            "-----------------------------------------------------------------------------------------------------"
        )
        self.restb.append("Diagnosed By: Dr. " + doctor)

        url1 = " '//home//karan//Desktop//Breast Cancer Detection Project//UI//logs//"

        self.logurl = url1 + doctor + localtime + ".txt' "

        self.logbtn.show()