def __init__(self, data_file):
    """Create the helper objects and empty result containers.

    Args:
        data_file: Stored on the instance and also handed to ``Processor``
            under the ``'training_file'`` settings key.
    """
    self.utilities = Utilities()
    self.data_file = data_file

    # The same file is used as the processor's training file.
    processor_settings = {'training_file': data_file}
    self.processor = Processor(processor_settings)
    self.segmenter = self.processor.load_segmenter()

    # Filled in later; start out empty.
    self.segments, self.aspects, self.sentiments = [], [], []
def __init__(self, params=None, debug=0):
    """Store the run parameters and create the helper objects.

    Args:
        params: Mapping of parameters. Defaults to a fresh empty dict for
            each instance (a ``{}`` default would be shared between all
            instances created without arguments).
        debug: Debug level/flag stored on ``self.debug``.
    """
    # Build a new dict per call instead of sharing one default object.
    self.parameters = {} if params is None else params
    # Honour the caller's value; it was previously discarded in favour of 0.
    self.debug = debug
    self.num_params = len(self.parameters)
    self.info = {}
    self.results = {}
    self.logger = Logging()
    self.processor = Processor()
def my_api():
    """Run ``Processor`` on the request's query arguments and return JSON.

    The elapsed processing time (seconds) is printed to stdout.

    Returns:
        The processing result serialised with ``json.dumps``.
    """
    query_args = dict(request.args)

    started_at = time.time()
    result = Processor(query_args).process()
    print(time.time() - started_at)

    return json.dumps(result)
def process_hurricanes(fn, prefix, Basins, year_pairs, year_start_TC):
    """Pair before/after floats for every hurricane in each year range.

    Args:
        fn: Path of the hurricane track CSV, read with ``pd.read_csv``.
        prefix: String prepended to ``Argo_data_aggr_{sy}_{ey}.mat`` to build
            the Argo data file name for each year pair.
        Basins: Not referenced anywhere in this function body.
        year_pairs: Iterable of ``(sy, ey)`` pairs; hurricanes whose
            ``SEASON`` lies in ``[sy, ey]`` are processed against that
            pair's Argo file.
        year_start_TC: Lower ``SEASON`` bound for the initial hurricane
            selection (that selection is overwritten inside the loop).

    Returns:
        DataFrame concatenated from all per-hurricane pair frames, sorted by
        ``before_t`` descending, de-duplicated on ``after_t``, with the extra
        columns ``profile_dt``, ``hurricane_dt`` and ``signed_angle``.

    NOTE(review): reads the name ``case2use`` from an enclosing scope that is
    not visible here. ``pd.concat`` is presumably going to raise when no pair
    frame was collected -- confirm callers can never hit that case.
    """
    # Read hurricane track data
    dataframe = pd.read_csv(fn)
    # Drop the 'Unnamed: 0' column (raises KeyError if it is absent).
    del dataframe['Unnamed: 0']

    # Unique, sorted hurricane IDs with SEASON >= year_start_TC.
    # NOTE(review): `hurricanes` and `n` are reassigned inside the loop
    # below before being read, so this first selection has no visible use.
    hurricanes = list(set(np.array(dataframe[
        dataframe['SEASON'] >= year_start_TC
    ]['ID'])))
    hurricanes.sort()
    n = len(hurricanes)

    df_lst = []
    for sy, ey in year_pairs:
        # Open the Argo file for this year range with the reader chosen by
        # `case2use`.
        # NOTE(review): the h5py handle is never closed in this function.
        if case2use == 'noML':
            f = h5py.File(prefix + f'Argo_data_aggr_{sy}_{ey}.mat', 'r')
        else:
            f = scipy.io.loadmat(prefix + f'Argo_data_aggr_{sy}_{ey}.mat')

        # Unique, sorted hurricane IDs with sy <= SEASON <= ey.
        hurricanes = list(set(np.array(dataframe[
            (dataframe['SEASON'] >= sy)
            & (dataframe['SEASON'] <= ey)
        ]['ID'])))
        hurricanes.sort()
        n = len(hurricanes)
        print(f'Processing years {sy} - {ey}...')

        # For each hurricane, build the before floats, then the after
        # floats, then the paired frame, using Processor.
        for idx, h_id in enumerate(hurricanes):
            hurricane_df = dataframe[dataframe['ID'] == h_id]
            # First-row values; name and season are only used for the
            # progress message, `num` is not used afterwards.
            name = np.array(hurricane_df['NAME'])[0]
            season = np.array(hurricane_df['SEASON'])[0]
            num = np.array(hurricane_df['NUM'])[0]
            print(f'Processing {idx+1} of {n}: {name} of {season} ({h_id}).')
            P = Processor(hurricane_df, f)
            P.generate_before_floats()
            # Nothing to pair when no before floats were found.
            if P.float_df.shape[0] == 0:
                print('No before floats')
                continue
            P.add_after_floats()
            pair_df = P.create_pair_df()
            if pair_df is not None:
                # Tag every pair row with the hurricane it belongs to.
                df_lst.append(pair_df.assign(HurricaneID=h_id))

    # Sorting by before_t descending first means drop_duplicates keeps, for
    # each after_t, the row with the largest before_t.
    df = pd.concat(df_lst
                   ).sort_values('before_t', ascending=False
                   ).drop_duplicates('after_t').reset_index(drop=True)
    df['profile_dt'] = df['after_t'] - df['before_t']
    df['hurricane_dt'] = df['after_t'] - df['proj_t']
    df = df.assign(signed_angle=lambda r: - r.sign * r.angle)
    return df
def update_data(entity, level, skiplvl, times, runtimes):
    """Apply successive ``Processor`` modifications to stored documents.

    Fetches the documents of ``entity`` at ``level`` (page/offset
    ``runtimes``), pushes ``times`` consecutive ``Processor.next()`` results
    to each via ``update_doc``, then recurses with ``runtimes + 1``.

    Args:
        entity: Entity name forwarded to the storage helpers.
        level: Document version level to start from.
        skiplvl: Forwarded unchanged to ``Processor``.
        times: Number of versions to apply; ``0`` returns immediately.
        runtimes: Passed to ``get_doc_version`` as ``time``.

    Returns:
        ``None`` when ``times`` is 0, otherwise the result of the recursive
        call.

    NOTE(review): when ``docs`` is empty and ``times`` is non-zero there is
    no visible base case for the final recursive call -- confirm that
    ``get_doc_version`` / the callers guarantee termination.
    """
    docs = get_doc_version(entity, level, time=runtimes)

    # Value comparison, not identity: `is 0` only works by accident of
    # CPython's small-int cache and emits a SyntaxWarning on Python 3.8+.
    if docs.count() == 0 and times != 0:
        # `skiplvl` was missing from this call, which made it raise
        # TypeError (4 positional arguments for 5 parameters).
        update_data(entity, level + 1, skiplvl, times - 1, 0)

    if times == 0:
        return None

    for doc in docs:
        p = Processor(entity=entity, data=doc, level=level, skiplvl=skiplvl)
        # NOTE(review): unreachable after the early return above; kept
        # because it hints that 0 was meant to mean "up to the last version".
        if times == 0:
            times = p.doc_last_version
        for _ in range(level, level + times):
            mod = p.next()
            update_doc(entity, doc["_id"], mod)
        click.echo('End: <{}> {}'.format(entity, doc['_id']))

    return update_data(entity, level, skiplvl, times, runtimes + 1)
def run_experiment(self, dataset_initial):
    """Run one ``Processor`` pass for every entry of ``self.random_states``.

    For each random state the train, test and output CSV paths are built as
    ``<storage_path><dataset_initial>_<kind>_<random_state>.csv``.

    Args:
        dataset_initial: Dataset tag used in the file names.
    """
    for random_state in self.random_states:
        seed_tag = str(random_state)
        train_csv = self.storage_path + dataset_initial + '_train_' + seed_tag + '.csv'
        test_csv = self.storage_path + dataset_initial + '_test_' + seed_tag + '.csv'
        output_csv = self.storage_path + dataset_initial + '_output_' + seed_tag + '.csv'

        settings = {
            'training_file': train_csv,
            'data_file': test_csv,
            # Options: 0 to any integer | default: None (all)
            'max_reviews': None,
            'output_file': output_csv,
        }
        Processor(settings=settings).run()
def main():
    """Build the word database and copy it to the parent directory.

    Steps:
      1. If any of the three ``table_*.txt`` files is missing from this
         script's directory, run ``Processor`` for grade levels 1-8
         (the original author notes this takes around 3 minutes).
      2. Create and populate the tables through ``Creator``.
      3. Copy ``init.db`` to ``words.db`` one directory up.
    """
    # TODO: add additional sources
    current_path = os.path.dirname(os.path.abspath(__file__))
    above_path = os.path.dirname(current_path)

    # Only run the processor when its output tables are not all present.
    required_tables = (
        "table_StartWords.txt",
        "table_SynWords.txt",
        "table_AntWords.txt",
    )
    present = os.listdir(current_path)
    if not all(table in present for table in required_tables):
        for grade_level in range(1, 9):
            p = Processor(grade_level, current_path)
            print("Processing grade level " + str(grade_level))
            p.start()
            p.end()

    # Now add everything to the database.
    db_create = Creator(current_path)
    print("Creating tables...")
    db_create.create_tables()
    print("Populating tables...")
    db_create.populate()
    db_create.end()

    # Copy the database file to the main directory of the project.
    source = current_path + "/init.db"
    destination = above_path + "/words.db"
    print("Moving database to main project directory.")
    copyfile(source, destination)

    print("\nSetup is complete.\n")
# Directory to save videos to. ROOT_DIR = os.getcwd() SAVE_DIR = f'{os.getcwd()}/videos' # Create the server. app = Flask(__name__) api = Api(app) valid_headers = ['Content-Type', 'Access-Control-Allow-Origin', '*'] cors = CORS(app, allow_headers=valid_headers) # Connect to the database. mongo = MongoDatabase() # Video processing. processor = Processor() # Do a little server-side checking. ALLOWED_EXTENSIONS = set(['webm', 'mp4', 'mp3', 'wav', 'jpeg', 'gif', 'png']) # ------------------------------------------------------------- # Global functions. def allowed_file(filename): '''Ensure we want to keep this file.''' return True def validate_filepath(func): '''Decorator to validate a filepath from the frontend.'''
class Evaluator:
    """Evaluate the segmenter and the aspect/sentiment classifiers of a
    ``Processor`` trained on ``data_file``."""

    def __init__(self, data_file):
        """Create the helpers and load the gold segments/aspects/sentiments.

        Args:
            data_file: File used both as the processor's training file and
                as the source of the gold data.
        """
        self.utilities = Utilities()
        self.data_file = data_file
        self.processor = Processor({'training_file': data_file})
        self.segmenter = self.processor.load_segmenter()
        self.stanford = Stanford()
        self.segments = []
        self.aspects = []
        self.sentiments = []
        self.prepare_aspect_sentiment_data()

    def calculate_evaluatio_matrices(self, labels, result):
        """Summarise binary predictions.

        Args:
            labels: Gold labels; entries equal to 1 / 0 are counted as
                positives / negatives.
            result: Predicted labels, aligned with ``labels``.

        Returns:
            Dict with the integer counts ``'positives'`` and ``'negatives'``
            and the 3-decimal strings ``'accuracy'`` and ``'f1_score'``.
        """
        # Local import: the 'f1_score' entry used to be filled with
        # recall_score, so the real metric has to be brought into scope.
        from sklearn.metrics import f1_score

        positives = 0
        negatives = 0
        for label in labels:
            if label == 1:
                positives += 1
            elif label == 0:
                negatives += 1

        return {
            'positives': positives,
            'negatives': negatives,
            'accuracy': "%.3f" % accuracy_score(labels, result),
            'f1_score': "%.3f" % f1_score(labels, result),
        }

    def evaluate_segmentation(self):
        """Print 5-fold micro-F1 scores of a linear SVM on the segmenter's
        de-duplicated feature set."""
        dataset = self.segmenter.features_and_labels
        all_data_transformed = self.segmenter.transform_categorical_numerical(
            dataset['data'], 'train')
        all_data_unique = self.utilities.get_unique_list_of_lists(
            all_data_transformed, dataset['labels'])

        # Other estimators tried here previously: SGDClassifier,
        # MLPClassifier, MultinomialNB, RandomForestClassifier,
        # DecisionTreeClassifier.
        model = svm.SVC(kernel='linear')

        X = all_data_unique['data']
        y = all_data_unique['labels']

        f1_scores = cross_val_score(model, X, y, scoring='f1_micro', cv=5)
        # Was a Python 2 print statement (a syntax error on Python 3).
        print([round(score, 3) for score in f1_scores.tolist()])
        print("F1-score: %0.4f" % (f1_scores.mean()))

    def prepare_aspect_sentiment_data(self):
        """Load gold segments, aspects and sentiments from ``data_file``."""
        data = self.utilities.get_segments_aspects_sentiments(self.data_file)
        self.segments = data['segments']
        self.aspects = data['aspects']
        self.sentiments = data['sentiments']

    def transform_aspect_name_list(self, name_list):
        """Map aspect names to ids through ``self.lexicon``.

        NOTE(review): ``self.lexicon`` is not assigned in ``__init__``; it
        must be set by the caller before this method is used.
        """
        return [self.lexicon.get_aspect_id_by_name(name) for name in name_list]

    def evaluate_classifier(self, classifier, X, y):
        """Fit ``classifier`` on a 70/30 split and print its report.

        Misclassified test samples are written to ``aspect_errors.csv`` as
        ``"sample","expected","predicted"`` rows for error analysis (the
        file name is the same whichever classifier is evaluated).

        Args:
            classifier: Estimator exposing ``fit`` and ``predict``.
            X: Samples.
            y: Labels aligned with ``X``.
        """
        X_train, X_test, y_train, y_test = tts(
            X, y, test_size=0.3, random_state=5)
        model = classifier.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # *** save info for error analysis
        errors = [
            '"{}","{}","{}"'.format(sample, expected, predicted)
            for sample, expected, predicted in zip(X_test, y_test, y_pred)
            if expected != predicted
        ]
        self.utilities.write_content_to_file(
            'aspect_errors.csv', "\n".join(errors))

        print(clsr(y_test, y_pred))

    def evaluate_aspect_extraction(self):
        """Evaluate the processor's aspect classifier on the gold data."""
        self.evaluate_classifier(
            self.processor.ml_asp_classifier, self.segments, self.aspects)

    def transform_sentiment_classes(self, sentiment_names):
        """Return the index of each name in ``utilities.sentiment_classes``.

        Raises:
            ValueError: If a name is not in ``sentiment_classes``.
        """
        classes = self.utilities.sentiment_classes
        return [classes.index(name) for name in sentiment_names]

    def evaluate_sentiment_detection(self):
        """Evaluate the processor's sentiment classifier on the gold data."""
        self.evaluate_classifier(
            self.processor.ml_snt_classifier, self.segments, self.sentiments)
class Evaluator:
    """Cross-validated evaluation of the segmenter and the aspect/sentiment
    classifiers of a ``Processor`` trained on ``data_file``."""

    def __init__(self, data_file):
        """Create the helpers; gold data is read lazily by the methods.

        Args:
            data_file: File used as the processor's training file and read
                by ``get_segments_gold_data``.
        """
        self.utilities = Utilities()
        self.data_file = data_file
        self.processor = Processor({'training_file': data_file})
        self.segmenter = self.processor.load_segmenter()
        self.segments = []
        self.aspects = []
        self.sentiments = []

    def calculate_evaluatio_matrices(self, labels, result):
        """Summarise binary predictions.

        Args:
            labels: Gold labels; entries equal to 1 / 0 are counted as
                positives / negatives.
            result: Predicted labels, aligned with ``labels``.

        Returns:
            Dict with the integer counts ``'positives'`` and ``'negatives'``
            and the 3-decimal strings ``'accuracy'`` and ``'f1_score'``.
        """
        # Local import: the 'f1_score' entry used to be filled with
        # recall_score, so the real metric has to be brought into scope.
        from sklearn.metrics import f1_score

        positives = 0
        negatives = 0
        for label in labels:
            if label == 1:
                positives += 1
            elif label == 0:
                negatives += 1

        return {
            'positives': positives,
            'negatives': negatives,
            'accuracy': "%.3f" % accuracy_score(labels, result),
            'f1_score': "%.3f" % f1_score(labels, result),
        }

    def evaluate_segmentation(self):
        """Print 5-fold micro-F1 scores of a linear SVM on the segmenter's
        de-duplicated feature set."""
        dataset = self.segmenter.features_and_labels
        all_data_transformed = self.segmenter.transform_categorical_numerical(
            dataset['data'], 'train')
        all_data_unique = self.utilities.get_unique_list_of_lists(
            all_data_transformed, dataset['labels'])

        # Other estimators tried here previously: SGDClassifier,
        # MLPClassifier, MultinomialNB, RandomForestClassifier,
        # DecisionTreeClassifier.
        model = svm.SVC(kernel='linear')

        X = all_data_unique['data']
        y = all_data_unique['labels']

        f1_scores = cross_val_score(model, X, y, scoring='f1_micro', cv=5)
        # Was `print[...]`, which is a syntax error on Python 3.
        print([round(score, 3) for score in f1_scores.tolist()])
        print("F1-score: %0.4f" % (f1_scores.mean()))

    def get_segments_gold_data(self):
        """Read the gold file and split it into parallel lists.

        Column 0 of each row holds a comment whose segments are separated by
        ``'**$**'``; column ``i + 1`` holds the ``"<aspect> <sentiment>"``
        label of segment ``i``. An empty label becomes ``'other neutral'``
        and ``'noise'`` becomes ``'noise neutral'``.

        Returns:
            Dict with the equally long lists ``'segments'``, ``'aspects'``
            and ``'sentiments'``.

        Raises:
            IndexError: If a row has fewer label columns than segments, or a
                label contains no space.
        """
        rows = self.utilities.read_from_csv(self.data_file)
        segments = []
        aspects = []
        sentiments = []

        for row in rows:
            comment_parts = row[0].split('**$**')
            for index, comment_part in enumerate(comment_parts):
                segments.append(self.utilities.clean_up_text(comment_part))

                aspect = row[index + 1]
                if len(aspect) < 1:
                    aspect = 'other neutral'
                elif aspect == 'noise':
                    aspect = 'noise neutral'

                # The last space separates the aspect from the sentiment.
                label_parts = aspect.rsplit(' ', 1)
                aspects.append(label_parts[0])
                sentiments.append(label_parts[1])

        return {
            'segments': segments,
            'aspects': aspects,
            'sentiments': sentiments,
        }

    def evaluate_classifier(self, classifier, X, y, scoring='f1_micro'):
        """Print the mean score of ``classifier`` over 5 shuffled splits.

        Each split holds out 20% of the data (``random_state=11``).

        Args:
            classifier: Estimator passed to ``cross_val_score``.
            X: Samples.
            y: Labels aligned with ``X``.
            scoring: Scoring name forwarded to ``cross_val_score``.
        """
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=11)
        scores = cross_val_score(classifier, X, y, cv=cv, scoring=scoring)
        print(sum(scores) / float(len(scores)))

    def evaluate_aspect_extraction(self, X, y, merged=True):
        """Evaluate the aspect classifier on ``X``/``y``.

        Args:
            X: Segments.
            y: Aspect labels.
            merged: When exactly ``True``, labels are first passed through
                the aspect classifier's ``merge_classes``.
        """
        if merged is True:
            y = self.processor.ml_asp_classifier.merge_classes(y)
        self.evaluate_classifier(self.processor.ml_asp_classifier, X, y)

    def transform_sentiment_classes(self, sentiment_names):
        """Return the index of each name in ``utilities.sentiment_classes``.

        Raises:
            ValueError: If a name is not in ``sentiment_classes``.
        """
        classes = self.utilities.sentiment_classes
        return [classes.index(name) for name in sentiment_names]

    def evaluate_sentiment_detection(self, scoring='f1_micro', merged=True):
        """Evaluate the sentiment classifier on the gold data.

        Prints the number of segments, then the mean cross-validation score.

        Args:
            scoring: Scoring name forwarded to ``evaluate_classifier``.
            merged: When truthy, labels are first passed through the
                sentiment classifier's ``merge_classes``.
        """
        data = self.get_segments_gold_data()
        X = data['segments']
        print(len(X))
        y = data['sentiments']
        if merged:
            y = self.processor.ml_snt_classifier.merge_classes(y)
        self.evaluate_classifier(
            self.processor.ml_snt_classifier, X, y, scoring=scoring)

    def get_category_counts(self, cat_type='aspect', merged=True):
        """Count gold labels per category.

        Args:
            cat_type: ``'aspect'`` or ``'sentiment'``.
            merged: When exactly ``True``, categories are merged first
                (``utilities.merge_classes`` for aspects, the sentiment
                classifier's ``merge_classes`` for sentiments).

        Returns:
            A ``Counter`` of categories, or the string
            ``"Incorrect category type."`` for an unknown ``cat_type``.
        """
        data = self.get_segments_gold_data()
        if cat_type == 'aspect':
            categories = data['aspects']
        elif cat_type == 'sentiment':
            categories = data['sentiments']
        else:
            return "Incorrect category type."

        if merged is True and cat_type == 'aspect':
            categories = self.utilities.merge_classes(categories)
        elif merged is True and cat_type == 'sentiment':
            categories = self.processor.ml_snt_classifier.merge_classes(
                categories)

        return Counter(categories)
import NXOpen

# Logging setup: one log file per day and per login name in config.LOG_DIR.
user = os.getlogin()
timestamp = date.today().isoformat()
log_file = os.path.join(config.LOG_DIR, "{}_{}.log".format(timestamp, user))
logging.basicConfig(
    filename=log_file,
    format='[%(asctime)s]%(levelname)s|%(name)s:%(message)s',
    level=config.LOGGING_LEVEL,
)
logger = logging.getLogger(__name__)

session = NXOpen.Session.GetSession()
processor = Processor()

# Log the NX version; fall back to the variable used by NX 12 and prior
# when the newer one is empty.
nx_version = (
    session.GetEnvironmentVariableValue("NX_FULL_VERSION")
    or session.GetEnvironmentVariableValue("UGII_FULL_VERSION")
)
logger.info("NX Version: {}".format(nx_version))

# Caller options: three boolean switches plus --mfg, which takes zero or
# more values.
parser = argparse.ArgumentParser()
for _switch in ("--select", "--work", "--all_open"):
    parser.add_argument(_switch, action="store_true")
parser.add_argument("--mfg", action="store", nargs="*")

# parse arguments
dest='SVML_DRIVE', default=False, action='store_true') parser.add_argument('-d', dest='CSV_FILE', default='~/store/fraud_data/creditcard.csv') parser.add_argument('-yclass', dest='YCOL', default='Class') args = parser.parse_args() #arguments for running ml suite #driver - controller.py #CSV_FILE = '~/store/fraud_data/creditcard.csv' #YCOL = 'Class' logger = Logging() m = Model() proc = Processor() #processor data = proc.load_csv(args.CSV_FILE) data = proc.normalize_col(data, 'Amount') data = data.drop(['Time'], axis=1) print data[args.YCOL].value_counts() X = proc.get_xvals(data, args.YCOL) y = proc.get_yvals(data, args.YCOL) #processor xfolds Xu, yu = proc.under_sample(data, args.YCOL) Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets( Xu, yu, .3, 0) X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)
def main(_):
    """Train a single-layer linear model on the credit-card fraud CSV and
    print an evaluation figure for the first 10000 test rows.

    Args:
        _: Ignored (presumably the argv list forwarded by the TensorFlow
            app runner -- confirm against the caller).
    """
    # Import data
    CSV_FILE = '~/store/fraud_data/creditcard.csv'
    YCOL = 'Class'
    logger = Logging()
    proc = Processor()

    # TODO make this test suite
    data = proc.load_csv(CSV_FILE)
    data = proc.normalize_col(data, 'Amount')
    data = data.drop(['Time'], axis=1)
    X = proc.get_xvals(data, YCOL)
    y = proc.get_yvals(data, YCOL)

    # The under-sampled split is computed but not used below; the calls are
    # kept in case Processor relies on them having run.
    Xu, yu = proc.under_sample(data, YCOL)
    Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
        Xu, yu, .3, 0)
    X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)

    # Linear model over 29 input features with a single output column.
    x = tf.placeholder(tf.float32, [None, 29])
    W = tf.Variable(tf.zeros([29, 1]))
    b = tf.Variable(tf.zeros([1]))
    y_out = tf.matmul(x, W) + b

    # Define loss and optimizer. The loss is the sum of squared errors
    # between targets and outputs (earlier cross-entropy variants were
    # abandoned).
    y_ = tf.placeholder(tf.float32, [None, 1])
    loss = tf.reduce_sum(tf.square(tf.subtract(y_, y_out)))
    train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0;
    # the conversions are done once instead of on every training step.
    y_train_values = y_train.values
    y_test = y_test.values

    sess = tf.InteractiveSession()
    try:
        tf.global_variables_initializer().run()

        # Train: 20 full-batch gradient steps.
        for _step in range(20):
            sess.run(train_step,
                     feed_dict={x: X_train, y_: y_train_values})

        # Test trained model
        print("[model] training is complete ***************** ")
        # NOTE(review): both tensors have a single column, so argmax along
        # axis 1 is always 0 and `correct_prediction` is always True; the
        # printed figure is mean(1 - y_test[:10000]), not a conventional
        # accuracy. Left unchanged -- confirm the intended metric.
        correct_prediction = tf.equal(tf.argmax(y_out, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(
            tf.subtract(tf.cast(correct_prediction, tf.float32),
                        y_test[:10000]))
        print('accuracy: %s' % sess.run(accuracy,
                                        feed_dict={
                                            x: X_test.head(10000),
                                            y_: y_test[:10000]
                                        }))
    finally:
        # Release the session even when training or evaluation fails.
        sess.close()