def preprocess_savefig(root, progressbar, paths, params, type):
    if not os.path.isdir(os.path.join(root, "preprocessed")):
        for x in ["individual", "joined"]:
            for y in ["healthy", "defective"]:
                os.makedirs(os.path.join(root, "preprocessed", x, y))
    progressbar["value"] = 0
    progressbar["maximum"] = len(paths)
    _, img_sample = preprocess(img_path=paths[0][0], params=params)
    shape = img_sample.shape
    imgs = []
    for p in range(len(paths)):
        img_join = np.zeros((shape[0] * 2, shape[1], shape[2]), dtype='uint8')
        for n in range(2):
            _, img_np = preprocess(img_path=paths[p][n], params=params)
            img_join[n * shape[0]: (n + 1) * shape[0]] = img_np
            img_pil = Image.fromarray(img_np)
            img_pil.save(os.path.join(root, "preprocessed", "individual", type,
                                      f"bean{p+1}_side{n+1}.png"))
        img_pil = Image.fromarray(img_join)
        img_pil.save(os.path.join(root, "preprocessed", "joined", type, f"bean{p+1}.png"))
        progressbar["value"] += 1
        progressbar.update()
    progressbar["value"] = 0
    progressbar.update()

def main():
    train_file = "/Users/phx/downloads/competetion/recipe/train.json"
    with open(train_file) as file:
        data = json.load(file)
    print("size of dataset %d" % len(data))
    data = preprocess(data)
    train_data = [data[i] for i in xrange(0, len(data)) if i % 3 != 0]
    test_data = [data[i] for i in xrange(0, len(data)) if i % 3 == 0]
    # test_data = preprocess(test_data)
    attribute_map = getAttributeMap(train_data, 1)
    print('attribute number : %d' % len(attribute_map))
    print(attribute_map)
    label_map = getLabelMap(data)
    print('label number : %d' % len(label_map))
    print(label_map)
    X, y = getDataSet(train_data, attribute_map, label_map)
    testX, testY = getDataSet(test_data, attribute_map, label_map)
    sgd = SGDClassifier(loss='log')
    generate_save_proba(sgd, X, y, testX, testY, "SGDClassifier.loss_log")
    mnb = MultinomialNB(alpha=0.08, class_prior=None, fit_prior=True)
    generate_save_proba(mnb, X, y, testX, testY, "MultinomialNB.alpha_0.08")
    rf = RandomForestClassifier(n_estimators=500)
    generate_save_proba(rf, X, y, testX, testY, "RandomForestClassifier.n_estimators_500")

def run_extractor():
    """Run the full extraction pipeline"""
    subprocess.call('mkdir data/reviews', shell=True)
    subprocess.call('mkdir data/tagged', shell=True)
    subprocess.call('mkdir data/untagged', shell=True)
    subprocess.call('mkdir data/to_parse', shell=True)
    subprocess.call('mkdir data/parsed/', shell=True)
    preprocessor.preprocess()
    subprocess.call('javac -cp ./external/stanford-postagger.jar Tagger.java', shell=True)
    subprocess.call(
        'java -cp ".:./external/stanford-postagger.jar" -Xmx1024m Tagger '
        './external/left3words-wsj-0-18.tagger data/reviews data/tagged data/untagged',
        shell=True)
    multiword_attr_identifier.identify_multiword_attrs()
    parser_preparation.pre_parse()
    parser.parse_parallel(4)
    extraction_generator.generate_extractions()
    common_extraction_generator.generate_common_extractions()
    attribute_classifier.classify()
    extraction_filterer.filter_extractions()
    polarity_computer.compute_polarities()

def main():
    choice = input("Do you want to clean the original files first? (Y/N) \n")
    if choice == 'y' or choice == 'Y':
        i = 0
        # Get the list of files, skipping hidden files that may start with '.' or '~'
        original_filelist = [
            f for f in listdir(original_filepath)
            if not (f.startswith('.') or f.startswith('~'))
        ]
        for filename in original_filelist:
            i += 1
            print(filename)
            preprocess(filename)
        print(i, "file cleaning done")
    choice = input("Do you want to process the cleaned files? (Y/N) \n")
    if choice == 'y' or choice == 'Y':
        for size in windowsizes:
            window_generator(size)
        perform_calc()

def ip_16_32_count(file_name):
    # adding ip count, network (IP/16bit) count to 'stat_dict' by preprocess.py
    stat_dict = {'ip': {}, 'network': {}}
    for fname in tqdm(file_name, total=len(file_name)):
        with open(rf"{data_path}{os.sep}{fname}", "rb") as file:
            pk = pickle.load(file)
            preprocess(pk, stat_dict)
    return stat_dict

def get_input(image, boxes):
    images = get_cropped_images(boxes, image)
    preprocessed_images = [
        normalize(images),
        preprocess(images, histogram_stretching),
        preprocess(images, histogram_equalization),
        boxes
    ]
    return preprocessed_images

def get_input(image, boxes):
    images = get_cropped_images(boxes, image)
    adeq_images = preprocess(np.array([image]), adaptive_histogram_equalization)[0]
    preprocessed_images = [
        preprocess(images, histogram_equalization),
        preprocess(images, histogram_stretching),
        get_cropped_images(boxes, adeq_images),
        boxes
    ]
    return preprocessed_images

def get_preprocessed_images(images):
    images = [
        normalize(images),
        preprocess(images, histogram_stretching),
        preprocess(images, histogram_equalization)
    ]
    images = [
        np.array([resize(img, (256, 256)) for img in imgs])
        for imgs in images
    ]
    return images

def process_link(link):
    """
    Processes the given link, does some noise removal and returns the detailed
    page in the form of an HTMLNode object.
    :param str link:
    :return: HTMLNode, or None on failure
    """
    website = constants.website
    # Build the right website link
    if link[0] == '/':
        if re.search(r'(^((http[s]{0,1}://)?www\.)?.+\.[a-z]+)/', website) is None:
            constants.logger.error('Unknown website link format')
            return None
        else:
            site_pref = re.findall(r'(^((http[s]{0,1}://)?www\.)?.+\.[a-z]+)/', website)[0][0]
            website = site_pref + link
    else:
        website = link
    # Launch website and get HTML code
    try:
        response = urllib.request.urlopen(website)
    except urllib.error.URLError:
        constants.logger.error('Page: "%s" was not able to launch' % website)
        return None
    source = response.read().decode('latin-1')
    # Transfer HTML code via easyhtml
    dom_parser = parser.DOMParser()
    dom_parser.feed(str(source))
    document = dom_parser.get_dom()
    # Finding the html node
    html_object = None
    for node in document.elements:
        if isinstance(node, easyhtml.dom.HTMLTag):
            if node.tag_name == 'html':
                html_object = node
                break
    if html_object is None:
        constants.logger.error('No html tag was found on detailed page')
        return None
    # Transforming the DOM tree into the built-in data objects of HTMLNodes
    detailed_page = HTMLNode(html_object, 0)
    # Preprocessing and noise removal
    preprocessor.preprocess(detailed_page)
    preprocessor.remove_noise_dp(detailed_page)
    # Finding and returning main text
    return detailed_page

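# Hedged usage sketch (the link value below is a placeholder for illustration,
# not from the original source):
#
#   detailed_page = process_link('/items/example-detail-page')
#   if detailed_page is not None:
#       pass  # walk the returned HTMLNode tree to pull out the main text
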
def get_inputs(images, boxes):
    cropped_images = np.array([imgs
                               for i in range(0, len(images))
                               for imgs in get_cropped_images(boxes[i], images[i])])
    flattened_boxes = np.array([values for _boxes in boxes for values in _boxes])
    preprocessed_images = [
        normalize(cropped_images),
        preprocess(cropped_images, histogram_stretching),
        preprocess(cropped_images, histogram_equalization),
        flattened_boxes
    ]
    return preprocessed_images

def main():
    macros = preprocessor.load_pokecrystal_macros()
    macro_table = preprocessor.make_macro_table(macros)
    stdout = sys.stdout
    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(macro_table)
    # reset stdout
    sys.stdout = stdout

def get_num_episodes():
    while True:
        try:
            with open(info_filename, 'r') as f:
                lines = f.readlines()
            # The last line is blank.
            num_eps = len(lines) - 1
            assert num_eps > 0
            break
        except:
            print('preprocessing...')
            preprocessor.preprocess()
            print('preprocessing completed')
    return num_eps

def main():
    config = configuration.Config()
    macros = preprocessor.load_pokecrystal_macros()
    stdout = sys.stdout
    for source in sys.argv[1:]:
        dest = os.path.splitext(source)[0] + '.tx'
        sys.stdin = open(source, 'r')
        sys.stdout = open(dest, 'w')
        preprocessor.preprocess(config, macros)
    # reset stdout
    sys.stdout = stdout

def improved_indexer(documents):
    index = {}
    m = len(documents)
    doc_lengths = {}
    stopword_file = open(os.path.join(os.path.dirname(__file__), 'stopword_list.txt'), 'r')
    stopword_list = []
    for line in stopword_file:
        stopword_list.append(line.rstrip())
    termlist = {}
    for recordnum in documents:
        document = documents[recordnum]['text']
        doc_lengths[recordnum] = []
        for (i, field) in enumerate(document):
            priority = i
            tokens = preprocess(field, stopword_list)
            for token in tokens:
                if (token, priority) in index:
                    if recordnum in index[(token, priority)]:
                        index[(token, priority)][recordnum] += 1
                    else:
                        index[(token, priority)][recordnum] = 1
                else:
                    index[(token, priority)] = {recordnum: 1}
                if token in termlist:
                    if recordnum not in termlist[token]:
                        termlist[token].append(recordnum)
                else:
                    termlist[token] = [recordnum]
            doc_lengths[recordnum].append(len(tokens))
    all_doc_lengths = [doc_lengths[recordnum] for recordnum in doc_lengths]
    doc_lengths_avg = numpy.average(numpy.matrix(all_doc_lengths), axis=0).tolist()[0]
    doc_lengths['avg'] = doc_lengths_avg
    enhanced_index = {}
    d = defaultdict(list)
    for word, priority in index:
        d[word].append(priority)
    terms = dict((k, v) for (k, v) in d.items())
    for word in terms:
        docs_with_word = termlist[word]
        idf = log10((m + 1.0) / len(docs_with_word))
        for priority in terms[word]:
            enhanced_index[(word, priority)] = {}
            for document in index[(word, priority)]:
                enhanced_index[(word, priority)][document] = [index[(word, priority)][document], idf]
    return [enhanced_index, doc_lengths]

def process():
    # Read the form data from the HTTP request
    text = request.form.get("text", "")
    # Preprocess the text
    text = preprocess(text)
    # Run part-of-speech tagging
    text = tag(text, "http://localhost:7000")
    # Run chunking
    text = chunk(text)
    # Run normalization
    text = normalize(text)
    # Build a JSON HTTP response containing the processed text
    return jsonify({
        "status": "success",
        "message": "Request successful",
        "data": {
            "text": text
        }
    })

def get_topic_sentiment_nltk(topic_keywords):
    topic_max_distance = [0]
    topic_min_distance = [0]
    topic_sarcastic = False
    for topic_keyword in topic_keywords:
        tweets = get_tweets_for_feature_extraction(topic_keyword, 3)
        tweets_positive = [0]
        tweets_negative = [0]
        tweets_sarcastic = False
        for tweet in tweets:
            processed_tweet = preprocess(tweet["text"])
            processed_text = processed_tweet["text"]
            tokens = nltk.word_tokenize(processed_text)
            tokens = [t.lower() for t in tokens]
            mean_sentiment = sentiment_helper.score_sentence(tokens)
            positive_sentence_sentiment = mean_sentiment[0]
            negative_sentence_sentiment = mean_sentiment[1]
            tweets_positive.append(positive_sentence_sentiment)
            tweets_negative.append(negative_sentence_sentiment)
            tweets_sarcastic = ("#sarcasm" in processed_tweet["hashtags"]) or tweets_sarcastic
        topic_max_distance.append(max(tweets_positive) - min(tweets_positive))
        topic_min_distance.append(max(tweets_negative) - min(tweets_negative))
        topic_sarcastic = topic_sarcastic or tweets_sarcastic
    return (sum(topic_max_distance) / (len(topic_keywords) or 1),
            sum(topic_min_distance) / (len(topic_keywords) or 1),
            int(topic_sarcastic))

def process_line(line):
    if line.strip() == '':
        # Don't process empty lines any further
        if cCountEmptyLines:
            return "\\State", None, False, 0
        else:
            return "\\Statex", None, False, 0
    sp = line.split("#")
    comment = ""
    if len(sp) > 1:
        if len(sp[-2]) == 0 or not sp[-2][-1] == "\\":
            comment = sp[-1]
            line = "\\#".join(sp[:-1])
        else:
            if not len(sp[-2]) == 0:
                sp[-2] = sp[-2][:-1]
            line = "\\#".join(sp)
    comment = comment.strip()
    line = line.strip()
    line = preprocess(line)
    terminator = None
    process_lvl = False
    transform = 0
    if line == "":
        line = generate_comment_line(comment)
    else:
        keyword = get_keyword(line)
        generator = get_generator(keyword)
        line, terminator, process_lvl, transform = generator(line)
        if not comment == "":
            line += " \\Comment{\ " + comment + "}"
    # Return the generated line so the caller can add it to the result
    return line, terminator, process_lvl, transform

def telemetry(sid, data):
    if data:
        # The current steering angle of the car
        steering_angle = data["steering_angle"]
        # The current throttle of the car
        throttle = data["throttle"]
        # The current speed of the car
        speed = data["speed"]
        # The current image from the center camera of the car
        imgString = data["image"]
        image = Image.open(BytesIO(base64.b64decode(imgString)))
        image_array = np.asarray(image)
        image_array = preprocessor.preprocess(image_array)
        steering_angle = float(model.predict(image_array[None, :, :, :], batch_size=1))
        throttle = controller.update(float(speed))
        print(steering_angle, throttle)
        send_control(steering_angle, throttle)
        # save frame
        if args.image_folder != '':
            timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
            image_filename = os.path.join(args.image_folder, timestamp)
            image.save('{}.jpg'.format(image_filename))
    else:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)

def run_configs(data_dir, reviews_filename):
    # directory where the preprocessed files will be stored
    preprocessed_dir = data_dir + "preprocessed_files/"
    # directory of raw data, e.g. {root}/data/electronics/reviews_Electronics_5
    filename = data_dir + reviews_filename
    # file endings
    raw = filename + ".json.gz"
    reviews = filename + "_reviews.txt"
    ratings = filename + "_ratings.npy"
    # possible preprocessing steps
    preprocess_steps = {
        "reg_lemma": ["clean", "regexp_tokenize", "remove_stop_words", "lemmatize"],
        # "reg_stem": ["clean", "regexp_tokenize", "remove_stop_words", "stem"],
        # "tw_lemma": ["clean", "tweet_tokenize", "remove_stop_words", "lemmatize"],
        # "tw_stem": ["clean", "tweet_tokenize", "remove_stop_words", "stem"],
    }
    for step in preprocess_steps:
        # generate a new filename, e.g. {root}/data/preprocessed_files/electronics/reviews_Electronics_5_tw_stem.txt
        preprocessed_filename = preprocessed_dir + filename.replace(data_dir, "") + "_" + step + ".txt"
        # if the given file does not exist, preprocess the input file with the given steps and save it
        if not os.path.isfile(preprocessed_filename):
            preprocessed_texts = preprocessor.preprocess(reviews, preprocess_steps[step])
            preprocessor.save_texts(preprocessed_texts, preprocessed_filename)

def recommend(inputs):
    recommendation_list = []
    all_recommendation = analyzer.recommend_start(inputs)
    features = preprocessor.preprocess(inputs)
    recommendations = lookup_table.lookup(features)
    recommendations.append(all_recommendation)
    return all_recommendation

def on_start_file_chooser_button_clicked(self, widget):
    window = self.shell_ui.get_object("all_window")
    dialog = Gtk.FileChooserDialog(
        title="Please choose a file",
        parent=window,
        action=Gtk.FileChooserAction.OPEN,
        buttons=(Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL,
                 Gtk.STOCK_OPEN, Gtk.ResponseType.OK))
    response = dialog.run()
    if response == Gtk.ResponseType.OK:
        selected_file_path = dialog.get_filename()
        relative_path = os.path.basename(selected_file_path)
        inputfile = open(relative_path, "r")
        code = inputfile.read()
        lines = code.split('\n')
        finalfile = lines[0].split('.')[0] + '.8085'
        print(lines[0].split('.')[0])
        print(finalfile)
        entries_box = self.shell_ui.get_object("start_entries_box")
        wids = entries_box.get_children()
        for widget in wids:
            widget.destroy()
        i = 0
        print(lines)
        for line in lines:
            if line != '':
                self.z.append(line)
                label = Gtk.Label("Code" + str(i))
                tv = Gtk.TextView()
                tb = tv.get_buffer()
                entries_box.add(label)
                entries_box.add(tv)
                i += 1
                with open(line, "r") as file:
                    s = file.read()
                    tb.set_text(s)
                    print(s)
        self.shell_ui.get_object("start_entry_number_entry").set_text(str(i))
        entries_box.show_all()
        self.x = preprocess(self.z)
        processed_box = self.shell_ui.get_object("processed_box")
        i = 0
        for file_name in self.x:
            if file_name != '':
                label = Gtk.Label("Code" + str(i))
                tv = Gtk.TextView()
                tb = tv.get_buffer()
                processed_box.add(label)
                processed_box.add(tv)
                i += 1
                with open(file_name, "r") as file:
                    s = file.read()
                    tb.set_text(s)
                    print(s)
        processed_box.show_all()
    elif response == Gtk.ResponseType.CANCEL:
        print("Cancel clicked")
    dialog.destroy()

def translated_data():
    directory = 'C:\\Users\\olive\\Desktop\\Datasets_for_thesis\\Prisjakt\\training_data'
    extracted_data = extractor.json_extract(directory)
    extracted_reviews = extracted_data[0]
    polarities = extracted_data[1]
    preprocessed_reviews = preprocessor.preprocess(extracted_reviews)
    dictionary = dict.Dictionary(preprocessed_reviews).dictionary
    # review_translator.translate_reviews(preprocessed_reviews, polarities)
    with open('untranslated_reviews validation combined.txt', 'r') as file:
        untranslated_reviews = np.concatenate(
            vectorizer.vectorize_data(
                preprocessor.preprocess(file.readlines()), dictionary, 300))
    with open('translated_polarities validation combined.txt', 'r') as file:
        translated_polarities = []
        for line in file:
            translated_polarities.append(int(line))
    return [untranslated_reviews, np.array(translated_polarities)]

def km(num):
    num = int(num)
    data = preprocess('data.txt', None, [])
    X_principal = xnormalize(data)
    km_name = kmeans_cluster(X_principal, num)
    return {"figure": "cluster/" + km_name}

def test_success(self):
    list_docs = [
        'Hôm nay, tôi đi học. 12321 ',
        'Hôm nay, trời 432 đẹp quá!'
    ]
    list_docs = preprocessor.preprocess(list_docs)
    transformer = Text2Vector()
    transformer.fit(list_docs)
    print('Most common words: ', transformer.get_most_common(10))
    vec = transformer.doc_to_vec(
        preprocessor.preprocess(
            ['Hôm nay, tôi 332 đi học.', 'Hôm nay, 43 tôi đi chơi.!']))
    print('Vec: ', vec)
    text = transformer.vec_to_doc(vec)
    print('Text: ', text)

def __init__(self, content):
    self.content = content
    self.sents = preprocess(content)
    self.word2count = self.countword()
    self.k1 = 1.50
    self.b = 0.75
    self.stopWords = stopwords.words('english')
    self.title = None

def test_preprocess_with_prefix_and_suffix(self):
    parts = {'qwe', 'wer'}
    prefix, suffix = 'prefix', 'suffix'
    s = ''.join((prefix, '{', '|'.join(parts), '}', suffix))
    result = set(preprocess(s))
    expected = set(''.join((prefix, '{', value, '}', suffix)) for value in parts)
    self.assertEqual(expected, result)

def main(range_map_geodatabase_path, layer_name, forest_dependency_spreadsheet_path,
         global_canopy_cover_thresh, aoo_canopy_cover_thresh,
         altitude_limits_table_path, generation_lengths_table_path):
    """This function is the core of the application. It performs the pre-processing,
    analysis and post-processing.

    :param range_map_geodatabase_path: Path to an ESRI file geodatabase containing range
        maps to be analysed. See README for required format.
    :param layer_name: Name of the layer in the geodatabase at geodatabase_path containing
        the range maps to be analysed.
    :param forest_dependency_spreadsheet_path: Path to a spreadsheet containing species'
        forest dependency information. See README for required format.
    :param global_canopy_cover_thresh: Pixels in the "treecover2000" layer with an intensity
        less than this threshold are excluded from all computations: they are not counted
        as tree cover.
    :param aoo_canopy_cover_thresh: 2km by 2km grid cells containing a proportion of tree
        cover greater than aoo_canopy_cover_thresh are counted as forested cells for the
        purpose of AOO estimation.
    :param altitude_limits_table_path: Path to a CSV file containing species' minimum and
        maximum altitudes. See README for required format.
    :param generation_lengths_table_path: Path to a CSV file containing species' generation
        lengths. See README for required format.
    :return:
    """
    # Google Cloud Platform authentication.
    os.system('gcloud auth login')
    # Google Earth Engine authentication.
    ee.Authenticate()
    ee.Initialize()
    range_map_ic_gee_path = preprocess(range_map_geodatabase_path, layer_name,
                                       forest_dependency_spreadsheet_path)
    print_w_timestamp('Waiting for all GEE tasks to complete...')
    wait_until_all_tasks_complete()
    print_w_timestamp('Done.')
    if global_canopy_cover_thresh:
        if aoo_canopy_cover_thresh:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    global_canopy_cover_thresh, aoo_canopy_cover_thresh)
        else:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    global_canopy_cover_thresh)
    else:
        if aoo_canopy_cover_thresh:
            analyse(altitude_limits_table_path, range_map_ic_gee_path,
                    aoo_canopy_cover_thresh)
        else:
            analyse(altitude_limits_table_path, range_map_ic_gee_path)
    print_w_timestamp('Waiting for all GEE tasks to complete...')
    wait_until_all_tasks_complete()
    print_w_timestamp('Done.')
    postprocess(generation_lengths_table_path)

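# Hedged invocation sketch (every path and threshold below is a placeholder,
# not a value from the original source):
#
#   main('species_ranges.gdb', 'range_layer', 'forest_dependency.xlsx',
#        global_canopy_cover_thresh=30, aoo_canopy_cover_thresh=0.5,
#        altitude_limits_table_path='altitudes.csv',
#        generation_lengths_table_path='generation_lengths.csv')
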
def input(self, s):
    self.lexer.lineno = ExtendedLineNo(1, 0)
    self.errors = []
    self.lexer.errors = self.errors
    self.token_gen = self.generator(self.lexer.token)
    ps = preprocess(s)
    return self.lexer.input(ps)

def genSent():
    objPre = preprocess()
    objPre = objPre.load()
    sentences = obj.generateSent(objPre.word_to_index, 1000, objPre.index_to_word)
    print sentences[:5]
    print "writing " + str(len(sentences)) + " news"
    write_line = '\n'.join(sentences)
    open(FILE_NAME + '_sentences', 'w').write(write_line.encode('utf-8'))

def process_message(msg, channel=None):
    slice = json.loads(msg)
    text = preprocess(slice['text'])
    reply = None
    yap_reply = get_yap(text)
    slice['tokens'] = get_tokens(yap_reply)
    slice['raw_yap'] = yap_reply
    submit_yapped(slice, channel)
    all_processed.append(slice)

def calculate_tf_idf_docs():
    # Get the list of documents and their data
    documents = fetch_documents()
    # Preprocess documents
    preprocessed_documents = []
    for document in documents:
        preprocessed_documents.append(preprocess(document))
    documents = preprocessed_documents
    # Find the list of unique words in the document dataset
    list_of_words = []
    for document in documents:
        for word in document:
            if word not in list_of_words:
                list_of_words.append(word)
    N = len(documents) + 1
    # Generate a vector for each document
    copy_documents = documents
    documents_vector = []
    for document in documents:
        doc_vector = []
        for word in list_of_words:
            # Calculate term frequency
            tf = 0
            for term in document:
                if term == word:
                    tf = tf + 1
            # Calculate document frequency
            df = 0
            for copy_document in copy_documents:
                if word in copy_document:
                    df = df + 1
            # Calculate tf-idf
            idf = math.log(N / df)
            tfidf = tf * idf
            doc_vector.append(tfidf)
        documents_vector.append(doc_vector)
    # Generate the database
    db = {}
    db['list_of_words'] = list_of_words
    db['N'] = N
    db['documents_vector'] = documents_vector
    db['documents'] = documents
    # Save data to persistent storage
    pickle_out = open(PREPROCESSED_DATA, 'wb')
    pickle.dump(db, pickle_out)
    pickle_out.close()

def interpret(text):
    text, line_nums, indent_str = preprocess(text)
    program = core_parser.parse(text, trace=False)
    context = Context(op_parser, keywords)
    for stmt in program:
        if isinstance(stmt, Block):
            context.keywords[stmt.keyword](stmt.header, stmt.body, context, context)
        else:
            print(cata(stmt, lambda ast: parse_ops(ast, context)))

def load_image_and_steering_for_train(csv_line):
    correction = 0.2
    steering = float(csv_line[3])
    i = random.randint(0, 2)
    if i == 0:
        img = preprocessor.preprocess(cv2.imread(csv_line[0]))
    elif i == 1:
        img = preprocessor.preprocess(cv2.imread(csv_line[1]))
        steering = steering + correction
    else:
        img = preprocessor.preprocess(cv2.imread(csv_line[2]))
        steering = steering - correction
    img, steering = random_augment(img, steering)
    return img, steering

def generate_ground_data(image_path):
    image, img_txt = read_image(image_path)
    copy = image.copy()
    image, segments, euler_list, central_x, central_y = preprocess(image)
    feature_list = get_feature_list(image, segments, euler_list, central_x, central_y)[0]
    classes_list = get_class_list(copy, segments)
    with open("%s" % img_txt, 'wb') as test:
        for char, feature in zip(classes_list, feature_list):
            test.write("%s %s\n" % (chr(char), ' '.join(map(str, feature))))

def recognize(pic, dir_train_pics):
    pic = Image.open(pic)
    print 'preprocessor.preprocess'
    pic_preprocessed = preprocessor.preprocess(pic)
    block_array = []
    print 'spliter.split'
    spliter.split(pic_preprocessed, block_array)
    captcha = ""
    if len(block_array) == 4:
        print 'recognize_block_array'
        captcha = recognize_block_array(block_array, dir_train_pics)
    return captcha

def recognize(pic, dir_train_pics):
    pic = Image.open(pic)
    # print 'preprocessor.preprocess'
    pic_preprocessed = preprocessor.preprocess(pic)
    block_array = []
    # print 'spliter.split'
    spliter.split(pic_preprocessed, block_array)
    captcha = ""
    if len(block_array) >= THRESHOLD_BLOCK_NUMBER:
        # print 'recognize_block_array'
        captcha = recognize_block_array(block_array, dir_train_pics)
    return captcha

def assembler_to_hex(source_code, filename=None, preprocessor_only=False):
    """
    Convert an assembler program to `Tiny` machine code.
    Opcodes described at http://redd.it/1kqxz9
    """
    code = preprocess(source_code, filename or '<input>')
    if preprocessor_only:
        return '\n'.join(c.contents for c in code)
    return assemble(code)

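# Hedged usage sketch ('program.asm' and the calls below are assumptions for
# illustration, not from the original source):
#
#   with open('program.asm') as f:
#       source = f.read()
#   hex_code = assembler_to_hex(source, filename='program.asm')
#   listing = assembler_to_hex(source, preprocessor_only=True)  # preprocessed text only
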
def assemble(self):
    text = self.editor.GetText()
    try:
        self.reset(False)
        self.program = assembler.parse(preprocessor.preprocess(text))
        self.emu.load(self.program.assemble())
        self.program_list.update(self.program.instructions)
        self.refresh_debug_info()
    except Exception as e:
        self.reset(False)
        dialog = wx.MessageDialog(self, str(e), 'Error', wx.ICON_ERROR | wx.OK)
        dialog.ShowModal()
        dialog.Destroy()

def __init__(self, program, mode="MIPS"):
    super(Assembler, self).__init__()
    try:
        text = program.read()
    except AttributeError:
        text = program
    self.mode = mode.upper()
    self.registers = Registers(self.mode)
    lines = text.split("\n")
    lines = clean(lines, self.mode)
    instrs, data = split_sections(lines)
    self.memory = Memory()
    for d in data:
        self.memory.insert(d)
    instrs = preprocess(instrs, self.mode)
    self.labels = label_positions(instrs)
    self.instructions = [Instruction(instr) for instr in instrs]

def get_frame(conf, sensor, location, start, end):
    df = pandas.DataFrame()
    # UUID of the data to retrieve
    uuid = conf[sensor][location]["uuid"]
    # IP address of the Archiver
    server = conf["archiver"]
    # Port of the Archiver
    port = conf[sensor][location]["archiver_port"]
    # Title of the column in the frame
    title = sensor.title() + "_" + location
    # Get a frame for each location
    tframe = get_data(uuid, server, port, title, start, end)
    df = preprocess(tframe)
    return df

def extract_train(self, sentences, labels):
    '''
    Extract feature vectors and numbered labels from training data.
    @param sentences: list of sentences to be extracted
    @param labels: literal labels of each sentence
    @return X: 2D numpy array, feature vectors, one sentence per row
    @return y: 1D numpy array, numbered label of each sentence
    '''
    literal_labels = list(set(labels))
    print "Labels: ", literal_labels
    y = np.array([literal_labels.index(l) for l in labels])
    sentences = [preprocess(s) for s in sentences]
    self.pre_calculate(sentences)
    Xs = []
    X = np.array([self._extract(s) for s in sentences])
    self.literal_labels = literal_labels
    return X, y

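# Hedged usage sketch (the extractor class name and sample data are assumptions,
# not from the original source):
#
#   extractor = FeatureExtractor()
#   X, y = extractor.extract_train(["great battery life", "screen cracked fast"],
#                                  ["positive", "negative"])
#   # X holds one feature vector per sentence; y holds indices into
#   # extractor.literal_labels
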
def get_features_from_nltk(tweet):
    # is the tweet sarcastic
    is_sarcastic = int("#sarcasm" in tweet["text"])
    processed_tweet = preprocess(tweet["text"])
    processed_text = processed_tweet["text"]
    tokens = nltk.word_tokenize(processed_text)
    tokens = [t.lower() for t in tokens]
    mean_sentiment = sentiment_helper.score_sentence(tokens)
    positive_sentence_sentiment = mean_sentiment[0]
    negative_sentence_sentiment = mean_sentiment[1]
    sentence_sentiment = mean_sentiment[0] - mean_sentiment[1]
    word_sentiments = []
    for word in processed_text.split(" "):
        if len(word) > 0:
            word_sentiment = sentiment_helper.score_word(word.lower())
            word_sentiments.append(word_sentiment)
    maximum_word_polarity = max([x[0] for x in word_sentiments])
    minimum_word_polarity = max([x[1] for x in word_sentiments])
    polarity_distance_max = maximum_word_polarity - sentence_sentiment
    polarity_distance_min = abs(minimum_word_polarity - sentence_sentiment)
    blob_text = TextBlob(processed_text)
    topic_keywords = blob_text.noun_phrases + processed_tweet["hashtags"] + processed_tweet["mentions"]
    topic_positive, topic_negative, topic_sarcasm = get_topic_sentiment_nltk(topic_keywords)
    return ["{0:.2f}".format(positive_sentence_sentiment),
            "{0:.2f}".format(negative_sentence_sentiment),
            "{0:.2f}".format(sentence_sentiment),
            "{0:.2f}".format(maximum_word_polarity),
            "{0:.2f}".format(minimum_word_polarity),
            "{0:.2f}".format(polarity_distance_max),
            "{0:.2f}".format(polarity_distance_min),
            "{0:.2f}".format(topic_positive),
            "{0:.2f}".format(topic_negative),
            topic_sarcasm,
            is_sarcastic]

def main():
    try:
        print_title()
        P = Params()
        P.load()
        P.check_all()
        files_to_rename = get_files_to_rename(P.INPUT_DIRS, P.VIDEO_EXTENSIONS)
        actions_to_process = preprocessor.preprocess(
            files=files_to_rename,
            language=P.LANGUAGE,
            output_dir=P.OUTPUT_DIR
        )
        processor.process(
            to_process=actions_to_process,
            config_path=P.get_path(expanded=True),
            ACTION=P.ACTION
        )
    except KeyboardInterrupt:
        print()
        log.info('Aborting.')
        exit(1)
    except ConnectionError:
        log.fail('Lost connection. Aborting.')
        exit(2)
    except ConnectionRefusedError:
        log.fail('Lost connection. Aborting.')
        exit(2)
    except KeyError as e:
        if e.args[0] == 'EDITOR':
            log.fail('Could not find the environment variable EDITOR. Aborting.')
            exit(1)
        else:
            log.fail('Uncaught KeyError exception: %s. Aborting.' % e.args[0])
            exit(2)

def get_features_from_text_blob(tweet):
    # is the tweet sarcastic
    is_sarcastic = int("#sarcasm" in tweet["text"])
    # preprocess tweet content
    processed_tweet = preprocess(tweet["text"])
    processed_text = processed_tweet["text"]
    blob_text = TextBlob(processed_text)
    # measure sentiment features of the tweet
    sentence_polarity = blob_text.sentiment.polarity
    sentence_subjectivity = blob_text.sentiment.subjectivity
    # calculate word-based polarity to capture extreme expressions
    polarities = []
    for word in processed_text.split(" "):
        blob_word = TextBlob(word)
        polarities.append(blob_word.sentiment.polarity)
    maximum_word_polarity = max(polarities)
    minimum_word_polarity = min(polarities)
    # measure how extreme the most expressive word is with respect to the whole sentence
    polarity_distance_max = maximum_word_polarity - sentence_polarity
    polarity_distance_min = abs(minimum_word_polarity - sentence_polarity)
    # extract topic-based sentiment values: combined polarity, subjectivity and any sarcasm clue
    topic_keywords = blob_text.noun_phrases + processed_tweet["hashtags"] + processed_tweet["mentions"]
    topic_polarity, topic_subjectivity, topic_sarcasm = get_topic_sentiment(topic_keywords)
    return ["{0:.2f}".format(sentence_polarity),
            "{0:.2f}".format(sentence_subjectivity),
            "{0:.2f}".format(maximum_word_polarity),
            "{0:.2f}".format(polarity_distance_max),
            "{0:.2f}".format(polarity_distance_min),
            "{0:.2f}".format(topic_polarity),
            "{0:.2f}".format(topic_subjectivity),
            topic_sarcasm,
            is_sarcastic]

    # print key.center(80, '*') + '\n'
    # print item.Serialize() + '\n'
    for key, value in items.items():
        fp.write(key.center(70, '*') + '\n')
        fp.write(value.Serialize() + '\n')


if __name__ == '__main__':
    log.InitLog()
    px = Parser()
    with open('IFC2X3_TC1.exp', 'rb') as fp:
        # with open('schema.exp', 'rb') as fp:
        px.parse(fp)
    dataset = px.dataset
    preprocess(dataset)
    with open('IFC2X3_TC1.json', 'w') as fp:
        # with open('schema.json', 'w') as fp:
        toJson(dataset.types, fp)
        toJson(dataset.entities, fp)
        toJson(dataset.rules, fp)
        toJson(dataset.functions, fp)
    generater = Generator(dataset)
    generater.generateCommonFiles()
    generater.generateTypes()
    generater.generateEntities()
    generater.generateIndexes()

def parse(s, parser=None):
    if parser is None:
        parser = Parser(file_prefix='.d_parser_mach_gen')
    return parser.parse(preprocessor.preprocess(s)).structure

    train_filename = sys.argv[i]
    i += 1

print >>sys.stderr, "reading labelled dataset from '" + train_filename + "'..."
input = open(train_filename, "r") if train_filename != "-" else sys.stdin
input.readline()
X = numpy.loadtxt(input, delimiter=",", dtype=numpy.uint8)
labels = X[:, 0]
X = X[:, 1:].astype(float)

print >>sys.stderr, "training KNN with", min(train_threshold, X.shape[0]), "training instances and k=", k, "..."
clf = make_classifier(preprocess(X[:train_threshold]), labels[:train_threshold], name="KNN", params=[k])

print >>sys.stderr, "making predictions for", max(0, X.shape[0] - train_threshold), "instances ..."
predictions = clf.predict(preprocess(X[train_threshold:]))

print >>sys.stderr, "evaluating ..."
if verbose:
    for i in range(len(predictions)):
        print labels[train_threshold:][i], predictions[i]
        if labels[train_threshold:][i] != predictions[i]:
            print >>sys.stderr, "should be:", labels[train_threshold:][i], ", was:", predictions[i]
            put_image(X[train_threshold:][i], 0, sys.stderr)
            print >>sys.stderr
else:
    for i in range(len(predictions)):

from preprocessor import preprocess

print preprocess("Sankararaman case: Kanchi seers, other accused acquitted".split(' '))

def extract(self, sentence):
    '''Extract the feature vector for a testing sentence. The sentence is first
    turned into a list of words and then the feature extraction logic is
    delegated to _extract.'''
    return self._extract(preprocess(sentence))

dir_path_step = '../../pics/gujinsuo/pics_step'
dir_path_train = '../../pics/gujinsuo/pics_train/'
deal_number = 10
pic_step1 = 1
pic_step2 = 2
pic_step3 = 3

for pic_ptr in xrange(deal_number):
    pic_ptr_str = str('%04d' % pic_ptr)
    image_path = dir_path_base + pic_ptr_str + '.jpg'
    pic = Image.open(image_path)
    pic_preprocessed = preprocessor.preprocess(pic)
    output_path = dir_path_step + str(pic_step1) + '/' + pic_ptr_str + '_' + str(pic_step1) + '.jpg'
    print output_path
    pic_preprocessed.save(output_path)
    block_array = []
    spliter.split(pic_preprocessed, block_array)
    for i in xrange(len(block_array)):
        output_path = dir_path_step + str(pic_step2) + '/' + pic_ptr_str + '_' + str(pic_step2) + '_' + str(i) + '.jpg'
        print output_path
        block_array[i].save(output_path)

for pic_ptr in xrange(deal_number):
    pic_ptr_str = str('%04d' % pic_ptr)

else:
    if sys.argv[1] == 'naivebayes' or sys.argv[1] == 'knn':
        annotated_texts = read('blog-gender-dataset.xlsx')
        training_set_len = 0.7 * len(annotated_texts)
        training_set = []
        test_set = []
        for (text, gender) in annotated_texts:
            if 'M' in gender:
                gender = 'M'
            else:
                gender = 'F'
            if len(training_set) < training_set_len:
                training_set.append((preprocess(text), gender))
            else:
                test_set.append((preprocess(text), gender))
        if sys.argv[1] == 'naivebayes':
            classifier = NaiveBayesClassifier(training_set)
        else:
            classifier = KNNClassifier(training_set, 5)
        print(calculate_metrics(test_set, classifier))
    else:
        print('Invalid classifier name. Choose from [naivebayes, knn]')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
import numpy
from preprocessor import preprocess, getDataSet, getLabelMap, getAttributeMap
import json

train_file = "/Users/phx/downloads/competetion/recipe/train.json"
with open(train_file) as file:
    data = json.load(file)
print("size of dataset %d" % len(data))
data = preprocess(data)
train_data = [data[i] for i in xrange(0, len(data)) if i % 3 != 0]
test_data = [data[i] for i in xrange(0, len(data)) if i % 3 == 0]
# test_data = preprocess(test_data)
attribute_map = getAttributeMap(train_data, 1)
print("attribute number : %d" % len(attribute_map))
print(attribute_map)
label_map = getLabelMap(data)
print("label number : %d" % len(label_map))
print(label_map)

__author__ = 'phx'
import numpy
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from preprocessor import preprocess, getAttributeMap, getLabelMap, getDataSet
import json

train_file = "train.json"
with open(train_file) as file:
    data = json.load(file)
print(len(data))
data = preprocess(data)
"""
train_data = [data[i] for i in xrange(0, len(data)) if i % 4 != 0]
test_data = [data[i] for i in xrange(0, len(data)) if i % 4 == 0]
"""
train_data = data
test_file = "/Users/phx/downloads/competetion/recipe/test.json"
with open(test_file) as file:
    test_data = json.load(file)
print(len(test_data))
test_data = preprocess(test_data)
attribute_map = getAttributeMap(train_data, 1)

print "filename: ", xml_filename text_filepath = outputloc + xml_filename + ".txt" pt = ParseText(xml_filepath, text_filepath) content = pt.readXmlToString() #content_list variable is not used anymore, but is still used in the getBio code that I left so I left this here too #content_list = pt.readXMLToList() if content in xmlset: continue # Skip duplicates xmlset.add(content) soup = BeautifulSoup(content, "html.parser") #Preprocess using preprocessor.py preprocess(soup) #Get headings headings = pt.findHeadings(PROBABLE_HEADINGS, soup) headingsclean = [h.get_text() for h in headings] #bio = pt.find_bio(content, content_list, headings, heading_indexes) #Use find_this function to find edu, exp, leadership, skills, languages, volunteer edu, isXml = pt.find_this(soup, ["education", "educaton"], []) exp, isXml = pt.find_this(soup, ["experience", "employment", 'career', 'history', 'professional', 'work'], ['objective', 'course']) leadExp, x = pt.find_this(soup, ["leadership", 'community', 'extracurricular', 'activities', 'organizations'], []) skills, isXml = pt.find_this(soup, ["kills"], []) languages, isXml = pt.find_this(soup, ["languages", 'foreign'], ['computer', 'programming'])