def test_corpus(self):
    with open("../data/pt_BR/nnp") as f:
        nnp = [line.rstrip() for line in f.readlines()]

    with open("../data/pt_BR/terms") as f:
        terms = [line.rstrip() for line in f.readlines()]

    with open("../data/pt_BR/patterns") as f:
        patterns = [line.rstrip() for line in f.readlines()]

    data = LoadData(['../corpus/sel1.csv', '../corpus/sel2.csv']).load()

    p = PreProcessing(nnp, terms, patterns)
    tokens = []
    for d in data.values():
        tokens += p.clean_and_stem(d)

    bow, bow_features_names = p.build_bow(tokens)
    dist = np.sum(bow.toarray(), axis=0)
    tbow = {}
    for term, count in zip(bow_features_names, dist):
        tbow[term] = count

    import operator
    with open("bow", "w") as f:
        f.write(str(len(tbow)))
        f.write(str(sorted(tbow.items(), key=operator.itemgetter(1), reverse=True)))

    terms = p.compute_tfidf(data.values(), eliminate_zeros=True)
    with open("terms", "w") as f:
        f.write(str(terms))
def test_should_remove_accents_and_special_chars(self):
    c = PreProcessing()
    expected = ['oi', 'qual', 'e', 'o', 'email', 'do', 'sr', 'joao', 'e',
                'joaogmailcom', 'ah', 'eu', 'ja', 'sabia']
    self.assertEqual(expected,
                     c.__normalize__("Oi, qual é o e-mail do Sr. João? "
                                     "É joao@gmail.com! Ah eu já sabia!"))

    expected = ['o', 'cpf', 'do', 'joao', 'e', '12345678900']
    self.assertEqual(expected,
                     c.__normalize__("o cpf do joao é 123.456.789-00"))
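# A minimal sketch of the kind of normalization the test above asserts; this is
# an assumption, not necessarily the project's __normalize__ implementation:
# strip accents via NFKD decomposition, drop non-alphanumeric characters,
# lowercase, and split on whitespace.
import re
import unicodedata


def normalize(text):
    # decompose accented characters and drop the combining marks
    decomposed = unicodedata.normalize("NFKD", text)
    stripped = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # remove everything that is not a letter, digit or whitespace, then tokenize
    cleaned = re.sub(r"[^A-Za-z0-9\s]", "", stripped.lower())
    return cleaned.split()


# normalize("o cpf do joao é 123.456.789-00")
# -> ['o', 'cpf', 'do', 'joao', 'e', '12345678900']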
def __init__(self, questions: set, answers: set, word_vectors=None):
    self.bow = CountVectorizer()
    self.questions = questions
    self.answers = answers
    self.word_vectors = word_vectors
    self.pp = PreProcessing()
def load_gof_data():
    from pre_processing import Loader, PreProcessing

    loader = Loader()
    word_lst = loader.get_list('data/testdatatxt.txt')
    pre_process = PreProcessing()
    lst = pre_process.process2(word_lst)
    return lst
def test(self):
    p = PreProcessing([], [], [])
    cts = machado.fileids()[:5]
    tokens = []
    for c in cts:
        text = machado.raw(c)
        tokens += p.clean_and_stem(text)

    bow, bow_features_names = p.build_bow(tokens)
    dist = np.sum(bow.toarray(), axis=0)
    tbow = {}
    for term, count in zip(bow_features_names, dist):
        tbow[term] = count

    import operator
    print(sorted(tbow.items(), key=operator.itemgetter(1), reverse=True))

    texts = {}
    for c in cts:
        text = machado.raw(c)
        texts[c] = text

    terms = p.compute_tfidf(texts.values(), top_n=10, eliminate_zeros=True)
    print(terms)
def test_should_remove_digits(self):
    c = PreProcessing(["joao"], [], [r"\d+"])
    self.assertEqual(["tem", "anos"],
                     c.__obfuscate__(["joao", "tem", "12", "anos"]))
    self.assertEqual(["anos", "e", "amigos", "no", "facebook"],
                     c.__obfuscate__(["joao", "12", "anos", "e", "1765546587",
                                      "amigos", "no", "facebook"]))
    self.assertEqual(["o", "cpf", "do", "e"],
                     c.__obfuscate__(["o", "cpf", "do", "joao", "e",
                                      "123.456.789-00"]))
def test_should_build_bag_of_words(self):
    p = PreProcessing(["joao", "maria"], [], [r"\d+", "nomeemp*"])
    text = "O técnico João foi até a casa da Maria (NOMEEMPRESA) e solucionou o problema. " \
           "Ele não foi solucionado? NomeempProd"
    tokens = p.clean(text)
    tokens = p.stem(tokens)
    bow, bfn = p.build_bow(tokens)
    self.assertEqual("(7, 6)", str(bow.shape))
def pipeline(img, lanes_fit, camera_matrix, dist_coef):
    # debug flag
    is_debug_enabled = True

    # chessboard dimensions used for calibration
    nx, ny, channels = 9, 6, 3

    # calibrate camera and undistort the image
    undistorted_image = PreProcessing.get_undistorted_image(
        nx, ny, img, camera_matrix, dist_coef)

    # get the color and gradient threshold image
    binary_image = PreProcessing.get_binary_image(undistorted_image)

    # get source and destination points
    src, dst = PerspectiveTransform.get_perspective_points(img)

    # get image with source and destination points drawn
    img_src, img_dst = PerspectiveTransform.get_sample_wrapped_images(img, src, dst)

    # perspective transform to bird's-eye view
    warped_image = PerspectiveTransform.get_wrapped_image(binary_image, src, dst)

    # find the lane lines and polynomial fit
    if len(lanes_fit) == 0:
        lane_lines, lanes_fit, left_xy, right_xy = LanesFitting.get_lanes_fit(
            warped_image)
    else:
        lane_lines, lanes_fit, left_xy, right_xy = LanesFitting.update_lanes_fit(
            warped_image, lanes_fit)

    # find the radius of curvature
    radius = Metrics.get_curvature_radius(lane_lines, left_xy, right_xy)

    # find the car's distance from the lane center
    center_distance, lane_width = Metrics.get_distance_from_center(
        lane_lines, lanes_fit)

    # unwarp the image back to the original perspective
    resultant_img = PerspectiveTransform.get_unwrapped_image(
        undistorted_image, warped_image, src, dst, lanes_fit)

    # visualize the pipeline
    if is_debug_enabled:
        resultant_img = Visualization.visualize_pipeline(
            resultant_img, img_dst, binary_image, lane_lines, radius,
            center_distance, lane_width)

    return lanes_fit, resultant_img
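# A minimal, assumed driver for pipeline() above: load the calibration once
# (as the vehicle-detection __main__ further below also does) and process a
# single frame. The image path is a placeholder.
import imageio

camera_matrix, dist_coef = PreProcessing.load_calibration_params()
frame = imageio.imread("test_images/test1.jpg")  # placeholder path
lanes_fit, annotated_frame = pipeline(frame, [], camera_matrix, dist_coef)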
class Prediction:
    def __init__(self, messages, model, questions: set, answers: set,
                 pc_questions: dict, pc_answers: dict, tokenizer):
        self.questions = questions
        self.answers = answers
        self.pc_questions = pc_questions
        self.pc_answers = pc_answers
        self.tokenizer = tokenizer
        self.model = model
        self.messages = messages
        self.pp = PreProcessing()
        self.s = Similarity(questions=self.questions, answers=self.answers)

    def predict(self, msg):
        if msg == '' or msg is None:
            return emergency_message()

        try:
            msg = self.pp.pre_processing_text_for_similarity(msg)
            msg_nn = self.pp.pre_processing_text_for_neural_network(msg)
        except Exception as e:
            save_content_to_log(e)
            return BOT_PREFIX + emergency_message() + '\n' + str(e)

        if msg == '' or msg is None:
            return emergency_message()

        p = self.tokenizer.texts_to_matrix([msg_nn])
        res = self.model.predict(p)

        if res >= 0.5:
            pc = self.pc_questions
        else:
            pc = self.pc_answers

        conversations = self.s.return_conversation_by_cossine(msg, res)
        conversations = self.s.return_conversation_by_page_rank(
            msg, conversations, page_compute=pc, reverse=True)

        return self.s.get_the_next_conversation(conversations, self.messages)
def apply_gazetteer(extract_from, tweets_file, gazetteer_file, final_file,
                    lang_tweets=None):
    with open(tweets_file) as tweets_f:
        gazetteer_list = load_to_list_gazetteer_file(gazetteer_file)
        prepro = PreProcessing()
        i = 0
        t = "{0}\t{1}\t{2}\n"
        for line in tweets_f:
            if i > 0:
                if extract_from == "mixed":
                    write_for_mixed_signal(line, gazetteer_list, final_file,
                                           prepro, lang_tweets)
                elif extract_from == "gps":
                    write_for_gps_signal(line, gazetteer_list, final_file, prepro)
                else:
                    write_for_tweet_or_location_signal(line, extract_from,
                                                       gazetteer_list, final_file,
                                                       prepro, lang_tweets)
            else:
                # first line of the input: create the output file and write its header
                with open(final_file, "w") as final_f:
                    final_f.write(t.format(line.strip(), 'hierarchy',
                                           'location_detected'))
            i += 1
def test_should_compute_tfidf(self):
    p = PreProcessing(["joao", "maria"], [], [r"\d+", "nomeemp*"])
    text_1 = "O técnico João foi até a casa da cliente Maria (NOMEEMPRESA) e solucionou o problema. " \
             "Ele não foi solucionado? NomeempProd"
    text_2 = "A cliente Maria disse que continua sem sinal de Internet e " \
             "reclamou que o problema não foi resolvido, ela continua sem sinal"
    text_3 = "Maria solicitou reparo, cliente reclama que esta sem sinal de Internet e Telefone após chuva"
    texts = [text_1, text_2, text_3]

    terms = p.compute_tfidf(texts)
    print(terms)

    import operator
    print(sorted(terms.items(), key=operator.itemgetter(1), reverse=True))
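# A minimal sketch, assuming scikit-learn, of the kind of term ranking the
# compute_tfidf test above exercises; the project's compute_tfidf may differ
# (e.g. in its eliminate_zeros and top_n handling).
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf_scores(texts, top_n=None):
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts)
    # aggregate each term's weight across all documents into a single score
    scores = matrix.sum(axis=0).A1
    ranked = sorted(zip(vectorizer.get_feature_names_out(), scores),
                    key=lambda item: item[1], reverse=True)
    return dict(ranked[:top_n] if top_n is not None else ranked)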
def retrain():
    ds = process(PreProcessing('./data/starwars.txt'))
    word_embedding = WordEmbedding(source='./embedding/FT/fasttext_cbow_300d.bin')
    word_embedding.train(ds.pairs)
    word_embedding.save('./embedding/starwars', 'starwars.bin')
def process(reader: PreProcessing, storage: DatasetStorage = DatasetStorage()):
    if not storage.exist(reader.idx):
        pairs = reader.process()
        dataset = Dataset(pairs, reader.idx)
        storage.save(dataset)
    return storage.load(reader.idx)
def train():
    ds = process(PreProcessing(open('./data/starwars.txt', 'r')))
    word_embedding = WordEmbedding(source=ds.pairs)
    word_embedding.train(ds.pairs)
    word_embedding.save(target_folder='./embedding/starwars', filename='starwars.bin')
def test_should_save_load_dataset(self):
    storage = ds.DatasetStorage()
    pre_processing = PreProcessing(sentences)
    dataset = ds.process(pre_processing)

    expected = storage.save(dataset)
    result = storage.load(expected.idx)

    self.assertEqual('{"idx": "' + expected.idx + '", "pairs": 3}', str(result))
def setUpClass(cls):
    cls.pre_processing = PreProcessing(sentences)
    cls.dataset = ds.process(cls.pre_processing)
    cls.word_embedding = WordEmbedding(source=cls.dataset.pairs)

    encoder = EncoderRNN(cls.word_embedding, 300, 1).to(settings.device)
    decoder = DecoderRNN(300, cls.word_embedding, 0.0, 1).to(settings.device)

    cls.model = Model(encoder, decoder)
    cls.model.train(cls.dataset)
class Dataset:
    def __init__(self):
        self.pp = PreProcessing()

    def import_dataset(self):
        messages = pd.read_csv(DATA_FILE, delimiter="\t", quoting=3,
                               encoding="ISO-8859-2")
        messages.columns = [
            'msg_line', 'user_id', 'movie_id', 'msg', 'msg_pre_processed',
            'msg_2', 'target'
        ]
        return messages

    def get_questions(self, messages):
        return set(
            messages[messages["target"] == 1]["msg_pre_processed"].astype(str))

    def get_answers(self, messages):
        return set(
            messages[messages["target"] == 0]["msg_pre_processed"].astype(str))

    def get_page_compute(self, qea=0):
        if qea == 0:
            file = PAGE_RANK_ANSWERS
        else:
            file = PAGE_RANK_QUESTIONS
        pc = self.pp.pre_processing_page_rank_file(file)
        return self.pp.normalize_dictionary(pc)

    def load_tokenizer(self):
        with open(TOKENIZER_FILE, "rb") as handle:
            tokenizer = pickle.load(handle)
        return tokenizer
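# A hypothetical end-to-end wiring of the Dataset and Prediction classes above;
# the model path is a placeholder and the trained artifacts (CSV, page-rank
# files, tokenizer, Keras model) are assumed to already exist on disk.
from keras.models import load_model

dataset = Dataset()
messages = dataset.import_dataset()
questions = dataset.get_questions(messages)
answers = dataset.get_answers(messages)
pc_questions = dataset.get_page_compute(qea=1)
pc_answers = dataset.get_page_compute(qea=0)
tokenizer = dataset.load_tokenizer()

model = load_model("model.h5")  # placeholder path to the trained classifier

predictor = Prediction(messages, model, questions, answers,
                       pc_questions, pc_answers, tokenizer)
print(predictor.predict("oi, tudo bem?"))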
def __main__():
    # get video stream
    video_cap = imageio.get_reader(config["project_video"])

    # polynomial lane fit
    lanes_fit = []

    # history of heatmaps to reject false positives
    history = deque(maxlen=config["history_limit"])

    # classifier and scaler
    classifier = Classifier.get_trained_classifier(use_pre_trained=True)

    # load calibration parameters
    camera_matrix, dist_coef = PreProcessing.load_calibration_params()

    for index, img in enumerate(video_cap):
        if index % config["skip_frames"] == 0:
            # get lanes
            lanes_fit, img = LaneDetection.pipeline(img, lanes_fit,
                                                    camera_matrix, dist_coef)

            # resize image to improve speed of vehicle detection using classifier
            # jpg to png
            if config["is_training_png"]:
                img = Helper.scale_to_png(img)

            # 3 channels without alpha
            img = img[:, :, :config["channels"]]

            bounding_boxes = []

            # get bounding boxes for the left side
            x_start_stop_left, y_start_stop_left = config["xy_start_stop_left"]
            bounding_boxes += WindowSearch.get_bounding_boxes(
                img, classifier, x_start_stop_left, y_start_stop_left)

            # get bounding boxes for the top side
            x_start_stop_top, y_start_stop_top = config["xy_start_stop_top"]
            bounding_boxes += WindowSearch.get_bounding_boxes(
                img, classifier, x_start_stop_top, y_start_stop_top)

            # get bounding boxes for the right side
            x_start_stop_right, y_start_stop_right = config["xy_start_stop_right"]
            bounding_boxes += WindowSearch.get_bounding_boxes(
                img, classifier, x_start_stop_right, y_start_stop_right)

            # remove false positives and duplicates from detection
            detected_cars = Helper.remove_false_positives(img, bounding_boxes,
                                                          history)

            # visualization
            plt.imshow(detected_cars)
            plt.pause(0.0001)
def test_pre_processing(self):
    pre_processing = PreProcessing(sentences)
    dataset = ds.process(pre_processing)

    expected = [
        ('ontem à noite e anteontem à noite . . .',
         'tommyknockers, tommyknockers batendo na porta .'),
        ('tommyknockers, tommyknockers batendo na porta .',
         'eu quero sair, não sei se posso . . . tenho medo do tommyknockers'),
        ('eu quero sair, não sei se posso . . . tenho medo do tommyknockers',
         'bobbi .')
    ]
    self.assertEqual(dataset.pairs, expected)
def run(hidden, layer, dropout, learning_rate, iteration, save, train=None,
        test=None):
    if train:
        dataset_id = train.split('/')[-1].split('.')[0]
        pre_processing = PreProcessing(open(train, 'r'), dataset_id)
        dataset = process(pre_processing)

        encoder_embeddings = WordEmbedding(source=dataset.pairs)
        decoder_embeddings = WordEmbedding(source=dataset.pairs)

        encoder = EncoderRNN(encoder_embeddings, hidden, layer).to(settings.device)
        decoder = DecoderRNN(hidden, decoder_embeddings, dropout,
                             layer).to(settings.device)

        model = Model(
            encoder=encoder,
            decoder=decoder,
            learning_rate=learning_rate,
        )
        model.summary()
        model.train(dataset, n_iter=iteration, save_every=save)

    if test:
        dataset = load(test)
        model = Model.load(test)
        while True:
            decoded_words = model.evaluate(str(input("> ")), dataset)
            print(' '.join(decoded_words))
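# A hypothetical command-line entry point for run() above; the argument names
# and default values are assumptions, not the project's actual CLI.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train or evaluate the seq2seq model")
    parser.add_argument("--train", help="path to a training corpus, e.g. ./data/starwars.txt")
    parser.add_argument("--test", help="id of a previously trained dataset/model")
    parser.add_argument("--hidden", type=int, default=300)
    parser.add_argument("--layer", type=int, default=1)
    parser.add_argument("--dropout", type=float, default=0.1)
    parser.add_argument("--learning-rate", type=float, default=0.0001)
    parser.add_argument("--iteration", type=int, default=4000)
    parser.add_argument("--save", type=int, default=500)
    args = parser.parse_args()

    run(args.hidden, args.layer, args.dropout, args.learning_rate,
        args.iteration, args.save, train=args.train, test=args.test)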
            links = self.get_links(visiting_now)
            if links == -1:
                visit_quantity -= 1
                continue

            self.print_debug(links)
            links = self.validate_links(links)
            self.evaluate_links(links, method)
            time.sleep(0.5)

        if save_results:
            self.save_visited_csv(method)
        print("Done")
        if self.debug:
            self.out.close()


if __name__ == "__main__":
    p = PreProcessing("../site.txt")
    sites = p.get_sites_info()
    for m in ['ml']:
        print('Initializing (' + m + '):')
        for s in sites:
            c = Crawler(s, dbg=False)
            c.visit(method=m, save_results=True)
        print('Finishing (' + m + ')\n')
passwd="legend", # your password #db="data_flood" db="data_earthquake_09_2015" ) # name of the data base if __name__ == '__main__': cur = db.cursor() punctuation = list(string.punctuation) stop = stopwords.words('spanish') + punctuation + ['rt', 'via'] # Use all the SQL you like cur.execute("select text,date_sub(created_at, INTERVAL 3 HOUR) from no_retweet;") dates = [] prepro = PreProcessing() for row in cur.fetchall(): terms_stop = [term for term in prepro.preprocess(row[0]) if len(term) >= 3 and term not in stop] #ignore terms with length <= 3 # if 'terremoto' in terms_stop: dates.append(row[1]) # a list of "1" to count the hashtags ones = [1]*len(dates) # the index of the series idx = pandas.DatetimeIndex(dates) # the actual series (at series of 1s for the moment) date_serie = pandas.Series(ones, index=idx) # Resampling / bucketing per_minute = date_serie.resample('1Min', how='sum').fillna(0) time_chart = Line(per_minute) time_chart.axis_titles(x='Time', y='Freq')
class Similarity:
    def __init__(self, questions: set, answers: set, word_vectors=None):
        self.bow = CountVectorizer()
        self.questions = questions
        self.answers = answers
        self.word_vectors = word_vectors
        self.pp = PreProcessing()

    def get_the_next_conversation(self, conversations, df):
        """ Get the first item in the dict """
        keys_view = conversations.keys()
        keys_iterator = iter(keys_view)
        try:
            conversation = next(keys_iterator)
        except Exception as e:
            save_content_to_log(e)
            return naive_massage()
        return list(df[df['msg_pre_processed'] == conversation]['msg_2'])[0]

    def return_conversation_by_page_rank(self, msg, conversations, page_compute,
                                         reverse=True):
        """ Return a dictionary of messages and scores sorted by highest similarity """
        conversations = self.pp.normalize_dictionary(conversations)
        conversations = {
            k: page_compute[k] + v for k, v in conversations.items()
        }
        return {
            k: v
            for k, v in sorted(conversations.items(),
                               key=lambda item: item[1],
                               reverse=reverse)
        }

    def return_conversation_by_cossine(self, msg, res):
        """ Return a dictionary of messages and similarities sorted by highest similarity """
        if res >= 0.5:
            msg_list = self.questions
        else:
            msg_list = self.answers

        similarity = []
        for m in msg_list:
            m = str(m)
            new_msg_list = [msg, m]
            vector_bow = self.bow.fit_transform(new_msg_list)
            msg_bow = vector_bow.todense()[0]
            m_bow = vector_bow.todense()[1]

            d1_array = (1, 1)
            if m_bow.shape == d1_array and msg_bow.shape == d1_array:
                d = 1 - distance.euclidean(msg_bow, m_bow)
            else:
                d = 1 - distance.cosine(msg_bow, m_bow)

            if math.isnan(float(d)):
                similarity.append(0.0)
            else:
                similarity.append(d)

        """
        vector_bow = [self.bow.fit_transform([msg, m]) for m in msg_list]
        msg_bow = [vect.todense()[0] for vect in vector_bow]
        m_bow = [vect.todense()[1] for vect in vector_bow]
        similarity = [1 - distance.cosine(msg_vect, m_vect)
                      for msg_vect, m_vect in zip(msg_bow, m_bow)]
        """

        result = dict(zip(msg_list, similarity))
        return result
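# A small usage sketch (with made-up data) of the cosine ranking implemented in
# return_conversation_by_cossine above: each candidate is vectorized together
# with the incoming message and scored by 1 - cosine distance.
from scipy.spatial import distance
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()
msg = "my internet stopped working"
candidates = ["the internet stopped again", "what is your current plan"]
for candidate in candidates:
    vectors = bow.fit_transform([msg, candidate]).toarray()
    print(candidate, 1 - distance.cosine(vectors[0], vectors[1]))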
parser.add_argument("review_limit", help="the number of reviews to be processed") args = parser.parse_args() try: review_limit = int(args.review_limit) except ValueError: raise Exception("Review limit must be a number") if review_limit < 100: raise Exception("Review limit must be over 100") # step 1 - pre processing the training data # convert to combined pandas dataframe # remving stopwords and stemming the review text pre_processing = PreProcessing(limit_reviews=review_limit) df_reviews = pre_processing.get_df_reviews() df_meta = pre_processing.get_df_meta() combined = pre_processing.filter_and_combine(df_reviews, df_meta) reviews_clean = pre_processing.preprocess_reviews( combined['reviewTextProcessed'].tolist()) no_stop_words = pre_processing.remove_stop_words(reviews_clean) stemmed_reviews = pre_processing.get_stemmed_text(no_stop_words) combined['reviewTextProcessed'] = stemmed_reviews combined = pre_processing.change_categories_column(combined) combined.to_csv(args.output_file, sep='\t', encoding='utf-8')
parser.add_argument("review_limit", help="the number of reviews to be processed") args = parser.parse_args() try: review_limit = int(args.review_limit) except ValueError: raise Exception("Review limit must be a number") if review_limit < 100: raise Exception("Review limit must be over 100") # step 1 - pre processing the training data # convert to combined pandas dataframe # remving stopwords and stemming the review text pre_processing = PreProcessing(limit_reviews=review_limit) df_reviews = pre_processing.get_df_reviews() df_meta = pre_processing.get_df_meta() combined = pre_processing.filter_and_combine(df_reviews, df_meta) combined['reviewTextProcessed'] = pre_processing.preprocess_reviews( combined['reviewTextProcessed']) combined['reviewTextProcessed'] = pre_processing.remove_stop_words( combined['reviewTextProcessed']) combined['reviewTextProcessed'] = pre_processing.get_stemmed_text( combined['reviewTextProcessed']) reviews_and_sentiment = combined[['reviewTextProcessed', 'overall']] # convert string rating values to numerical values
def __init__(self):
    self.pp = PreProcessing()
def setUpClass(cls):
    cls.dataset = ds.process(PreProcessing(sentences))
def main():
    configs = json.load(open('config.json', 'r'))

    # download and process all the datasets involved, including AMZN
    GetData(configs['data']['symbol'], configs['data']['start'],
            configs['data']['end'], configs).get_stock_data()
    amzn_dataloader = DataLoader(
        os.path.join(configs['data']['save_dir'],
                     configs['data']['symbol'] + '.csv'),
        configs['data']['columns'])

    preprocessing = PreProcessing()
    preprocessing.denoise(amzn_dataloader.data, configs)
    all_data = {configs['data']['symbol']: preprocessing.denoised}

    # and the correlated ones
    for correlate in configs['data']['correlates_to']:
        GetData(correlate, configs['data']['start'], configs['data']['end'],
                configs).get_stock_data()
        dataloader = DataLoader(
            os.path.join(configs['data']['save_dir'], correlate + '.csv'),
            configs['data']['columns'])
        preprocessing = PreProcessing()
        preprocessing.denoise(dataloader.data, configs)
        all_data.update({correlate: preprocessing.denoised})

    # save all pre-processed data
    dataframe = pd.DataFrame(all_data)
    filename = os.path.join(configs['preprocessing']['save_dir'],
                            configs['preprocessing']['filename'])
    dataframe.to_csv(filename, index=False)

    dataloader = DataLoader(filename, configs['data']['correlates_to'])
    dataloader.train_test_split(configs['data']['days'],
                                configs['data']['train_test_split'])

    model = Model()

    # build and train model
    model.build(configs, dataloader)

    from keras.utils import plot_model
    plot_model(model.model, show_shapes=True,
               to_file="autoencoder-lstm-multivariable-for-prediction.png")

    yhat = model.predict(dataloader.train, dataloader.test,
                         configs['data']['inputs'])

    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 8))
    plt.plot(dataloader.test[0, :, 0].flatten(), label='Real')
    plt.plot(yhat.flatten(), label='Predicted')
    plt.legend()
    plt.show()

    yhat = model.predict(dataloader.train, dataloader.train,
                         configs['data']['inputs'])
    plt.figure(figsize=(10, 8))
    plt.plot(dataloader.train[0, :, 0].flatten(), label='Real')
    plt.plot(yhat.flatten(), label='Predicted')
    plt.legend()
    plt.show()
def test_should_create_dataset_dir(self):
    storage = ds.DatasetStorage()
    pre_processing = PreProcessing(sentences)
    dataset = ds.process(pre_processing)
    self.assertTrue(storage.exist(dataset.idx))
def test_should_generate_training_pairs(self):
    pre_processing = PreProcessing(sentences)
    dataset = ds.process(pre_processing)
    word_embedding = WordEmbedding(freeze=False, source=dataset.pairs)
    word_embedding.train()
    self.assertEqual(len(dataset.training_pairs(2, word_embedding)), 2)