def getPreds(df, out=None):
    # GPU won't work without the next three lines
    physical_devices = tf.config.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

    dp = DataProcessor()
    test_articles = processData(df, ['body']).to_numpy()
    test_articles = list(map(lambda x: x[0], test_articles))
    test_articles_raw = df.to_numpy()
    test_articles_raw = list(map(lambda x: x[0], test_articles_raw))

    # Restore the tokenizer that was fitted at training time
    with open('./onion_tokenizer.pyc', 'rb') as pickleHand:
        tokenizer = pickle.load(pickleHand)
    assert isinstance(tokenizer, Tokenizer)

    # Convert the articles to padded integer sequences
    seqs = test_articles
    max_len = dp.getMaxWords()
    seqs = tokenizer.texts_to_sequences(seqs)
    seqs = pad_sequences(seqs, max_len)

    # Load the trained model and run inference
    model = keras.models.load_model('static/onion_connoisseur.h5')
    assert isinstance(model, keras.models.Model)
    print(test_articles)
    predVals = model.predict(seqs)
    preds = list(map(lambda x: "Real" if x < 0.75 else "Fake", predVals))
    print(preds)

    # Optionally dump the raw article, label, and score to a CSV file
    if out:
        with open('predictions.csv', 'w', encoding='utf-8') as outHand:
            out = csv.writer(outHand)
            for i in range(0, len(preds)):
                out.writerow([test_articles_raw[i], preds[i], predVals[i]])
    return [preds, predVals]
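# A minimal usage sketch (not from the original source): it assumes getPreds is
# called with a pandas DataFrame whose first column is named 'body' and holds
# article text, matching what processData expects above. The sample articles
# and the __main__ guard are illustrative only.
if __name__ == "__main__":
    import pandas as pd

    sample = pd.DataFrame({'body': [
        "Area Man Passionate Defender Of What He Imagines Constitution To Be",
        "City Council Approves New Budget For Road Repairs",
    ]})
    labels, scores = getPreds(sample)            # in-memory predictions only
    labels, scores = getPreds(sample, out=True)  # also writes predictions.csv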
def main():
    url = 'https://semsa.manaus.am.gov.br/sala-de-situacao/novo-coronavirus/'

    logging.info("INIT DOWNLOAD")
    pdfDownloader = PdfDownloader(url)
    pdfDownloader.download()
    fileName = pdfDownloader.filename

    logging.info("INIT Extracting")
    input_paths = "raw_db/{}".format(fileName)
    print(input_paths)
    pdfExtractor = PdfExtractor(
        input_paths, "db/{}".format(fileName.replace("pdf", "json")))
    pdfExtractor.process()

    logging.info("INIT processing")
    dataProcessor = DataProcessor(pdfExtractor.output_path, 'analytics')
    dataProcessor.process_all()
    logging.info("FINISH")
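# A hedged sketch of how this download -> extract -> process pipeline might be
# invoked as a script; the logging configuration here is an assumption, not
# part of the original source.
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")
    main()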
import sys
import codecs

from theano import tensor as T

from event_ae import EventAE
from process_data import DataProcessor

sys.setrecursionlimit(10000)

# Model hyperparameters
num_args = 2
num_slots = num_args + 1
hyp_hidden_size = 50
learning_rate = 0.01
wc_hidden_sizes = [50] * num_slots
cc_hidden_sizes = [50] * num_args
max_iter = 10
num_procs = int(sys.argv[2])

dp = DataProcessor()
x_data, y_s_data, w_ind, c_ind, w_h_map = dp.make_data(sys.argv[1])

# Dump the word and concept vocabularies so they can be reused at test time
vocab_file = codecs.open("vocab.txt", "w", "utf-8")
for w, ind in w_ind.items():
    print >>vocab_file, w, ind
vocab_file.close()

ont_file = codecs.open("ont.txt", "w", "utf-8")
for c, ind in c_ind.items():
    print >>ont_file, c, ind
ont_file.close()

rev_w_ind = {ind: word for word, ind in w_ind.items()}
rev_c_ind = {ind: concept for concept, ind in c_ind.items()}

train_data = zip(x_data, y_s_data)
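# A hedged sketch (not part of the original script) of how train_data could be
# partitioned into num_procs shards, one per worker process; the variable names
# all_examples and data_shards are illustrative assumptions.
all_examples = list(train_data)
chunk_size = (len(all_examples) + num_procs - 1) // num_procs
data_shards = [all_examples[i:i + chunk_size]
               for i in range(0, len(all_examples), chunk_size)]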
default="tanhlayer", ) argparser.add_argument( "--rec_model_type", type=str, help="Reconstruction model (gaussian, multinomial)", default="gaussian" ) args = argparser.parse_args() pred_arg_pos = args.word_types.split("_") learning_rate = args.lr use_pretrained_wordrep = False if args.pt_rep: use_pretrained_wordrep = True pt_word_rep = { l.split()[0]: numpy.asarray([float(f) for f in l.strip().split()[1:]]) for l in gzip.open(args.pt_rep) } dp = DataProcessor(pred_arg_pos) x_data, y_s_data, w_ind, c_ind, w_h_map, w_oov, c_oov = dp.make_data(args.train_file, relaxed=args.use_relaxation) rev_w_ind = {ind: word for word, ind in w_ind.items()} rev_c_ind = {ind: concept for concept, ind in c_ind.items()} init_hyp_strengths = None if args.rec_model_type == "multinomial": init_hyp_strengths = numpy.zeros((len(c_ind), len(w_ind))) for word in w_h_map: word_ind = w_ind[word] if word in w_ind else 0 for concept in w_h_map[word]: concept_ind = c_ind[concept] if concept in c_ind else 0 init_hyp_strengths[concept_ind][word_ind] = 1.0 if len(w_oov) != 0: print >> sys.stderr, "Regarding %d words as OOV" % (len(w_oov))
from wordcloud import STOPWORDS, WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

from process_data import DataProcessor

dp = DataProcessor()
real, fake = dp.getDatasets()

# Generate a word cloud for the fake articles (join with spaces so word
# boundaries between articles are preserved)
fakeString = " ".join([art for art in fake])
wc = WordCloud(stopwords=STOPWORDS, height=1080, width=1920)
f_wc = wc.generate(fakeString)
f_wc.to_file('static/visualizations/fakeWordCloud.png')

# Moving on to generate a word cloud for the real articles
realString = " ".join([art for art in real])
wc = WordCloud(stopwords=STOPWORDS, height=1080, width=1920)
r_wc = wc.generate(realString)
r_wc.to_file('static/visualizations/realWordCloud.png')
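# A hedged display sketch (not in the original): matplotlib is already imported
# above, so the two generated clouds could be shown side by side for a quick
# visual comparison.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
axes[0].imshow(f_wc, interpolation='bilinear')
axes[0].set_title('Fake articles')
axes[0].axis('off')
axes[1].imshow(r_wc, interpolation='bilinear')
axes[1].set_title('Real articles')
axes[1].axis('off')
plt.tight_layout()
plt.show()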
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
import pickle

from process_data import DataProcessor

# Allow TensorFlow to grow GPU memory instead of pre-allocating all of it
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

num_words = 200000

dp = DataProcessor()
x, y = dp.getTrainingData()

# Assign a token to each word present in headlines
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'`’‘\\',
                      num_words=num_words)
tokenizer.fit_on_texts(x)

max_len = dp.getMaxWords()
trainX = tokenizer.texts_to_sequences(x)
trainX = pad_sequences(trainX, max_len)
indexLen = len(tokenizer.word_index)

# Save the fitted tokenizer so prediction code can reuse the same word index
with open('onion_tokenizer.pyc', 'wb') as pickleHand:
    pickle.dump(tokenizer, pickleHand)

# Define our deep learning model
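# A minimal sketch of the kind of model that could follow here; this
# architecture (Embedding -> LSTM -> sigmoid) is an assumption, not the
# original onion_connoisseur network.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=indexLen + 1, output_dim=128,
                              input_length=max_len),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
# The imported ModelCheckpoint callback would typically be passed to
# model.fit(...) so the best weights are saved during training.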
from load_data import DataLoader
from process_data import DataProcessor

data_loader = DataLoader('data/')
data_loader.load_data()

data_processor = DataProcessor('data1')
data_processor.process_data()
argparser.set_defaults(no_hyp=False)
#argparser.add_argument('--pt_rep', type=str, help="File containing pretrained embeddings")
argparser.add_argument('--use_em', help="Use EM (Default is False)", action='store_true')
argparser.set_defaults(use_em=False)
argparser.add_argument('--use_nce', help="Use NCE for estimating encoding probability. (Default is False)", action='store_true')
argparser.set_defaults(use_nce=False)
argparser.add_argument('--hyp_model_type', type=str, help="Hypernymy model (weighted_prod, linlayer, tanhlayer)", default="weighted_prod")
argparser.add_argument('--wc_pref_model_type', type=str, help="Word-concept preference model (weighted_prod, linlayer, tanhlayer)", default="tanhlayer")
argparser.add_argument('--cc_pref_model_type', type=str, help="Concept-concept preference model (weighted_prod, linlayer, tanhlayer)", default="tanhlayer")
argparser.add_argument('--rec_model_type', type=str, help="Reconstruction model (gaussian, multinomial)", default="gaussian")
argparser.add_argument('--param_iter', type=int, help="Iteration of learned param to use (default 1)", default=1)
args = argparser.parse_args()

use_relaxation = args.use_relaxation
pred_arg_pos = args.word_types.split("_")

dp = DataProcessor(pred_arg_pos)
x_data, y_s_data, w_ind, c_ind, _, _, _ = dp.make_data(args.test_file, relaxed=args.use_relaxation, handle_oov=False)

num_slots = len(pred_arg_pos)
num_args = num_slots - 1
hyp_hidden_size = 20
wc_hidden_sizes = [20] * num_slots
cc_hidden_sizes = [20] * num_args

#use_pretrained_wordrep = False
#if args.pt_rep:
#    print >>sys.stderr, "Using pretrained word representations from %s"%(args.pt_rep)
#    use_pretrained_wordrep = True
#    pt_word_rep = {l.split()[0]: numpy.asarray([float(f) for f in l.strip().split()[1:]]) for l in gzip.open(args.pt_rep)}

train_vocab_file = codecs.open(args.vocab_file, "r", "utf-8")