Example #1
import csv
import pickle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from process_data import DataProcessor
# processData is a project-local helper; its import is not shown in the original snippet


def getPreds(df, out=None):
    # Enable GPU memory growth; without these three lines the GPU may fail to initialize
    physical_devices = tf.config.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0],
                                                 enable=True)
    dp = DataProcessor()

    test_articles = processData(df, ['body']).to_numpy()
    test_articles = list(map(lambda x: x[0], test_articles))

    test_articles_raw = df.to_numpy()
    test_articles_raw = list(map(lambda x: x[0], test_articles_raw))

    with open('./onion_tokenizer.pyc', 'rb') as pickleHand:
        tokenizer = pickle.load(pickleHand)
    assert isinstance(tokenizer, Tokenizer)

    seqs = test_articles
    max_len = dp.getMaxWords()
    seqs = tokenizer.texts_to_sequences(seqs)
    seqs = pad_sequences(seqs, max_len)
    model = keras.models.load_model('static/onion_connoisseur.h5')
    assert isinstance(model, keras.models.Model)
    print(test_articles)
    predVals = model.predict(seqs)
    preds = list(map(lambda x: "Real" if x < 0.75 else "Fake", predVals))
    print(preds)
    if out:
        with open('predictions.csv', 'w', encoding='utf-8', newline='') as outHand:
            writer = csv.writer(outHand)
            for i in range(len(preds)):
                writer.writerow([test_articles_raw[i], preds[i], predVals[i]])

    return [preds, predVals]
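A minimal usage sketch, not part of the original source: it assumes the input DataFrame has one article per row in a 'body' column (as the processData(df, ['body']) call above implies) and that the tokenizer and model files exist on disk.

import pandas as pd

# Hypothetical two-row input frame; the 'body' column name matches the call above.
sample_df = pd.DataFrame({'body': [
    "Area Man Wins Lottery, Immediately Loses Ticket",
    "City council approves new transit budget after debate",
]})

labels, scores = getPreds(sample_df, out=True)  # out=True also writes predictions.csv
for label, score in zip(labels, scores):
    print(label, score)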
Example #2
import logging

# PdfDownloader, PdfExtractor and DataProcessor are project-local helpers;
# their imports are omitted here.


def main():
    url = 'https://semsa.manaus.am.gov.br/sala-de-situacao/novo-coronavirus/'

    logging.info("INIT DOWNLOAD")
    pdfDownloader = PdfDownloader(url)
    pdfDownloader.download()
    fileName = pdfDownloader.filename

    logging.info("INIT Extracting")
    input_paths = "raw_db/{}".format(fileName)
    print(input_paths)
    pdfExtractor = PdfExtractor(
        input_paths, "db/{}".format(fileName.replace("pdf", "json")))
    pdfExtractor.process()

    logging.info("INIT processing")
    dataProcessor = DataProcessor(pdfExtractor.output_path, 'analytics')
    dataProcessor.process_all()

    logging.info("FINISH")
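A short sketch of how main() could be invoked; the logging.basicConfig call is an assumption, since the original snippet does not show how logging is configured.

if __name__ == "__main__":
    # Assumed logging setup so the INFO messages above are actually emitted.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")
    main()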
Example #3
# Python 2 script (uses the "print >> file" redirection syntax below)
import sys
import codecs

from theano import tensor as T
from event_ae import EventAE
from process_data import DataProcessor

sys.setrecursionlimit(10000)
num_args = 2
num_slots = num_args + 1
hyp_hidden_size = 50
learning_rate = 0.01
wc_hidden_sizes = [50] * num_slots
cc_hidden_sizes = [50] * num_args
max_iter = 10

num_procs = int(sys.argv[2])

dp = DataProcessor()
x_data, y_s_data, w_ind, c_ind, w_h_map = dp.make_data(sys.argv[1])

vocab_file = codecs.open("vocab.txt", "w", "utf-8")
for w, ind in w_ind.items():
  print >>vocab_file, w, ind
vocab_file.close()

ont_file = codecs.open("ont.txt", "w", "utf-8")
for c, ind in c_ind.items():
  print >>ont_file, c, ind
ont_file.close()

rev_w_ind = {ind:word for word, ind in w_ind.items()}
rev_c_ind = {ind:concept for concept, ind in c_ind.items()}
train_data = zip(x_data, y_s_data)
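num_procs read from sys.argv suggests the training pairs are later split across worker processes; a minimal sketch, assumed rather than taken from the source, of chunking train_data evenly:

# Hypothetical even split of the (x, y_s) pairs across num_procs workers.
train_data = list(train_data)
chunk_size = (len(train_data) + num_procs - 1) // num_procs
chunks = [train_data[i:i + chunk_size]
          for i in range(0, len(train_data), chunk_size)]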
Example #4
    default="tanhlayer",
)
argparser.add_argument(
    "--rec_model_type", type=str, help="Reconstruction model (gaussian, multinomial)", default="gaussian"
)
args = argparser.parse_args()
pred_arg_pos = args.word_types.split("_")
learning_rate = args.lr
use_pretrained_wordrep = False
if args.pt_rep:
    use_pretrained_wordrep = True
    pt_word_rep = {
        l.split()[0]: numpy.asarray([float(f) for f in l.strip().split()[1:]]) for l in gzip.open(args.pt_rep)
    }

dp = DataProcessor(pred_arg_pos)
x_data, y_s_data, w_ind, c_ind, w_h_map, w_oov, c_oov = dp.make_data(args.train_file, relaxed=args.use_relaxation)
rev_w_ind = {ind: word for word, ind in w_ind.items()}
rev_c_ind = {ind: concept for concept, ind in c_ind.items()}

init_hyp_strengths = None
if args.rec_model_type == "multinomial":
    init_hyp_strengths = numpy.zeros((len(c_ind), len(w_ind)))
    for word in w_h_map:
        word_ind = w_ind[word] if word in w_ind else 0
        for concept in w_h_map[word]:
            concept_ind = c_ind[concept] if concept in c_ind else 0
            init_hyp_strengths[concept_ind][word_ind] = 1.0

if len(w_oov) != 0:
    print >> sys.stderr, "Regarding %d words as OOV" % (len(w_oov))
Example #5
from wordcloud import STOPWORDS, WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
from process_data import DataProcessor
from PIL import Image

dp = DataProcessor()

real, fake = dp.getDatasets()

# Join with spaces so the last word of one article does not run into the next
fakeString = " ".join(fake)

wc = WordCloud(stopwords=STOPWORDS, height=1080, width=1920)

f_wc = wc.generate(fakeString)

f_wc.to_file('static/visualizations/fakeWordCloud.png')

# Now generate a word cloud for the real articles
realString = " ".join(real)

wc = WordCloud(stopwords=STOPWORDS, height=1080, width=1920)

r_wc = wc.generate(realString)

r_wc.to_file('static/visualizations/realWordCloud.png')
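plt, np, Image and ImageColorGenerator are imported above but unused in the snippet; a sketch, assumed rather than from the source, of how they are typically combined: recolor a masked cloud from an image and display both clouds with matplotlib (the mask path is hypothetical).

# Hypothetical mask image; any RGB image with a white background works.
mask = np.array(Image.open('static/visualizations/mask.png'))

masked_wc = WordCloud(stopwords=STOPWORDS, mask=mask).generate(realString)
masked_wc.recolor(color_func=ImageColorGenerator(mask))

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
axes[0].imshow(f_wc, interpolation='bilinear')
axes[0].set_title('Fake articles')
axes[1].imshow(masked_wc, interpolation='bilinear')
axes[1].set_title('Real articles (masked)')
for ax in axes:
    ax.axis('off')
plt.show()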
Example #6
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from process_data import DataProcessor
import tensorflow as tf
import numpy as np
import pickle

physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

num_words = 200000

dp = DataProcessor()

x, y = dp.getTrainingData()

# Assign token to each word present in headlines
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'`’‘\\',
                      num_words=num_words)
tokenizer.fit_on_texts(x)
max_len = dp.getMaxWords()
trainX = tokenizer.texts_to_sequences(x)
trainX = pad_sequences(trainX, max_len)
indexLen = len(tokenizer.word_index)
with open('onion_tokenizer.pyc', 'wb') as pickleHand:
    pickle.dump(tokenizer, pickleHand)

# Define our deep learning model
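The snippet stops where the model is defined, and the actual architecture is not shown. A hypothetical sketch of one way to continue, assuming a simple Embedding + LSTM binary classifier; the layer sizes, epochs, and checkpoint settings are guesses, not the project's values (only num_words, max_len, trainX, y, and the onion_connoisseur.h5 path come from the surrounding examples).

from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint  # tf.keras variant, matching the model below

# Hypothetical architecture; the original definition is not included in the snippet.
model = models.Sequential([
    layers.Embedding(num_words, 128),
    layers.LSTM(64),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

checkpoint = ModelCheckpoint('static/onion_connoisseur.h5', save_best_only=True)
model.fit(trainX, np.array(y), validation_split=0.1, epochs=5, callbacks=[checkpoint])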
Example #7
from load_data import DataLoader
from process_data import DataProcessor

data_loader = DataLoader('data/')
data_loader.load_data()

data_processor = DataProcessor('data1')
data_processor.process_data()
Example #8
argparser.set_defaults(no_hyp=False)
#argparser.add_argument('--pt_rep', type=str, help="File containing pretrained embeddings")
argparser.add_argument('--use_em', help="Use EM (Default is False)", action='store_true')
argparser.set_defaults(use_em=False)
argparser.add_argument('--use_nce', help="Use NCE for estimating encoding probability. (Default is False)", action='store_true')
argparser.set_defaults(use_nce=False)
argparser.add_argument('--hyp_model_type', type=str, help="Hypernymy model (weighted_prod, linlayer, tanhlayer)", default="weighted_prod")
argparser.add_argument('--wc_pref_model_type', type=str, help="Word-concept preference model (weighted_prod, linlayer, tanhlayer)", default="tanhlayer")
argparser.add_argument('--cc_pref_model_type', type=str, help="Concept-concept preference model (weighted_prod, linlayer, tanhlayer)", default="tanhlayer")
argparser.add_argument('--rec_model_type', type=str, help="Reconstruction model (gaussian, multinomial)", default="gaussian")
argparser.add_argument('--param_iter', type=int, help="Iteration of learned param to use (default 1)", default=1)
args = argparser.parse_args()

use_relaxation = args.use_relaxation
pred_arg_pos = args.word_types.split("_")
dp = DataProcessor(pred_arg_pos)
x_data, y_s_data, w_ind, c_ind, _, _, _ = dp.make_data(args.test_file, relaxed=args.use_relaxation, handle_oov=False)

num_slots = len(pred_arg_pos)
num_args = num_slots - 1
hyp_hidden_size = 20
wc_hidden_sizes = [20] * num_slots
cc_hidden_sizes = [20] * num_args

#use_pretrained_wordrep = False
#if args.pt_rep:
#  print >>sys.stderr, "Using pretrained word representations from %s"%(args.pt_rep)
#  use_pretrained_wordrep = True
#  pt_word_rep = {l.split()[0]: numpy.asarray([float(f) for f in l.strip().split()[1:]]) for l in gzip.open(args.pt_rep)}

train_vocab_file = codecs.open(args.vocab_file, "r", "utf-8")
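The snippet ends as the training vocabulary file is opened; a minimal sketch, assumed rather than from the source, of reading it back into a word-index map, given the one "word index" pair per line format written out in Example #3.

# Hypothetical read-back of the training vocabulary written in Example #3.
train_w_ind = {}
for line in train_vocab_file:
    fields = line.strip().split()
    if len(fields) == 2:
        train_w_ind[fields[0]] = int(fields[1])
train_vocab_file.close()
rev_train_w_ind = {ind: word for word, ind in train_w_ind.items()}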