from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


class UrlNGram:
    """Character-level n-gram language model over URL strings."""

    def __init__(self, urls, n=2):
        # `urls` should be an iterable of character lists, one per URL.
        self.ngram = MLE(n)
        train_data, padded_sents = padded_everygram_pipeline(n, urls)
        self.ngram.fit(train_data, padded_sents)

    def get_entropy(self, url):
        # Average negative log2 score of the URL's characters under the model.
        return self.ngram.entropy(list(url))

    def get_perplexity(self, url):
        return self.ngram.perplexity(list(url))
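# A minimal usage sketch (the sample URLs below are hypothetical): the model
# is trained on character lists, and lower entropy/perplexity means a URL's
# character sequence looks more like the training URLs.
sample_urls = ['example.com/news', 'bad-site.xyz/a1b2c3']
url_model = UrlNGram([list(u) for u in sample_urls], n=2)
print(url_model.get_entropy('example.com/news'))
print(url_model.get_perplexity('bad-site.xyz/a1b2c3'))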
import math
import unittest

from nltk.lm import MLE


class MleBigramTests(unittest.TestCase):
    """Unit tests for MLENgramModel class."""

    score_tests = [
        ("d", ["c"], 1),
        # Unseen ngrams should yield 0
        ("d", ["e"], 0),
        # Unigrams should also be 0
        ("z", None, 0),
        # N unigrams = 14
        # count('a') = 2
        ("a", None, 2.0 / 14),
        # count('y') = 3
        ("y", None, 3.0 / 14),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_logscore_zero_score(self):
        # logscore of unseen ngrams should be -inf
        logscore = self.model.logscore("d", ["e"])
        self.assertTrue(math.isinf(logscore))

    def test_entropy_perplexity_seen(self):
        # ngrams seen during training
        trained = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        # Ngram     = Log score
        # <s>, a    = -1
        # a, b      = -1
        # b, UNK    = -1
        # UNK, a    = -1.585
        # a, d      = -1
        # d, </s>   = -1
        # TOTAL logscores = -6.585
        # - AVG logscores = 1.0975
        H = 1.0975
        perplexity = 2.1398
        self.assertAlmostEqual(H, self.model.entropy(trained), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(trained), places=4)

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]
        self.assertTrue(math.isinf(self.model.entropy(untrained)))
        self.assertTrue(math.isinf(self.model.perplexity(untrained)))

    def test_entropy_perplexity_unigrams(self):
        # word  = score,  log score
        # <s>   = 0.1429, -2.8074
        # a     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # UNK   = 0.2143, -2.2224
        # d     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # </s>  = 0.1429, -2.8074
        # TOTAL logscores = -21.6243
        # - AVG logscores = 3.0095
        H = 3.0095
        perplexity = 8.0529
        text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
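# Sanity check for the hand-computed values in test_entropy_perplexity_seen,
# independent of the model: entropy is the negative average base-2 log score,
# and perplexity is 2 ** entropy.
logscores = [-1, -1, -1, -1.585, -1, -1]
H_check = -sum(logscores) / len(logscores)  # 1.0975
assert math.isclose(2 ** H_check, 2.1398, rel_tol=1e-4)  # perplexity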
import math

import pytest

from nltk.lm import MLE


class TestMleBigram(metaclass=ParametrizedTests):
    """Unit tests for MLE ngram model."""

    score_tests = [
        ("d", ["c"], 1),
        # Unseen ngrams should yield 0
        ("d", ["e"], 0),
        # Unigrams should also be 0
        ("z", None, 0),
        # N unigrams = 14
        # count('a') = 2
        ("a", None, 2.0 / 14),
        # count('y') = 3
        ("y", None, 3.0 / 14),
    ]

    def setup_method(self):
        # pytest calls this before each test; it must be an instance method,
        # not a classmethod, so that `self.model` lands on the test instance.
        vocab, training_text = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_logscore_zero_score(self):
        # logscore of unseen ngrams should be -inf
        logscore = self.model.logscore("d", ["e"])
        assert math.isinf(logscore)

    def test_entropy_perplexity_seen(self):
        # ngrams seen during training
        trained = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        # Ngram     = Log score
        # <s>, a    = -1
        # a, b      = -1
        # b, UNK    = -1
        # UNK, a    = -1.585
        # a, d      = -1
        # d, </s>   = -1
        # TOTAL logscores = -6.585
        # - AVG logscores = 1.0975
        H = 1.0975
        perplexity = 2.1398
        assert pytest.approx(self.model.entropy(trained), 1e-4) == H
        assert pytest.approx(self.model.perplexity(trained), 1e-4) == perplexity

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]
        assert math.isinf(self.model.entropy(untrained))
        assert math.isinf(self.model.perplexity(untrained))

    def test_entropy_perplexity_unigrams(self):
        # word  = score,  log score
        # <s>   = 0.1429, -2.8074
        # a     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # UNK   = 0.2143, -2.2224
        # d     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # </s>  = 0.1429, -2.8074
        # TOTAL logscores = -21.6243
        # - AVG logscores = 3.0095
        H = 3.0095
        perplexity = 8.0529
        text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]
        assert pytest.approx(self.model.entropy(text), 1e-4) == H
        assert pytest.approx(self.model.perplexity(text), 1e-4) == perplexity
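# Both test classes above rely on a _prepare_test_data helper defined
# elsewhere in the module. A sketch consistent with the counts the tests
# assume (14 unigram tokens; count('a') = 2; 'e' and 'g' fall outside the
# vocabulary, so <UNK> has count 3):
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams


def _prepare_test_data(ngram_order):
    vocab = Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
    training_text = [
        list(everygrams(list(pad_both_ends(sent, n=ngram_order)),
                        max_len=ngram_order))
        for sent in (list("abcd"), list("egadbe"))
    ]
    return vocab, training_text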
import multiprocessing
import sys

import bs4
import editdistance
import numpy as np
import pandas as pd
import requests
import tldextract
from gensim.models import Doc2Vec, Word2Vec
from gensim.parsing.preprocessing import (remove_stopwords, strip_numeric,
                                          strip_punctuation)
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from tqdm import tqdm


class FeatureEmbeddings:
    def __init__(self):
        self.features = pd.DataFrame()
        n = 2
        self.bigram = MLE(n)

    def __URLsplit(self, s):
        # Split a URL string into its individual characters.
        return [char for char in s]

    def __buildBigram(self, urls):
        # Fit the character-level bigram model on the (already split) URLs.
        train_data, padded_sents = padded_everygram_pipeline(2, urls)
        self.bigram.fit(train_data, padded_sents)

    def __cleanURL(self, url):
        # Reduce a URL to its 'subdomain.domain.suffix' form.
        xtract = tldextract.extract(url)
        return '.'.join(xtract)

    def __editDistance(self, url):
        # Minimum edit distance from the URL to any of 15 popular news sites.
        popular_sites = [
            'https://news.yahoo.com/',
            'https://news.google.com/?hl=en-US&gl=US&ceid=US:en',
            'https://www.huffpost.com/',
            'https://www.cnn.com/',
            'https://www.nytimes.com/',
            'https://www.foxnews.com/',
            'https://www.nbcnews.com/',
            'https://www.dailymail.co.uk/ushome/index.html',
            'https://www.washingtonpost.com/',
            'https://www.theguardian.com/us',
            'https://www.wsj.com/',
            'https://abcnews.go.com/',
            'https://www.bbc.co.uk/news',
            'https://www.usatoday.com/',
            'https://www.latimes.com/'
        ]
        popular_sites = [self.__cleanURL(str(x)) for x in popular_sites]
        dist = float('inf')
        for site in popular_sites:
            new_dist = editdistance.eval(url, site)
            if new_dist < dist:
                dist = new_dist
        return dist

    def __htmlInfo(self, urls):
        # Request each URL and record its status code, whether it is active,
        # whether it serves WordPress content, and how many iframes it embeds.
        n = len(urls)
        status_codes = [-1] * n
        is_active = [0] * n
        has_wp_content = [-1] * n
        num_iframes = [-1] * n
        it = -1
        for url in tqdm(urls):
            it += 1
            try:
                response = requests.get(url, timeout=10)
                status_codes[it] = response.status_code
                if response.status_code == 200:
                    page = bs4.BeautifulSoup(response.text, 'lxml')
                    is_active[it] = 1
                    iframes = page.find_all(name='iframe')
                    num_iframes[it] = len(iframes)
                    has_wp_content[it] = 1 if response.text.find('wp-content') > -1 else 0
            except Exception:
                continue
        self.features['status'] = status_codes
        self.features['active'] = is_active
        self.features['wp_content'] = has_wp_content
        self.features['num_iframes'] = num_iframes

    def __cleanHeadline(self, h):
        # Lowercase, strip punctuation and digits, drop stopwords, tokenize.
        return remove_stopwords(
            strip_punctuation(strip_numeric(str(h).lower()))).split(' ')

    def __get_val(self, v, row, i):
        # Headlines with no in-vocabulary tokens get 0.0 for every dimension.
        if v[row] == []:
            return 0.0
        else:
            return float(v[row][i])

    def __headerEmbeddings(self, headers):
        # Embed each headline as the mean of its in-vocabulary word vectors.
        header_model = Word2Vec.load("models/headline_word_embeddings.model")
        head_vecs = []
        for h in headers:
            h = self.__cleanHeadline(h)
            h = [x for x in h if x in header_model.wv.vocab]
            if len(h) >= 1:
                head_vecs.append(np.mean(header_model.wv[h], axis=0))
            else:
                head_vecs.append([])
        for i in range(len(head_vecs[0])):
            self.features.insert(i, 'h_vec_' + str(i), [
                self.__get_val(head_vecs, row, i)
                for row in range(len(head_vecs))
            ], True)

    def __articleEmbeddings(self, articles):
        # Declared global so the def below binds at module scope, letting the
        # multiprocessing pool pickle the worker function by name (fork-based
        # workers inherit doc_model).
        global embed
        doc_model = Doc2Vec.load("models/my_doc2vec_model")
        a_vec_labels = ['a_vec_' + str(i) for i in range(100)]
        vecs = []

        def embed(article):
            # `article` is an (index, text) pair from enumerate().
            t = str(article[1]).split()
            return list(doc_model.infer_vector(t))

        # Infer the embeddings in parallel, to make use of the
        # supercomputer's many cores.
        pool = multiprocessing.Pool()
        for x in tqdm(pool.imap(embed, enumerate(articles)), total=len(articles)):
            vecs.append(x)
        pool.close()
        pool.join()

        a_embeds = pd.DataFrame(vecs, columns=a_vec_labels)
        self.features = a_embeds.join(self.features)

    def create(self, data, url_col, article_col, header_col=None):
        '''
        Creates the feature dataset from news article URLs, headlines, and text.

        Features:

        TRANSFERRED:
            - bigram entropy
            - bigram perplexity
            - clean bigram entropy
            - clean bigram perplexity
            - edit distance to top 15 sites
            - status
            - active
            - has WordPress content
            - number of iframes

        NEW:
            - header embeddings
            - article embeddings

        TO BE BUILT:
            - url embeddings
        '''
        # HEADLINE VECTORS
        if header_col:
            sys.stdout.write('Building embeddings for headlines...\n')
            self.__headerEmbeddings(data[header_col])

        if url_col is not None:
            # BIGRAM ENTROPY & PERPLEXITY
            sys.stdout.write('Building bigram model features for URL strings...\n')
            urls = data[url_col].apply(lambda a: str(a))
            split_urls = urls.apply(lambda a: self.__URLsplit(a))
            self.__buildBigram(split_urls)
            self.features['bigram_entropy'] = [
                self.bigram.entropy(x) for x in split_urls
            ]
            self.features['bigram_perplexity'] = [
                self.bigram.perplexity(x) for x in split_urls
            ]

            # CLEAN BIGRAM ENTROPY & PERPLEXITY
            clean_urls = urls.apply(lambda a: self.__cleanURL(str(a)))
            split_clean_urls = clean_urls.apply(lambda a: self.__URLsplit(a))
            self.__buildBigram(split_clean_urls)
            self.features['clean_bigram_entropy'] = [
                self.bigram.entropy(x) for x in split_clean_urls
            ]
            self.features['clean_bigram_perplexity'] = [
                self.bigram.perplexity(x) for x in split_clean_urls
            ]

            # EDIT DISTANCE
            sys.stdout.write('Calculating edit distance for each URL string...\n')
            self.features['edit_distance'] = [
                self.__editDistance(x) for x in clean_urls
            ]

            # HTML INFO (STATUS, ACTIVE, WP CONTENT, # IFRAMES) -- disabled
            # sys.stdout.write('Accessing request info for features...\n')
            # self.__htmlInfo(urls)

        # ARTICLE EMBEDDINGS VIA DOC2VEC
        sys.stdout.write('Inferring article embeddings via doc2vec...\n')
        self.__articleEmbeddings(data[article_col])
        sys.stdout.flush()
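# A hypothetical usage sketch: the CSV path and column names below are
# assumptions, and the Word2Vec/Doc2Vec files must already exist under
# models/. The __main__ guard matters because create() starts a
# multiprocessing pool.
if __name__ == '__main__':
    data = pd.read_csv('data/articles.csv')
    fe = FeatureEmbeddings()
    fe.create(data, url_col='url', article_col='text', header_col='headline')
    fe.features.to_csv('data/features.csv', index=False)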