def get_news(self, url, flag):
    # given a news site, tries to extract article links
    # definitely works for Hacker News
    # returns list of urls for articles as a list of strings (unicode)
    news = n.News(url)
    news.set_flag(flag)
    news.find_links()
    return news.links
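# A hedged usage sketch for get_news() above: `Scraper` stands in for whatever
# class this method lives on, and the URL/flag values are placeholders chosen
# for illustration only (the comments above say Hacker News is known to work).
# scraper = Scraper()
# for link in scraper.get_news("https://news.ycombinator.com", flag="hn"):
#     print(link)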
def read_log(log_file):
    par_adv = []
    measured = False
    sys.stdout.write("Reading log")
    fo = open(log_file, "r")
    for line in fo:
        # print line
        tim, linetype, linename, value, unit_id, treatment_id = interpret_log_line(line)
        if (linetype == 'meta'):
            if (linename == 'agents'):
                num_agents = int(value)
            elif (linename == 'treatnames'):
                treatnames = re.split("\@\|", value)
                # print "Treatments: ", treatnames
            elif (linename == 'block_id start'):
                sys.stdout.write(".")
                sys.stdout.flush()
                block_id = int(value)
                adv = []
                ints = []
                newsv = []
                for i in range(0, num_agents):
                    adv.append(adVector.AdVector())
                    ints.append(interest.Interests())
                    newsv.append(news.NewsVector())
                # print block_id
            elif (linename == 'assignment'):
                assignment = [int(x) for x in re.split("\@\|", value)]
            elif (linename == 'block_id end'):
                apply_labels_to_vecs(adv, ints, newsv, assignment, num_agents, len(treatnames))
                par_adv.append({
                    'advector': adv,
                    'newsvector': newsv,
                    'assignment': assignment,
                    'intvector': ints
                })
        elif (linetype == 'treatment'):
            pass
        elif (linetype == 'measurement'):
            if (linename == 'ad'):
                ind_ad = ad.Ad(value, treatment_id)
                adv[int(unit_id)].add(ind_ad)
            if (linename == 'interest'):
                ints[int(unit_id)].set_from_string(value)
            if (linename == 'news'):
                ind_news = news.News(value, treatment_id)
                newsv[int(unit_id)].add(ind_news)
        elif (linetype == 'error'):
            # print "Error in block", block_id, ": ", line.strip()
            pass
    sys.stdout.write(".Reading complete\n")
    print "Treatments: ", treatnames
    return par_adv, treatnames
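# A hedged usage sketch for read_log() above; the log file name is illustrative,
# and the helpers it relies on (interpret_log_line, adVector, interest, news,
# apply_labels_to_vecs) are assumed to be imported by the original module.
# par_adv, treatnames = read_log("experiment_blocks.log")
# print "blocks:", len(par_adv), " treatments:", treatnames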
def test_1_news_init(self):
    print("Test #1 for news.__init__(...) with mock News API replies:")
    news_out = []
    # go through each mock API answer and initialize a News object
    for a in self.mock_api_answer:
        news_out.append(news.News(a["description"], a["source"]))
        # run the check through the news_tester function defined below
        self.news_tester(news_out[-1], a)
    print("Done.")
def test():
    print("test1: displays number of features generated from all the documents\n")
    f = open('feature_definition_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of features generated in feature_definition_file : " + str(count))
    f.close()
    print("test2: verified that all the documents are read and parsed from the mininewsgroup directory\n")
    f = open('training_data_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of documents parsed from mininewsgroup : " + str(count))
    print("test3 : Given a filename and filepath parse the document\n")
    fil = open('class_definition_file', "r")
    classes = {}
    r = fil.readline()
    while r:
        p = str(r.strip()).split(" ")
        if p[0] in classes:
            classes[p[0]].append(p[1])
        else:
            classes[p[0]] = [p[1]]
        r = fil.readline()
    fil.close()
    directorypath = input("Enter the filepath (eg:localpath/mini_newsgroups/alt.atheism/51121):\n")
    ngobj = news.News(directorypath, classes)
    print("DOCID : " + ngobj.docID)
    print("Newsgroup : " + ngobj.newsgroup)
    print("Class : " + ngobj.class_label)
    print("Subject : " + ngobj.subject)
    print("Body : " + ngobj.body)
    print("test4\n")
    print("Tokenizing the subject and body of the above given file, removing stop words and stemming: \n")
    print(util.tokenize(ngobj.subject + " " + ngobj.body))
    print("test5 : printing inverted index of the given file\n")
    indexobjtest = InvertedIndex()
    indexobjtest.indexDoc(ngobj)
    for key in indexobjtest.items:
        print(key + " " + str(ngobj.docID) + " " + str(indexobjtest.items[key].posting[ngobj.docID].positions))
def againdeal(url_list, output, base_url):
    # parse each article page; results are appended to store_class, which is returned at the end
    store_class = news.List_news()
    i = 1
    json_list = []
    topic_split = re.compile('<h1 class=\"headline\">.*</h1>')
    author_split = re.compile('<span class=\"provider org\">.*</span>')
    date_split = re.compile('<abbr title=.*</abbr>')
    # the article body appears in two forms of <p> tag, so match a union of both patterns
    text_split = re.compile('<p class=\"first\">.*</p>|<p>.*</p>')
    for url in url_list:
        nextweb = requests.get(base_url + str(url) + 'html')
        nextweb.encoding = 'utf-8'
        information = nextweb.text  # set the encoding explicitly to avoid decoding problems
        try:
            # convert the findall() results to str, since findall() returns a list
            topic = (str(topic_split.findall(information))
                     .replace('<h1 class=\"headline\">', '')
                     .replace('</h1>', '')
                     .replace('\\u3000', '', 20)
                     .replace('╱', '', 10)
                     .replace('[', '', 10)
                     .replace(']', '', 10))
            author = (str(author_split.findall(information))
                      .replace('<span class=\"provider org\">', '')
                      .replace('</span>', '')
                      .replace('[', '', 10)
                      .replace(']', '', 10))
            # the split yields ['', '<abbr title=...', 'date', '</abbr>', ''], so the date is element [2]
            date = str(date_split.findall(information)).replace('>', '<', 10).split('<')[2]
            text = (str(text_split.findall(information))
                    .replace('<p class=\"first\">', '')
                    .replace('</p>', '', 100)
                    .replace(' ', '', 100)
                    .replace('<p>', '', 100)
                    .replace('[', '', 10)
                    .replace(']', '', 10)
                    .replace('\',\'', '', 10))
            # normalize the date; '下午' and '上午' are the PM/AM markers in the scraped Chinese dates
            if '下' in date:
                date = date.replace('下午', '')
                try:
                    date = datetime.strptime(date, '%Y年%m月%d日 %H:%M') + timedelta(hours=12)
                except:
                    date = datetime.strptime(date, '%Y年%m月%d日 %H:%M') + timedelta(days=1, hours=-12)
            else:
                date = date.replace('上午', '')
                date = datetime.strptime(date, '%Y年%m月%d日 %H:%M')
            store_class.append(news.News(topic, author, date, text))
            json_list.append(store_class.news[i - 1].toDict())
            print('Article', i, 'has been extracted')
            i += 1
        except:
            continue
    output.write(json.dumps(json_list, ensure_ascii=False))
    print('Finished reading!')
    return store_class
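# A hedged usage sketch for againdeal() above; the article paths, output file
# name and base URL are placeholders, not values from the original project.
# with open('yahoo_news.json', 'w', encoding='utf-8') as output:
#     collected = againdeal(['world/article-12345.', 'world/article-12346.'],
#                           output, 'https://tw.news.yahoo.com/')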
def get_news_object(self, art, use_description=True):
    # check the news headline is long enough, otherwise
    # Watson NLU won't be able to analyze it
    if use_description:
        key = "description"
    else:
        key = "title"
    # also try to clean the title
    art[key] = self.clean_news_title(art[key])
    hl = art[key]
    if len(hl.split()) > 3 and len(hl) > 15:
        # it is long enough, use it
        return news.News(art[key], art["source"]["name"])
    else:
        # it is not long enough, pass on this one
        return None
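# A standalone sketch of the headline-length filter used in get_news_object();
# headline_is_usable() is a hypothetical helper written only to illustrate the
# check, and the sample article dict is invented for this example.
def headline_is_usable(article, use_description=True, min_words=3, min_chars=15):
    key = "description" if use_description else "title"
    hl = article.get(key) or ""
    # require both a minimum word count and a minimum character count,
    # mirroring the `len(hl.split()) > 3 and len(hl) > 15` test above
    return len(hl.split()) > min_words and len(hl) > min_chars

sample = {"title": "Short",
          "description": "A longer description with enough words to analyze."}
print(headline_is_usable(sample, use_description=False))  # False: title is too short
print(headline_is_usable(sample, use_description=True))   # True: description passes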
def news_start_1000():
    my_news = news.News()
    repos = mysql.get_repo_without_news()
    if len(repos) == 0:
        print 'done'
        exit(0)
    for repo in repos:
        search_news, urls = my_news.get_news(repo[2])
        if search_news:
            mysql.insert_news(repo[0], search_news, urls)
            print '\033[1;31;40m'
            print repo, 'news ,done'
            print '\033[0m'
        else:
            mysql.insert_news(repo[0], [''], [''])
            print repo, 'failed'
def main():
    # Get API keys
    news_api = news.News()
    news_data = news_api.getNews(datetime.today())
    news_words = news_api.get_words()
    print(news_words)
    # Get spotify playlist
    playlist = spotify.addWords(news_words)
    # Load in UI
    root = Tk()
    root.geometry("800x800+800+800")
    # Configure user interface
    app = UI()
    app.render_news(news_data)
    app.render_playlist(playlist)
    root.mainloop()
def test_set_news(self):
    obj = news.News()
    example = {'id': 1, 'title': 'record.title', 'link': 'record.link',
               'date': 'date', 'description': 'record.description'}
    obj.set_news(example)
    self.assertEqual(obj._news[1], example)  # the item was added to the dictionary
    self.assertEqual(len(obj._news), 1)  # the dictionary holds exactly one item
    obj.set_news(example)
    self.assertEqual(len(obj._list_news), 1)  # the same item is not added twice
    example = {'id': 2, 'title': 'record.title', 'link': 'record.link',
               'date': 'date', 'description': 'record.description'}
    obj.set_news(example)
    self.assertEqual(obj._news[2], example)  # the item was added to the dictionary
    self.assertEqual(len(obj._news), 2)  # the dictionary now holds two items
def test_set_attribute(self):
    obj = news.News()
    example = {'id': 1, 'title': 'record.title', 'link': 'record.link',
               'date': 'date', 'description': 'record.description'}
    obj.set_news(example)
    example = {'id': 2, 'title': 'record.title', 'link': 'record.link',
               'date': 'date', 'description': 'record.description'}
    obj.set_news(example)
    obj.set_attribute(2, 'text', 'text news')
    example = {'id': 2, 'title': 'record.title', 'link': 'record.link',
               'date': 'date', 'description': 'record.description',
               'text': 'text news'}
    self.assertEqual(obj._news[2], example)
def __init__(self, master=None):
    self.padding_for_x = 30
    self.padding_for_y = 30
    self.news_handler = news.News()
    super().__init__(master)
    # lang
    self.languages = ["fi", "en"]
    self.selected = tk.StringVar()
    self.language_label = tk.Label(self)
    self.language_select = tt.Combobox(self, textvariable=self.selected, values=self.languages)
    self.build_language_field()
    # hint labels:
    self.remove_hint = tk.Label(self)
    self.add_new_hint = tk.Label(self)
    # lists all current news
    self.list_of_news = tk.Listbox(self)
    self.fill_list_view(self.news_handler.get_news())
    # delete from db button
    self.delete_from_db = tk.Button(self)
    # create new: headline field
    self.headline_label = tk.Label(self)
    self.headline = tk.Text(self)
    # create new: message field
    self.message_label = tk.Label(self)
    self.message = tk.Text(self)
    # create new: date
    self.date_label = tk.Label(self)
    self.news_date = tk.Entry(self)
    # create new: push to db
    self.add_news = tk.Button(self)
    self.master = master
    self.pack()
    self.create_hint_headers()
    self.create_list_view()
    self.create_delete_from_db_btn()
    self.create_new_news_components()
    self.winfo_toplevel().title("Cats opinion admin panel - Edit news")
def main():
    E = float(sys.argv[2])
    # glove_file = datapath('glove.twitter.27B/glove.twitter.27B.200d.txt')
    # tmp_file = get_tmpfile("tweets_word2vec.txt")
    # _ = glove2word2vec(glove_file, tmp_file)
    # model = KeyedVectors.load_word2vec_format(tmp_file)
    # model.save("tweets_word2vec.model")
    # print("model completed")
    model = KeyedVectors.load("glove.twitter.27B/tweets_word2vec.model")
    news_api = news.News()
    word_processor = processor.Processor()
    tweets_api = tweets.Tweets()
    articles = news_api.process_news(news_api.retrieve_everything())
    data = []
    for line in open(sys.argv[1]):
        data.append(json.loads(line))
    all_tweets = tweets_api.process_tweets(data)
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)
    all_clusters = []
    cluster_id = 0
    for i in range(len(all_tweets)):
        tweet = all_tweets[i]
        token = all_tokens[i]
        # first cluster
        if all_clusters == []:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # no common words between the tweet and the cluster, skip
            common_text_vector = intersection(vector["text"], token["text"])
            common_hashtag_vector = intersection(vector["hashtag"], token["hashtag"])
            common_url_vector = intersection(vector["url"], token["url"])
            if common_text_vector == [] and \
                    common_hashtag_vector == [] and \
                    common_url_vector == []:
                continue
            # vector = single_cluster.get_vector(True)
            new_token = {}
            new_token["text"] = token["text"]
            new_token["hashtag"] = token["hashtag"]
            # TODO: we can check if a word is in the pre-trained model by doing the following
            # for word not in new_token["text"]:
            #     if word in model.wv.vocab:
            #     if word in model.vocab:
            #         print(word)
            similarity = model.wv.n_similarity(new_token["text"], vector["text"])
            print(similarity)
            if similarity >= E:
                # max_cluster_similarity = similarity
                # max_cluster_index = j
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                break
        if not clustered:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))
    F = float(sys.argv[3])
    related_news_clusters = []
    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1
        article = articles[i]
        text = article["title"] + article["description"]
        time = article["publish_time"]
        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Remove outlier clusters
            # if single_cluster.get_size() <= 10 or single_cluster.is_clustered:
            if single_cluster.get_size() <= 10:
                continue
            cluster_vector = single_cluster.get_vector(True)["text"]
            similarity = word_processor.docs_similarity(text, cluster_vector)
            similarity = word_processor.modified_similarity(
                similarity, time, single_cluster, True)
            # if similarity >= F and similarity > max_similarity:
            if similarity >= F:
                # max_similarity = similarity
                # max_similarity_index = j
                # The news is related to this cluster
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break
        # if max_similarity_index == -1:
        #     continue
        # news_cluster_group["article"] = i
        # news_cluster_group["cluster"] = max_similarity_index
        # related_news_clusters.append(news_cluster_group)
        # all_clusters[max_similarity_index].change_clustered()
    counter = {}
    for item in related_news_clusters:
        if item["cluster"] not in counter:
            counter[item["cluster"]] = 1
        else:
            counter[item["cluster"]] += 1
    # most_related_cluster = max(counter.items(), key=operator.itemgetter(1))[0]
    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
def main():
    if int(sys.argv[5]) == 1:
        enable_time_relevancy = True
    else:
        enable_time_relevancy = False
    if int(sys.argv[6]) == 1:
        enable_hashtag_similarity = True
    else:
        enable_hashtag_similarity = False
    E = float(sys.argv[2])
    word_processor = processor.Processor(enable_hashtag_similarity)
    tweets_api = tweets.Tweets(int(sys.argv[4]))
    all_tweets = tweets_api.process_tweets(sys.argv[1])
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens["text"] == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)
    all_clusters = []
    # we compute the similarity of each tweet against all existing clusters,
    # not against the other tweets
    cluster_id = 0
    for i in range(len(all_tweets)):
        # first cluster
        if all_clusters == []:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        # max_cluster_similarity = 0
        # max_cluster_index = -1
        token = all_tokens[i]
        # print("Tweet after processed: %s" % (token["text"]))
        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # no common words between the tweet and the cluster, skip
            if not intersection(vector["text"], token["text"]) and \
                    not intersection(vector["hashtag"], token["hashtag"]):
                continue
            # start_pre_similarity = time.time()
            new_token = {}
            new_token["text"] = " ".join(token["text"])
            new_token["hashtag"] = token["hashtag"]
            # new_token["url"] = token["url"]
            # print("Pre similarity duration: %s" % (time.time() - start_pre_similarity))
            # if all_text_in_cluster(new_token["text"], vector["text"]):
            #     similarity = 1
            # else:
            #     vector = single_cluster.get_vector(True)
            #     similarity = word_processor.new_triple_similarity(new_token, vector)
            try:
                # print("Cluster: %s" % (vector["text"]))
                vector = single_cluster.get_vector(True)
                similarity = word_processor.new_triple_similarity(new_token, vector)
            except:
                continue
            # print(new_token)
            # print(vector)
            if enable_time_relevancy:
                similarity = word_processor.modified_similarity(
                    similarity, all_tweets[i][1], single_cluster)
            # print("Similarity: %f" % (similarity))
            if similarity >= E:
                tweet = all_tweets[i]
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                break
        # if max_cluster_index != -1:
        #     all_clusters[max_cluster_index].push(tweet[0], tweet[1], token)
        #     clustered = True
        if not clustered:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
        # print("-----------------------------------------------------------")
    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    # for j in range(len(cluster_sizes)):
    #     if cluster_sizes[j] == max_cluster_size:
    #         break
    print("The max cluster size is: %d" % (max_cluster_size))
    # print("Number of tweets clustered using hashtag/url: %d" % (word_processor.hashtag_index))
    # print("Number of tweets clustered using text: %d" % (word_processor.text_index))
    # for item in all_clusters[j].get_all_tweets():
    #     print(item)
    # similarity = word_processor.docs_similarity(all_tweets[0][0], all_tweets[0][0])
    # the similarity is better the greater it is: values close to 1 mean the documents
    # are very similar, values close to 0 mean they are very different
    # TODO: after finishing clustering, we need to compute the similarity between
    # each cluster and each news item we retrieved
    news_api = news.News()
    articles = news_api.process_news(news_api.retrieve_everything())
    F = float(sys.argv[3])
    related_news_clusters = []
    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1
        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]
        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Remove outlier clusters
            if single_cluster.get_size() < 10:
                continue
            cluster_vector = single_cluster.get_vector(True)["text"]
            if not intersection(cluster_vector, text):
                continue
            similarity = word_processor.docs_similarity(text, cluster_vector)
            # similarity = word_processor.modified_similarity(similarity, publish_time, single_cluster, True)
            # if enable_time_relevancy:
            #     similarity = word_processor.modified_similarity(
            #         similarity, publish_time, single_cluster, True)
            # if similarity >= F and similarity > max_similarity:
            if similarity >= F:
                # max_similarity = similarity
                # max_similarity_index = j
                # The news is related to this cluster
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                news_cluster_group["similarity"] = similarity
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break
    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
    for related_pair in related_news_clusters:
        print("News below")
        article_id = related_pair["article"]
        print(articles[article_id])
        cluster_id = related_pair["cluster"]
        print("Tweets below:")
        for k in range(len(all_clusters[cluster_id].get_all_tweets())):
            print("[%d]: %s: " % (k, all_clusters[cluster_id].get_all_tweets()[k]))
        print("----------------------------------------------------")
def main():
    E = float(sys.argv[2])
    news_api = news.News()
    word_processor = processor.Processor()
    tweets_api = tweets.Tweets(int(sys.argv[4]))
    articles = news_api.process_news(news_api.retrieve_everything())
    data = []
    for line in open(sys.argv[1]):
        data.append(json.loads(line))
    all_tweets = tweets_api.process_tweets(data)
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens["text"] == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)
    all_clusters = []
    # we compute the similarity of each tweet against all existing clusters,
    # not against the other tweets
    cluster_id = 0
    for i in range(len(all_tweets)):
        tweet = all_tweets[i]
        token = all_tokens[i]
        # first cluster
        if all_clusters == []:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        # max_cluster_similarity = 0
        # max_cluster_index = -1
        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # no common words between the tweet and the cluster, skip
            common_text_vector = intersection(vector["text"], token["text"])
            common_hashtag_vector = intersection(vector["hashtag"], token["hashtag"])
            common_url_vector = intersection(vector["url"], token["url"])
            if common_text_vector == [] and \
                    common_hashtag_vector == [] and \
                    common_url_vector == []:
                continue
            vector = single_cluster.get_vector(True)
            new_token = {}
            new_token["text"] = " ".join(token["text"])
            new_token["hashtag"] = token["hashtag"]
            new_token["url"] = token["url"]
            similarity = word_processor.new_triple_similarity(new_token, vector)
            # print("Tweet %d, Cluster %d" % (i, j))
            # print("Similarity before: %f" % (similarity))
            # # similarity = word_processor.docs_similarity(tweet[0], vector)
            # similarity = word_processor.modified_similarity(
            #     similarity, tweet[1], single_cluster)
            # print("Similarity after: %f" % (similarity))
            # if similarity >= E and similarity > max_cluster_similarity:
            if similarity >= E:
                # max_cluster_similarity = similarity
                # max_cluster_index = j
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                # TODO: we need to consider, when one tweet is similar to multiple
                # clusters, which cluster we should push it to
                break
        # if max_cluster_index != -1:
        #     all_clusters[max_cluster_index].push(tweet[0], tweet[1], token)
        #     clustered = True
        if not clustered:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
        # print(i)
    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    # for j in range(len(cluster_sizes)):
    #     if cluster_sizes[j] == max_cluster_size:
    #         break
    print("The max cluster size is: %d" % (max_cluster_size))
    # print("Number of tweets clustered using hashtag/url: %d" % (word_processor.hashtag_index))
    # print("Number of tweets clustered using text: %d" % (word_processor.text_index))
    # for item in all_clusters[j].get_all_tweets():
    #     print(item)
    # similarity = word_processor.docs_similarity(all_tweets[0][0], all_tweets[0][0])
    # the similarity is better the greater it is: values close to 1 mean the documents
    # are very similar, values close to 0 mean they are very different
    # TODO: after finishing clustering, we need to compute the similarity between
    # each cluster and each news item we retrieved
    F = float(sys.argv[3])
    related_news_clusters = []
    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1
        article = articles[i]
        text = article["title"] + article["description"]
        time = article["publish_time"]
        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Remove outlier clusters
            # if single_cluster.get_size() <= 10 or single_cluster.is_clustered:
            if single_cluster.get_size() <= 10:
                continue
            cluster_vector = single_cluster.get_vector(True)["text"]
            similarity = word_processor.docs_similarity(text, cluster_vector)
            similarity = word_processor.modified_similarity(
                similarity, time, single_cluster, True)
            # if similarity >= F and similarity > max_similarity:
            if similarity >= F:
                # max_similarity = similarity
                # max_similarity_index = j
                # The news is related to this cluster
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break
        # if max_similarity_index == -1:
        #     continue
        # news_cluster_group["article"] = i
        # news_cluster_group["cluster"] = max_similarity_index
        # related_news_clusters.append(news_cluster_group)
        # all_clusters[max_similarity_index].change_clustered()
    counter = {}
    for item in related_news_clusters:
        if item["cluster"] not in counter:
            counter[item["cluster"]] = 1
        else:
            counter[item["cluster"]] += 1
    # most_related_cluster = max(counter.items(), key=operator.itemgetter(1))[0]
    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
def set_news(self):
    nss = news.News()
    list_news = nss.get_list_of_sm()
    for x in range(0, 8):
        self.main_widget.ids.newsgrid.add_widget(list_news[x])
import weather
import news
import quotes
import time
from datetime import date
from PIL.ImageTk import PhotoImage, Image

weather_obj = weather.Weather()
news_obj = news.News()

months = ["January", "February", "March", "April", "May", "June", "July",
          "August", "September", "October", "November", "December"]
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
            "Saturday", "Sunday"]

large_icons = {"cloudy": Image.open("bigcloudy.png"),
               "clear-night": Image.open("bignight.png"),
               "foggy": Image.open('bigcloudy.png'),
               "partly-cloudy-night": Image.open("bignightcloudy.png"),
               "partly-cloudy-day": Image.open("bigpartlycloudy.png"),
               "rain": Image.open("bigrainy.png"),
               "clear-day": Image.open("bigsun.png"),
               "thunderstorm": Image.open("bigstorm.png")}
small_icons = {"cloudy": Image.open("cloudy.png"),
               "clear-night": Image.open("night.png"),
               "foggy": Image.open('cloudy.png'),
               "partly-cloudy-night": Image.open("nightcloudy.png"),
               "partly-cloudy-day": Image.open("partlycloudy.png"),
               "rain": Image.open("rain.png"),
               "clear-day": Image.open("sun.png"),
               "thunderstorm": Image.open("storm.png")}

year, month, day = str(date.today()).split('-')
week_index = date(int(year), int(month), int(day)).weekday()


def date_as_str():
    # months is zero-indexed while calendar months run 1-12, so subtract one
    return f'{weekdays[week_index]}, {months[int(month) - 1]} {int(day)}, {year}'
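# A self-contained check of the date formatting above, with the PIL/weather
# dependencies removed and a hypothetical fixed date substituted for today():
from datetime import date as example_date

example_months = ["January", "February", "March", "April", "May", "June", "July",
                  "August", "September", "October", "November", "December"]
example_weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                    "Saturday", "Sunday"]
d = example_date(2020, 5, 17)
# weekday() is 0-indexed from Monday and calendar months run 1-12,
# hence the `- 1` when indexing the month names
print(f'{example_weekdays[d.weekday()]}, {example_months[d.month - 1]} {d.day}, {d.year}')
# -> Sunday, May 17, 2020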
def __init__(self):
    self.app = None
    self.news = news.News()
def post(self):
    pieceOfNews = news.News(title=self.request.get('title'),
                            text=self.request.get('text'))
    pieceOfNews.put()
    self.redirect('/add_news')
def test_append(self):
    self.news.append(
        news.News('a', 'a', datetime(1000, 6, 18, 1, 25, 0), 'a'))
    self.assertEqual(len(self.news), 3)
# starting from zero.))
# Set up the list for links to the pages of the news items found, the URL for the GET
# request and the name of the file used to save the news matching the search conditions,
# plus the list of dictionaries returned in response to the request.
links = []
url, filename = news.url_filename_composer(tag, offset, start_date, end_date)
items = requests.get(url).json()['items']
# Loop that fills the list of links to news pages matching the search conditions.
while (items != []):
    for i in range(len(items)):
        if items[i]['fronturl'][:6] != 'https:':
            links.append('https:' + items[i]['fronturl'])
        else:
            links.append(items[i]['fronturl'])
    offset += 10
    url = news.url_filename_composer(tag, offset, start_date, end_date)[0]
    items = requests.get(url).json()['items']
# Build the list of dictionaries for the news whose page links are stored in `links`.
# Each link is used to create a News object, which is converted to a dictionary via its
# as_dict() method. The list of dictionaries is then turned into a pandas.DataFrame and
# saved to a csv file.
if links == []:
    print("No news found")
else:
    news_list = [news.News(link).as_dict() for link in links]
    news_df = pd.DataFrame(news_list, columns=['Date', 'Time', 'Header', 'Overview', 'Text'])
    news_df.to_csv(filename, index=False, encoding='utf-8', mode='w')
    print('Collecting and saving the news is complete.')
def main():
    # Settings
    if int(sys.argv[5]) == 1:
        enable_time_relevancy = True
    else:
        enable_time_relevancy = False
    if int(sys.argv[6]) == 1:
        enable_hashtag_similarity = True
    else:
        enable_hashtag_similarity = False
    E = float(sys.argv[2])
    # model = Doc2Vec.load("./enwiki_dbow/doc2vec.bin")
    # print("Starts loading the model.")
    doc2vec_model = Doc2VecModel()
    model = doc2vec_model.get_model()
    # print("Model loaded.")
    word_processor = processor.Processor(enable_hashtag_similarity)
    tweets_api = tweets.Tweets(int(sys.argv[4]))
    # print("Starts loading the dataset")
    all_tweets = tweets_api.process_tweets(sys.argv[1])
    # print("Dataset loaded")
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)
    # print("pre-processing completed")
    all_clusters = []
    cluster_id = 0
    for i in range(len(all_tweets)):
        # start_total = time.time()
        # first cluster
        if all_clusters == []:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(
                tweet[0], tweet[1], token, cluster_id, True, model)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        # print("Starts clustering %d" % (i))
        token = all_tokens[i]
        for j in range(len(all_clusters)):
            vector = all_clusters[j].get_vector(False)
            single_cluster = all_clusters[j]
            # no common words between the tweet and the cluster, skip
            if not intersection(vector["text"], token["text"]) and \
                    not intersection(vector["hashtag"], token["hashtag"]):
                continue
            # vector = single_cluster.get_vector(True)
            new_token = {}
            new_token["text"] = token["text"]
            new_token["hashtag"] = token["hashtag"]
            # cluster_dbow_vector = model.infer_vector(vector["text"])
            # similarity = spatial.distance.cosine(tweet_dbow_vector, cluster_dbow_vector)
            # similarity = 1 - similarity
            # if all_text_in_cluster(new_token["text"], vector["text"]):
            #     similarity = 1
            # else:
            #     tweet_dbow_vector = model.infer_vector(new_token["text"])
            #     similarity = word_processor.doc2vec_double_similarity(new_token, vector, tweet_dbow_vector, all_clusters[j])
            tweet_dbow_vector = model.infer_vector(new_token["text"])
            similarity = word_processor.doc2vec_double_similarity(
                new_token, vector, tweet_dbow_vector, all_clusters[j])
            if enable_time_relevancy:
                similarity = word_processor.modified_similarity(
                    similarity, all_tweets[i][1], single_cluster)
            if similarity >= E:
                tweet = all_tweets[i]
                all_clusters[j].push(tweet[0], tweet[1], token)
                clustered = True
                break
        if not clustered:
            # start_new_cluster = time.time()
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id, True, model)
            cluster_id += 1
            all_clusters.append(new_cluster)
            # print("New cluster duration: %s" %
            #       (time.time() - start_new_cluster))
        # print("Total time: %s" % (time.time() - start_total))
        # print("Clustering completed %d" % (i))
    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))
    news_api = news.News()
    articles = news_api.process_news(news_api.retrieve_everything())
    F = float(sys.argv[3])
    related_news_clusters = []
    # for article in articles:
    for i in range(len(articles)):
        news_cluster_group = {}
        # max_similarity = 0
        # max_similarity_index = -1
        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]
        # for single_cluster in all_clusters:
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Remove outlier clusters
            if single_cluster.get_size() < 10:
                continue
            # print("Article %d, Cluster %d." % (i, j))
            cluster_vector = single_cluster.get_vector(True)["text"]
            if not intersection(cluster_vector, text):
                continue
            similarity = word_processor.docs_similarity(text, cluster_vector)
            # if enable_time_relevancy:
            #     similarity = word_processor.modified_similarity(
            #         similarity, publish_time, single_cluster, True)
            if similarity >= F:
                news_cluster_group = {}
                # find all clusters related to the news
                # if i not in news_cluster_group.keys():
                #     news_cluster_group[i] = []
                # cluster_id = single_cluster.get_id()
                # news_cluster_group[i].append((cluster_id, similarity))
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                news_cluster_group["similarity"] = similarity
                related_news_clusters.append(news_cluster_group)
                # stop comparing with other clusters
                break
        # if news_cluster_group != {}:
        #     related_news_clusters.append(news_cluster_group)
    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
    # for related_pair in related_news_clusters:
    #     article_id = list(related_pair.keys())[0]
    #     print("News is below")
    #     print(articles[article_id])
    #     print("Tweets are below")
    #     cluster_list = list(related_pair.values())[0]
    #     for cluster_id, similarity in cluster_list:
    #         for k in range(len(all_clusters[cluster_id].get_all_tweets())):
    #             print("[%d]: %s: " %
    #                   (k, all_clusters[cluster_id].get_all_tweets()[k]))
    #         print("---------------------------------------------------")
    for related_pair in related_news_clusters:
        print("News below")
        article_id = related_pair["article"]
        print(articles[article_id])
        cluster_id = related_pair["cluster"]
        print("Tweets below:")
        for k in range(len(all_clusters[cluster_id].get_all_tweets())):
            print("[%d]: %s: " % (k, all_clusters[cluster_id].get_all_tweets()[k]))
        print("----------------------------------------------------")
def getUserName(self):
    getName = self.cam.readUserName()
    print(getName)
    if getName == "Unknown" or getName == "No User Detected":
        self.tk.after(5000, self.getUserName)
    else:
        jsonfile = getName
        with open('./json/' + getName + '.json') as json_file:
            data = json.load(json_file)
            for p in data[jsonfile]:
                clockCheckbox = (p['clockCheckbox'])
                clockFrame = (p['clockFrame'])
                clockSide = (p['clockSide'])
                weatherCheckbox = (p['weatherCheckbox'])
                weatherFrame = (p['weatherFrame'])
                weatherSide = (p['weatherSide'])
                newsCheckbox = (p['newsCheckbox'])
                newsFrame = (p['newsFrame'])
                newsSide = (p['newsSide'])
                newsCategory = (p['newsCategory'])
                stockCheckbox = (p['stockCheckbox'])
                stockFrame = (p['stockFrame'])
                stockSide = (p['stockSide'])
                stockList = (p['stockList'])
                quoteCheckbox = (p['quoteCheckbox'])
        self.splash.pack_forget()
        self.cam.pack_forget()
        self.instructions1.pack_forget()
        self.instructions2.pack_forget()
        # Quotes
        if quoteCheckbox == 'enable':
            self.quotes = quotes.Quotes(self.bottomFrame)
            self.quotes.pack(anchor=N, padx=100, pady=60)
        # clock
        if clockCheckbox == 'enable':
            self.clock = clock.Clock(getattr(self, clockFrame))
            if clockFrame == 'topFrame':
                self.clock.pack(side=clockSide, anchor=N, padx=100, pady=60)
            else:
                self.clock.pack(side=clockSide, anchor=S, padx=100, pady=60)
        # weather
        if weatherCheckbox == 'enable':
            self.weather = weather.Weather(getattr(self, weatherFrame), weatherSide)
            if weatherFrame == 'topFrame':
                self.weather.pack(side=weatherSide, anchor=N, padx=100, pady=60)
            else:
                self.weather.pack(side=weatherSide, anchor=S, padx=100, pady=60)
        # news
        if newsCheckbox == 'enable':
            self.news = news.News(getattr(self, newsFrame), newsSide, newsCategory)
            if newsFrame == 'topFrame':
                self.news.pack(side=newsSide, anchor=N, padx=100, pady=60)
            else:
                self.news.pack(side=newsSide, anchor=S, padx=100, pady=60)
        # stock
        if stockCheckbox == 'enable':
            self.stock = stock.Stock(getattr(self, stockFrame), stockList)
            if stockFrame == 'topFrame':
                self.stock.pack(side=stockSide, anchor=N, padx=100, pady=60)
            else:
                self.stock.pack(side=stockSide, anchor=S, padx=100, pady=60)
        self.checkStillViewing()
# user_data will be a list of the format: username, city, province, lat, long, count
username = user_data[0]
user_city = user_data[1]
lat = user_data[3]
lon = user_data[4]
count = user_data[5]
# display intro
print(("Welcome back, {}").format(username))
displaydateandtime.show_date()
displaydateandtime.show_time()
# fill in user object
current_user = User(username, user_city, lat, lon, count)
return current_user


if __name__ == "__main__":
    # check if data file exists
    exists = os.path.isfile('user_data_file.txt')
    if exists:
        current_user = ReturningUser()
    else:
        current_user = NewUser()
    print()
    # show weather
    ShowWeather.show_weather(current_user)
    news.News(current_user)
    print('Thank you for using Starlight - By Raza Abbas')
def get_news():
    return news.News()
def read_log(log_file):
    # check
    treatnames = []
    fo = open(log_file, "r")
    line = fo.readline()
    chunks = re.split("\|\|", line)
    if (chunks[0] == 'g'):
        old = True
        gmarker = 'g'
        treatments = 2
        treatnames = ['0', '1']
        samples = len(chunks) - 1
    else:
        old = False
        gmarker = 'assign'
        treatments = int(chunks[2])
        samples = int(chunks[1])
        line = fo.readline()
        chunks = re.split("\|\|", line)
        for i in range(1, len(chunks)):
            treatnames.append(chunks[i].strip())
    fo.close()
    assert treatments == len(treatnames)
    for i in range(0, treatments):
        print "Treatment ", i, " = ", treatnames[i]
    adv = []
    ints = []
    newsv = []
    for i in range(0, samples):
        adv.append(adVector.AdVector())
        ints.append(interest.Interests())
        newsv.append(news.NewsVector())
    loadtimes = [timedelta(minutes=0)] * samples
    reloads = [0] * samples
    errors = [0] * samples
    xvfbfails = []
    breakout = False
    par_adv = []
    ass = []
    fo = open(log_file, "r")
    r = 0
    sys.stdout.write("Scanning ads")
    for line in fo:
        chunks = re.split("\|\|", line)
        chunks[len(chunks) - 1] = chunks[len(chunks) - 1].rstrip()
        if (chunks[0] == gmarker and r == 0):
            r += 1
            ass = chunks[2:]
            if (old):
                ass = chunks[1:]
            assert len(ass) == samples
            apply_labels_to_vecs(adv, ints, newsv, ass, samples, treatments)
            # print ass
        elif (chunks[0] == gmarker and r > 0):
            r += 1
            par_adv.append({
                'adv': adv,
                'newsv': newsv,
                'ass': ass,
                'xf': xvfbfails,
                'interests': ints,
                'break': breakout,
                'loadtimes': loadtimes,
                'reloads': reloads,
                'errors': errors
            })
            sys.stdout.write(".")
            sys.stdout.flush()
            adv = []
            ints = []
            newsv = []
            for i in range(0, samples):
                adv.append(adVector.AdVector())
                ints.append(interest.Interests())
                newsv.append(news.NewsVector())
            loadtimes = [timedelta(minutes=0)] * samples
            reloads = [0] * samples
            errors = [0] * samples
            xvfbfails = []
            breakout = False
            ass = chunks[2:]
            if (old):
                ass = chunks[1:]
            assert len(ass) == samples
            apply_labels_to_vecs(adv, ints, newsv, ass, samples, treatments)
        elif (chunks[0] == 'Xvfbfailure'):
            xtreat, xid = chunks[1], chunks[2]
            xvfbfails.append(xtreat)
        elif (chunks[1] == 'breakingout'):
            breakout = True
        elif (chunks[1] == 'loadtime'):
            t = (datetime.strptime(chunks[2], "%H:%M:%S.%f"))
            delta = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)
            id = int(chunks[3])
            loadtimes[id] += delta
        elif (chunks[1] == 'reload'):
            id = int(chunks[2])
            reloads[id] += 1
        elif (chunks[1] == 'errorcollecting'):
            id = int(chunks[2])
            errors[id] += 1
        elif (chunks[1] == 'prepref'):
            id = int(chunks[4])
            ints[id].remove_interest()
        elif (chunks[1] == 'pref'):
            id = int(chunks[4])
            int_str = chunks[3]
            ints[id].set_from_string(int_str)
        elif (chunks[0] == 'news'):
            ind_news = news.News({
                'Time': datetime.strptime(chunks[3], "%Y-%m-%d %H:%M:%S.%f"),
                'Title': chunks[4],
                'Agency': chunks[5],
                'Ago': chunks[6],
                'Body': chunks[7].rstrip(),
                'Label': chunks[2]
            })
            newsv[int(chunks[1])].add(ind_news)
        elif (chunks[0] == 'ad'):
            ind_ad = ad.Ad({
                'Time': datetime.strptime(chunks[3], "%Y-%m-%d %H:%M:%S.%f"),
                'Title': chunks[4],
                'URL': chunks[5],
                'Body': chunks[6].rstrip(),
                'cat': "",
                'Label': chunks[2]
            })
            adv[int(chunks[1])].add(ind_ad)
        else:
            # to analyze old log files
            try:
                ind_ad = ad.Ad({
                    'Time': datetime.strptime(chunks[2], "%Y-%m-%d %H:%M:%S.%f"),
                    'Title': chunks[3],
                    'URL': chunks[4],
                    'Body': chunks[5].rstrip(),
                    'cat': "",
                    'label': chunks[1]
                })
                # ind_ad = ad.Ad({'Time': datetime.strptime(chunks[1], "%Y-%m-%d %H:%M:%S.%f"), 'Title': chunks[2],
                #                 'URL': chunks[3], 'Body': chunks[4].rstrip(), 'cat': "", 'label': ""})
                adv[int(chunks[0])].add(ind_ad)
            except:
                pass
    r += 1
    par_adv.append({
        'adv': adv,
        'newsv': newsv,
        'ass': ass,
        'xf': xvfbfails,
        'interests': ints,
        'break': breakout,
        'loadtimes': loadtimes,
        'reloads': reloads,
        'errors': errors
    })
    sys.stdout.write(".Scanning complete\n")
    sys.stdout.flush()
    return par_adv, treatnames
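# A hedged usage sketch for this read_log() variant; the log file name is
# illustrative, and the dict keys come from the blocks built above.
# par_adv, treatnames = read_log("adfisher_run.log")
# for block in par_adv:
#     print len(block['adv']), "agents,", sum(block['reloads']), "reloads,", sum(block['errors']), "errors"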
import utils
import news
import plot
import visualizer

news = news.News()
plot = plot.IndexPlot()
user = visualizer.GithubUser()


class MessageHandler():
    def __init__(self, machine):
        self.machine = machine


class HallHandler(MessageHandler):
    help_message = ("Welcome to the hall, "
                    "here are some commands you can use:\n"
                    "\n"
                    "help\n"
                    "pop up the help message\n"
                    "\n"
                    "news\n"
                    "Go to bulletin board for breaking news\n"
                    "\n"
                    "vol\n"
                    "Plot the finance volatility")

    def handle(self, event, command):