import logging
import sys
import time
from threading import Thread

import geograpy
import nltk
from bson.objectid import ObjectId
from gensim.corpora import WikiCorpus
from gensim.models.doc2vec import TaggedDocument
from newspaper import Article

# Project-local modules; their import paths are assumed from how they are used below.
# TwitterDev and NewsConverter (used as type hints further down) are also project
# classes and are assumed to be imported elsewhere in the original file.
import got  # GetOldTweets-style scraper exposing got.manager.TweetCriteria / TweetManager
import database
import news_database
import train_database
import thesis_logging


def getData():
    """Scrape historical CNN tweets and store their article links in the news collection."""
    tweetCriteria = (got.manager.TweetCriteria()
                     .setUsername("CNN")
                     .setSince("2013-01-01")
                     .setUntil("2017-01-01")
                     .setMaxTweets(15000))
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    news_collection = database.news_collection()
    for tweet in tweets:
        urls = tweet.urls
        # Skip tweets without links or whose links point back to Twitter.
        if 'twitter' in urls or not urls:
            continue
        if news_collection.find({'reference': urls}).count() > 0:
            print('Skip duplicated ' + urls)
            continue
        timestamp = int(time.mktime(time.strptime(tweet.formatted_date,
                                                  '%a %b %d %H:%M:%S +0000 %Y')))
        document = {
            'id': tweet.id,
            'created_at': timestamp,
            'reference': tweet.urls
        }
        print('Insert ' + tweet.urls + ' created at ' + str(timestamp))
        news_collection.insert_one(document)
def build_tagged():
    """Yield a TaggedDocument per unique article text, for Doc2Vec training."""
    logger = thesis_logging.get_logger('preprocess')
    latest = 0
    count = 0
    index = 1
    news_collection = database.news_collection()
    duplicated_doc = {}
    while True:
        # '$gt' (instead of the original '$gte') keeps the last processed document
        # from matching again forever, which would prevent the generator from finishing.
        documents = news_collection.find({'created_at': {'$gt': latest}})
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['created_at']
                if not doc.get('text'):
                    print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']),
                          str(doc['created_at']), doc['reference'])
                    continue
                content = doc['text']
                if content not in duplicated_doc:
                    duplicated_doc[content] = True
                    index += 1
                    logger.info(nltk.word_tokenize(content.lower()))
                    yield TaggedDocument(words=nltk.word_tokenize(content.lower()), tags=[index])
            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))
def write_web_news():
    logger = thesis_logging.get_logger('preprocess')
    link_list = ['cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs', 'wapo.st', 'abcn.ws',
                 'nbcbay.com', 'bbc.in', 'huff.to', 'ti.me', 'cbsn.ws', 'huffingtonpost.com',
                 'cnb.cx', 'cnnmon.ie', 'huffp.st', 'forbes.com', 'telegraph.co', 'cnn.com',
                 'trib.al', 'express.co', 'gu.com', 'bloom.bg', 'hill.cm', 'natgeo.com',
                 'pbs.org', 'washingtonpost', 'news.sky.com']
    for source in link_list:
        # latest = ObjectId("59abbfedf296532f80d18a47")  # dyncorp
        # latest = ObjectId("59abc7e2f296532ad483f4b6")  # lds
        # latest = ObjectId("59acc20df296533c88dbaed6")  # tm
        latest = ObjectId("5942946efe43ad1da80b1a79")  # news
        index = 0
        path_file = './datasets/insensitive/news/' + source.replace('.', '_') + '_'
        train_collection = news_database.news_collection()
        duplicated_doc = {}
        while True:
            documents = train_collection.find({
                '_id': {'$gt': latest},
                'reference': {'$regex': '.*' + source + '.*'}
            })
            if documents.count() == 0:
                break
            for doc in documents:
                try:
                    latest = doc['_id']
                    if not doc.get('text'):
                        # print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']),
                        #       str(doc['created_at']), doc['reference'])
                        continue
                    content = doc['text']
                    if len(content) < 1000:
                        # logger.info('Ignore small content, Count ' + str(count))
                        continue
                    title = doc['title']
                    if len(title) > 60:
                        title = title[0:60]
                    title = "".join(x for x in title if x.isalnum())
                    if content not in duplicated_doc:
                        duplicated_doc[content] = True
                        index += 1
                        # logger.info(nltk.word_tokenize(content.lower()))
                        with open(path_file + title + '.txt', 'w', encoding="utf-8") as doc_file:
                            doc_file.write(doc['text'])
                except Exception as e:
                    logger.error(doc['reference'] + ' : ' + str(e))
        print(source, index)
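
# write_web_news() writes into './datasets/insensitive/news/' and assumes that
# directory already exists. A small optional helper (a sketch, not part of the
# original pipeline) that creates it up front using the standard library:
def ensure_output_dir(path='./datasets/insensitive/news/'):
    import os
    os.makedirs(path, exist_ok=True)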
def mergeDB():
    """Copy fully parsed news documents into the training collection, skipping duplicates."""
    news_collection = database.news_collection()
    train_collection = train_database.train_collection()
    latest = ObjectId("5942946efe43ad1da80b1a79")
    count = 0
    index = 1
    while True:
        documents = news_collection.find({'_id': {'$gt': latest}}).limit(100)
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['_id']
                if not doc.get('text'):
                    print('Skip', doc['reference'])
                    continue
                if train_collection.find({'reference': doc['reference']}).count() > 0:
                    print('Skip duplicated reference ' + doc['reference'])
                    continue
                if train_collection.find({'text': doc['text']}).count() > 0:
                    print('Skip duplicated text ' + doc['reference'])
                    continue
                document = {
                    'id': doc['id'],
                    'created_at': doc['created_at'],
                    'reference': doc['reference'],
                    'title': doc['title'],
                    'text': doc['text'],
                    'image': doc['image'],
                }
                print('Insert ' + doc['reference'] + ' created at ' + str(doc['created_at']))
                train_collection.insert_one(document)
            except Exception as e:
                print(e)
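
# The duplicate checks in getData(), mergeDB(), and the feed functions repeatedly
# look documents up by 'reference', 'text', and 'created_at'. A minimal optional
# sketch of backing indexes, assuming database.news_collection() and
# train_database.train_collection() return standard pymongo Collection objects
# (as their find()/insert_one() usage suggests):
def ensure_indexes():
    database.news_collection().create_index('reference')
    database.news_collection().create_index('created_at')
    train_database.train_collection().create_index('reference')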
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
# The module-level logger is not defined in this excerpt; a standard one is assumed here.
logger = logging.getLogger(__name__)
logger.info("running %s" % ' '.join(sys.argv))

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
enwiki_bin_location = 'training/metawiki-20170401-pages-articles.xml.bz2'
enwiki_txt_location = 'training/wiki-documents.txt'
doc2vec_model_location = 'model/doc2vec-model-temp-300.bin'
word2vec_model_location = 'model/word2vec-model.bin'
doc2vec_vectors_location = 'model/doc2vec-vectors.bin'
clustering_model_location = 'model/clustering_model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

train_collection = train_database.train_collection()
news_collection = news_database.news_collection()


# Build the word2vec model from the corpus
# doc2vec.build_vocab(taggedDocuments)
def build_wiki_text():
    """Extract plain text from the Wikipedia dump, writing one article per line."""
    i = 0
    wiki = WikiCorpus(enwiki_bin_location, lemmatize=False, dictionary={})
    with open(enwiki_txt_location, 'w+', encoding="utf-8") as output:
        for text in wiki.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            i = i + 1
            if i % 10000 == 0:
                logger.info("Saved " + str(i) + " articles")
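
# The commented-out build_vocab call above hints at the Doc2Vec training step.
# A minimal sketch of that step, assuming gensim's Doc2Vec and the build_tagged()
# generator defined earlier; the window, min_count, worker, and epoch values are
# illustrative assumptions, not values taken from the original code.
def train_doc2vec():
    from gensim.models.doc2vec import Doc2Vec

    # Materialise the corpus once so it can be iterated for both vocab building and training.
    tagged_docs = list(build_tagged())
    # Older gensim releases use size= instead of vector_size=.
    doc2vec = Doc2Vec(vector_size=doc2vec_dimensions, window=8, min_count=5, workers=4)
    doc2vec.build_vocab(tagged_docs)
    doc2vec.train(tagged_docs, total_examples=doc2vec.corpus_count, epochs=10)
    doc2vec.save(doc2vec_model_location)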
def locate_feeds(news_converter: NewsConverter, latest: int = 0):
    """Parse stored article links, extract locations/people/organisations and a doc vector."""
    global crawler_finish
    logger = thesis_logging.get_logger('locator')
    news_collection = database.news_collection()

    class VectorConverter(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.vector = []

        def run(self):
            self.vector = news_converter.convert_doc_to_vector(self.text).tolist()

    class GeographyExtractor(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.places = []
            self.people = []
            self.organs = []

        def run(self):
            context = geograpy.get_place_context(text=self.text)
            self.places = context.places
            self.people = context.people
            self.organs = context.organs

    class PageParser(Thread):
        def __init__(self, tweet_id, url, collection):
            super().__init__()
            self.tweet_id = tweet_id
            self.url = url
            self.collection = collection

        def run(self):
            try:
                logger.info('Parse ' + self.url)
                article = Article(self.url)
                article.download()
                if article.download_exception_msg and "404" in article.download_exception_msg:
                    logger.error('404 not found, delete... ' + self.url)
                    news_collection.remove({"id": self.tweet_id})
                    return
                article.parse()
                ignore_list = ["twitter.com", "youtube.com", "facebook.com", "instagram.com"]
                if any(x in article.canonical_link for x in ignore_list):
                    print('delete ' + article.canonical_link)
                    news_collection.remove({"id": self.tweet_id})
                    return
                logger.info('Title for ' + article.top_image + ' - ' + article.canonical_link +
                            '\n' + article.title + '\n\n')
                logger.info('Latest: ' + str(latest))
                vector_converter = VectorConverter(article.text)
                geography_extractor = GeographyExtractor(article.text)
                vector_converter.start()
                geography_extractor.start()
                geography_extractor.join()
                vector_converter.join()
                vector = vector_converter.vector
                news_collection.update_one({'id': self.tweet_id}, {
                    '$set': {
                        'places': geography_extractor.places,
                        'people': geography_extractor.people,
                        'organs': geography_extractor.organs,
                        'vector': vector,
                        'title': article.title,
                        'text': article.text,
                        'image': article.top_image
                    }
                })
                for place in geography_extractor.places:
                    self.collection.update_one({'place': place}, {'$inc': {'count': 1}}, upsert=True)
            except Exception as e:
                logger.error(str(e))

    location_collection = database.location_collection()
    duplicate_urls = {}
    tasks = []
    while True:
        documents = news_collection.find({'created_at': {'$gte': latest}}).limit(100)
        logger.info('Found ' + str(documents.count()) + ' after ' + str(latest))
        # Clean up remaining tasks
        if len(tasks) != 0:
            logger.info('Cleaning up remaining tasks')
            for task in tasks:
                task.join()
            tasks.clear()
        # With '$gte', a count of 1 means only the last already-processed document matched,
        # i.e. nothing new has arrived yet.
        if documents.count() == 1:
            if crawler_finish:
                break
            logger.warn('Nap and back in 500 seconds')
            time.sleep(500)
            continue
        logger.info('Start Locating')
        index = 0
        for doc in documents:
            try:
                ref = doc['reference']
                latest = doc['created_at']
                image = doc.get('image')
                if image is not None:
                    # Already parsed (image field set), skip.
                    logger.info('image skip')
                    continue
                if news_collection.find({'reference': ref}).count() > 1:
                    logger.info('delete duplicate ' + ref)
                    news_collection.remove({"id": doc['id']})
                    continue
                thread = PageParser(doc['id'], ref, location_collection)
                tasks.append(thread)
                thread.start()
                time.sleep(7)
                index += 1
                if index % 5 == 0:
                    for task in tasks:
                        task.join()
                    tasks.clear()
            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))
def crawl_feeds(dev: TwitterDev, duration: int = 0):
    """Stream tweets from followed accounts and store links to whitelisted news sites."""
    global crawler_finish
    logger = thesis_logging.get_logger('crawler')
    while True:
        try:
            if dev is None:
                logger.error('There is no Twitter developer account detected.')
                return
            news_collection = database.news_collection()
            logger.info('ok')
            user_id = dev.api.VerifyCredentials()
            logger.info('Twitter Auth: ' + str(user_id.AsJsonString()))
            friends = dev.api.GetFriendIDs(user_id, stringify_ids=True)
            logger.info('Friends: ' + str(friends))
            logger.info('Start crawling')
            start = int(round(time.time()) * 1000)
            link_list = ['cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs', 'wapo.st', 'abcn.ws',
                         'nbcbay.com', 'bbc.in', 'huff.to', 'ti.me', 'cbsn.ws', 'huffingtonpost.com',
                         'cnb.cx', 'cnnmon.ie', 'huffp.st', 'forbes.com', 'telegraph.co', 'cnn.com',
                         'trib.al', 'express.co', 'gu.com', 'bloom.bg', 'hill.cm', 'natgeo.com',
                         'pbs.org', 'washingtonpost', 'news.sky.com']
            ignore_list = ['bit.ly', 'twitter', 'tinyurl', 'goo.gl', 'facebook.com']
            duplicate_urls = {}
            for status in dev.api.GetStreamFilter(follow=friends):
                urls = status['entities']['urls']
                if len(urls) == 0:
                    continue
                url = urls[0]['expanded_url']
                if url is None:
                    continue
                if not any(x in url for x in link_list):
                    logger.info('Skip link ' + url)
                    continue
                if news_collection.find({'reference': url}).count() > 0:
                    logger.info('Skip duplicated ' + url)
                    continue
                timestamp = int(time.mktime(time.strptime(status['created_at'],
                                                          '%a %b %d %H:%M:%S +0000 %Y')))
                document = {
                    'id': status['id'],
                    'created_at': timestamp,
                    'reference': url
                }
                news_collection.insert_one(document)
                logger.info('Insert ' + url + ' created at ' + str(timestamp))
                # if duration != 0 and int(round(time.time()) * 1000) - start > duration:
                #     break
        except Exception as e:
            logger.error(e)
        finally:
            # crawler_finish = True
            logger.info('Finish crawling')
            logger.info('Sleeping 5s to start again...')
            time.sleep(5)
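
# A minimal sketch of how the crawler and locator could be wired together on
# separate threads. TwitterDev and NewsConverter instances are assumed to be
# constructed elsewhere in the project (their constructors are not shown in this
# file), and crawler_finish is assumed to be the module-level flag both functions
# declare as global; it is initialised here only for completeness.
crawler_finish = False


def run_pipeline(dev, news_converter):
    # The crawler runs as a daemon so the process can exit once the locator stops.
    crawler = Thread(target=crawl_feeds, args=(dev,), daemon=True)
    locator = Thread(target=locate_feeds, args=(news_converter,))
    crawler.start()
    locator.start()
    locator.join()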