def test_get_url_info_1(connection):
    '''
    tests that get_url_info can handle very long urls
    '''
    # FIXME: the database should be parameterized somehow
    #engine = sqlalchemy.create_engine('postgresql:///novichenkobot')
    #connection = engine.connect()
    url = 'https://www.nytimes.com/' + 'absurd_path' * 1000
    # the test passes as long as no exception is raised
    get_url_info(connection, url)

def test_get_url_info_2(connection):
    '''
    tests that get_url_info returns the same id_urls when called twice
    on a new randomly generated url
    '''
    # FIXME: the database should be parameterized somehow
    #engine = sqlalchemy.create_engine('postgresql:///novichenkobot')
    #connection = engine.connect()
    url = 'https://www.example.com/' + str(random.randrange(100000000000))
    url_info1 = get_url_info(connection, url)
    url_info2 = get_url_info(connection, url)
    assert url_info1['id_urls'] == url_info2['id_urls']

def test_get_url_info_3(connection):
    '''
    tests that get_url_info can handle a url built from a fully
    specified url_info dictionary via urlinfo2url
    '''
    url_info = {
        'scheme': 'https',
        'hostname': 'media1.faz.net',
        'port': -1,
        'path': '/ppmedia/aktuell/wirtschaft/2406223479/1.6105689/article_aufmacher_klein/ist-amazon-zu-maechtig.jpg',
        'params': '',
        'query': '',
        'fragment': '',
        'other': '',
        'depth': 7,
        }
    url = urlinfo2url(url_info)
    # FIXME: the database should be parameterized somehow
    #engine = sqlalchemy.create_engine('postgresql:///novichenkobot')
    #connection = engine.connect()
    # the test passes as long as no exception is raised
    get_url_info(connection, url)

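# One way to resolve the FIXME in the tests above is a pytest fixture that
# supplies the connection, with the database url taken from an environment
# variable. This is only a sketch, not part of the original codebase: the
# NOVICHENKOBOT_DB variable name is an assumption, and the default matches
# the url in the commented-out code above.
import os

import pytest
import sqlalchemy


@pytest.fixture
def connection():
    db = os.environ.get('NOVICHENKOBOT_DB', 'postgresql:///novichenkobot')
    engine = sqlalchemy.create_engine(db)
    connection = engine.connect()
    yield connection
    connection.close()
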
def insert_tweet(connection, tweet):
    '''
    Inserts the tweet into the database.

    Args:
        connection: a sqlalchemy connection to the postgresql db
        tweet: a dictionary representing the json tweet object
    '''
    # skip tweet if already inserted
    sql = sqlalchemy.sql.text('''
        SELECT id_tweets
        FROM twitter.tweets
        WHERE id_tweets = :id_tweets
        ''')
    res = connection.execute(sql, {
        'id_tweets': tweet['id'],
        })
    if res.first() is not None:
        return

    # insert tweet
    with connection.begin() as trans:

        ########################################
        # create/update the user
        if tweet['user']['url'] is None:
            tweet_user_url = {}
        else:
            tweet_user_url = get_url_info(connection, tweet['user']['url'])

        sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.users
                (id_users,created_at,updated_at,screen_name,name,location,id_urls,hostname,description,protected,verified,friends_count,listed_count,favourites_count,statuses_count,withheld_in_countries)
                VALUES
                (:id_users,:created_at,:updated_at,:screen_name,:name,:location,:id_urls,:hostname,:description,:protected,:verified,:friends_count,:listed_count,:favourites_count,:statuses_count,:withheld_in_countries)
            ON CONFLICT (id_users) DO UPDATE SET
                created_at=:created_at,
                updated_at=:updated_at,
                screen_name=:screen_name,
                name=:name,
                location=:location,
                id_urls=:id_urls,
                hostname=:hostname,
                description=:description,
                protected=:protected,
                verified=:verified,
                friends_count=:friends_count,
                listed_count=:listed_count,
                favourites_count=:favourites_count,
                statuses_count=:statuses_count,
                withheld_in_countries=:withheld_in_countries
                WHERE :updated_at > twitter.users.updated_at
                   OR twitter.users.updated_at is null
            ''')
        res = connection.execute(sql, {
            'id_users': tweet['user']['id'],
            'created_at': tweet['user']['created_at'],
            'updated_at': tweet['created_at'],
            'screen_name': remove_nulls(tweet['user']['screen_name']),
            'name': remove_nulls(tweet['user']['name']),
            'location': remove_nulls(tweet['user']['location']),
            'id_urls': tweet_user_url.get('id_urls', None),
            'hostname': tweet_user_url.get('hostname', None),
            'description': remove_nulls(tweet['user']['description']),
            'protected': tweet['user']['protected'],
            'verified': tweet['user']['verified'],
            'friends_count': tweet['user']['friends_count'],
            'listed_count': tweet['user']['listed_count'],
            'favourites_count': tweet['user']['favourites_count'],
            'statuses_count': tweet['user']['statuses_count'],
            'withheld_in_countries': tweet['user'].get('withheld_in_countries', None),
            })

        ########################################
        # extract geolocation information as WKT: a POINT when the tweet
        # has exact coordinates, otherwise a MULTIPOLYGON built from the
        # place's bounding box; both stay None when neither is available
        geo_coords = None
        geo_str = None
        try:
            #geo = [[ (tweet['geo']['coordinates'][1],tweet['geo']['coordinates'][0]) ]]
            geo_coords = str(tweet['geo']['coordinates'][0]) + ' ' + str(tweet['geo']['coordinates'][1])
            geo_str = 'POINT'
        except TypeError:
            try:
                geo_coords = '('
                for i, poly in enumerate(tweet['place']['bounding_box']['coordinates']):
                    if i > 0:
                        geo_coords += ','
                    geo_coords += '('
                    for j, point in enumerate(poly):
                        geo_coords += str(point[0]) + ' ' + str(point[1]) + ','
                    geo_coords += str(poly[0][0]) + ' ' + str(poly[0][1])
                    geo_coords += ')'
                geo_coords += ')'
                geo_str = 'MULTIPOLYGON'
            # 'place' is null (TypeError) or missing (KeyError) when the
            # tweet carries no place information
            except (KeyError, TypeError):
                if tweet['user']['geo_enabled']:
                    raise ValueError('could not find geolocation information')

        # extended tweets store their full text in a different location
        try:
            text = tweet['extended_tweet']['full_text']
        except KeyError:
            text = tweet['text']

        try:
            country_code = tweet['place']['country_code'].lower()
        except TypeError:
            country_code = None

        if country_code == 'us':
            state_code = tweet['place']['full_name'].split(',')[-1].strip().lower()
            if len(state_code) > 2:
                state_code = None
        else:
            state_code = None

        try:
            place_name = tweet['place']['full_name']
        except TypeError:
            place_name = None

        # create unhydrated references
        if tweet.get('in_reply_to_user_id', None) is not None:
            sql = sqlalchemy.sql.text('''
                INSERT INTO twitter.users
                    (id_users,screen_name)
                    VALUES
                    (:id_users,:screen_name)
                ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_users': tweet['in_reply_to_user_id'],
                'screen_name': tweet['in_reply_to_screen_name'],
                })

        # insert the tweet
        sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.tweets
                (id_tweets,id_users,created_at,in_reply_to_status_id,in_reply_to_user_id,quoted_status_id,geo,retweet_count,quote_count,favorite_count,withheld_copyright,withheld_in_countries,place_name,country_code,state_code,lang,text,source)
                VALUES
                (:id_tweets,:id_users,:created_at,:in_reply_to_status_id,:in_reply_to_user_id,:quoted_status_id,ST_GeomFromText(:geo_str || '(' || :geo_coords || ')'),:retweet_count,:quote_count,:favorite_count,:withheld_copyright,:withheld_in_countries,:place_name,:country_code,:state_code,:lang,:text,:source)
            ON CONFLICT DO NOTHING;
            ''')
        res = connection.execute(sql, {
            'id_tweets': tweet['id'],
            'id_users': tweet['user']['id'],
            'created_at': tweet['created_at'],
            'in_reply_to_status_id': tweet.get('in_reply_to_status_id', None),
            'in_reply_to_user_id': tweet.get('in_reply_to_user_id', None),
            'quoted_status_id': tweet.get('quoted_status_id', None),
            'geo_coords': geo_coords,
            'geo_str': geo_str,
            'retweet_count': tweet.get('retweet_count', None),
            'quote_count': tweet.get('quote_count', None),
            'favorite_count': tweet.get('favorite_count', None),
            'withheld_copyright': tweet.get('withheld_copyright', None),
            'withheld_in_countries': tweet.get('withheld_in_countries', None),
            'place_name': place_name,
            'country_code': country_code,
            'state_code': state_code,
            'lang': tweet.get('lang'),
            'text': remove_nulls(text),
            'source': remove_nulls(tweet.get('source', None)),
            })

        ########################################
        # insert urls mentioned in the tweet
        try:
            urls = tweet['extended_tweet']['entities']['urls']
        except KeyError:
            urls = tweet['entities']['urls']

        for url in urls:
            url_info = get_url_info(connection, url['expanded_url'])
            if url_info is None:
                id_urls = None
                hostname = None
            else:
                id_urls = url_info['id_urls']
                hostname = url_info['hostname']
            sql = sqlalchemy.sql.text('''
                INSERT INTO twitter.tweet_urls
                    (id_tweets,id_urls,hostname)
                    VALUES
                    (:id_tweets,:id_urls,:hostname)
                ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_tweets': tweet['id'],
                'id_urls': id_urls,
                'hostname': hostname,
                })

        ########################################
        # insert mentions, creating unhydrated user references as needed
        try:
            mentions = tweet['extended_tweet']['entities']['user_mentions']
        except KeyError:
            mentions = tweet['entities']['user_mentions']

        for mention in mentions:
            sql = sqlalchemy.sql.text('''
                INSERT INTO twitter.users
                    (id_users,name,screen_name)
                    VALUES
                    (:id_users,:name,:screen_name)
                ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_users': mention['id'],
                'name': remove_nulls(mention['name']),
                'screen_name': remove_nulls(mention['screen_name']),
                })

            sql = sqlalchemy.sql.text('''
                INSERT INTO twitter.tweet_mentions
                    (id_tweets,id_users)
                    VALUES
                    (:id_tweets,:id_users)
                ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_tweets': tweet['id'],
                'id_users': mention['id'],
                })

        ########################################
        # insert hashtags and cashtags
        try:
            hashtags = tweet['extended_tweet']['entities']['hashtags']
            cashtags = tweet['extended_tweet']['entities']['symbols']
        except KeyError:
            hashtags = tweet['entities']['hashtags']
            cashtags = tweet['entities']['symbols']

        tags = ['#' + hashtag['text'] for hashtag in hashtags] \
             + ['$' + cashtag['text'] for cashtag in cashtags]

        for tag in tags:
            sql = sqlalchemy.sql.text('''
                INSERT INTO twitter.tweet_tags
                    (id_tweets,tag)
                    VALUES
                    (:id_tweets,:tag)
                ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_tweets': tweet['id'],
                'tag': remove_nulls(tag),
                })

        ########################################
        # insert media attached to the tweet
        try:
            media = tweet['extended_tweet']['extended_entities']['media']
        except KeyError:
            try:
                media = tweet['extended_entities']['media']
            except KeyError:
                media = []

        for medium in media:
            url_info = get_url_info(connection, medium['media_url'])
            sql = sqlalchemy.sql.text('''
                INSERT INTO twitter.tweet_media
                    (id_tweets,id_urls,hostname,type)
                    VALUES
                    (:id_tweets,:id_urls,:hostname,:type)
                ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_tweets': tweet['id'],
                'id_urls': url_info['id_urls'],
                'hostname': url_info['hostname'],
                'type': medium['type'],
                })

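# A minimal usage sketch for insert_tweet, assuming tweets are stored one
# json object per line; the helper name and the file format are assumptions
# for illustration, not part of the original codebase.
import json


def insert_tweets_from_file(connection, path):
    '''
    Reads a newline-delimited json file of tweets and inserts each one.
    '''
    with open(path) as f:
        for line in f:
            insert_tweet(connection, json.loads(line))
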
def parse(self, response):
    # only parse html pages
    if not isinstance(response, HtmlResponse):
        return

    # process the downloaded webpage
    article = html2article(response.url, response.body)
    all_links = self.le.extract_links(response)

    # insert article into database
    with self.connection.begin() as trans:

        # insert into articles table
        url_info = get_url_info(
            self.connection,
            article.canonical_link,
            depth=response.request.depth,
            )
        id_urls_canonical = url_info['id_urls']
        sql = sqlalchemy.sql.text('''
            INSERT INTO articles
                (id_urls,hostname,id_urls_canonical,id_responses,title,text,html,lang,pub_time)
                values
                (:id_urls,:hostname,:id_urls_canonical,:id_responses,:title,:text,:html,:lang,:pub_time)
            returning id_articles
            ''')
        res = self.connection.execute(sql, {
            'id_urls': response.request.id_urls,
            'hostname': response.request.hostname,
            'id_urls_canonical': id_urls_canonical,
            'id_responses': response.id_responses,
            'title': article.title,
            'text': article.text,
            'html': article.article_html,
            'lang': article.lang,
            'pub_time': article.publish_date,
            })
        id_articles = res.first()[0]

        # update keywords table
        keywords_lang = self.keywords.get(article.lang, [])
        alltext_lower = article.alltext.lower()
        text_lower = article.text.lower()
        title_lower = article.title.lower()
        keywords_alltext = sum([alltext_lower.count(keyword) for keyword in keywords_lang])
        keywords_text = sum([text_lower.count(keyword) for keyword in keywords_lang])
        keywords_title = sum([title_lower.count(keyword) for keyword in keywords_lang])
        sql = sqlalchemy.sql.text('''
            INSERT INTO keywords
                (id_articles,keyword,num_title,num_text,num_alltext)
                VALUES
                (:id_articles,:keyword,:num_title,:num_text,:num_alltext)
            ''')
        res = self.connection.execute(sql, {
            'id_articles': id_articles,
            'keyword': 'north korea',
            'num_title': keywords_title,
            'num_text': keywords_text,
            'num_alltext': keywords_alltext,
            })

        # update authors table
        for author in article.authors:
            sql = sqlalchemy.sql.text('''
                INSERT INTO authors
                    (id_articles,author)
                    values
                    (:id_articles,:author)
                ''')
            self.connection.execute(sql, {
                'id_articles': id_articles,
                'author': author,
                })

        # update refs table
        refs = []
        refs.append([article.top_image, 'top_image', ''])
        for url in article.images:
            refs.append([url, 'image', ''])
        for url in article.movies:
            refs.append([url, 'movie', ''])
        for url in all_links:
            refs.append([url.url, 'link', url.text])

        for (url, url_type, text) in refs:
            target_url_info = get_url_info(
                self.connection,
                url,
                depth=response.request.depth + 1,
                )
            if target_url_info is not None:
                target = target_url_info['id_urls']
                sql = sqlalchemy.sql.text('''
                    insert into refs
                        (source,target,type,text)
                        values
                        (:source,:target,:type,:text);
                    ''')
                self.connection.execute(sql, {
                    'source': id_articles,
                    'target': target,
                    'type': url_type,
                    'text': text[:2048],
                    })

    # yield all links
    for link in all_links:
        # calculate keywords in link text
        keywords_link = sum([link.text.count(keyword) for keyword in keywords_lang])

        # create the request
        r = scrapy.http.Request(url=link.url)
        r.priority = 10000 * keywords_link + 100 * keywords_title + keywords_text + keywords_alltext
        r.meta.update(link_text=link.text)
        r.depth = response.request.depth + 1
        yield r

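# A hedged sketch of running the spider standalone with scrapy's
# CrawlerProcess, assuming this parse method lives on GeneralSpider (the
# class imported by name in the seed-loading script below). Whether
# GeneralSpider needs extra constructor arguments (e.g. a db url) is not
# confirmed by this file, so treat this as illustrative only.
from scrapy.crawler import CrawlerProcess


def run_spider():
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(GeneralSpider)
    process.start()
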
sys.path.append('.')
from Novichenko.Bot.sqlalchemy_utils import get_url_info, insert_request
from Novichenko.Bot.spiders.general_spider import GeneralSpider

# create database connection
engine = sqlalchemy.create_engine(args.db)
connection = engine.connect()

# add seeds
with open(args.csv) as f:
    reader = csv.DictReader(f)
    for row in reader:

        # extract hostname from row
        url_info = get_url_info(connection, row['URL'])
        hostname = url_info['hostname']
        #if hostname[:4]=='www.':
            #hostname=hostname[4:]
        print(f'adding {hostname}')

        # add seed URLs into the frontier
        if args.add_seeds and row['PRIORITY'] != 'ban':
            urls = [
                #'http://'+hostname+'/'+url_info['path'],
                #'http://www.'+hostname+'/'+url_info['path'],
                #'https://'+hostname+'/'+url_info['path'],
                #'https://www.'+hostname+'/'+url_info['path'],
                ]
            if len(url_info['path']) > 1:
                urls += [