def get_tweets_for_nonprofit(nonprofits_id):
    """Fetch tweets for one nonprofit and save them as Tweet rows.

    The highest tweet ID already stored under the nonprofit's Twitter name is
    handed to the Twitter helper as ``since_id`` (1 when nothing is stored).
    Tweets are requested by Twitter user ID when the nonprofit has one,
    otherwise by screen name; when it has neither, nothing is requested.
    """
    logger.debug('Inside get_tweets_for_nonprofit(nonprofit) for nonprofits_id {0}'.format(nonprofits_id))
    nonprofit = DBSession.query(Nonprofit).get(nonprofits_id)

    # The cast makes MAX compare numerically -- presumably tweet_id is stored
    # as text (it is filled from 'id_str' below); confirm against the model.
    newest_id = func.max(cast(Tweet.tweet_id, Integer)).label('max_tweet_id')
    newest_row = DBSession.query(newest_id).filter(
        Tweet.twitter_name == nonprofit.twitter_name).first()
    since_id = 1
    if newest_row is not None and newest_row.max_tweet_id is not None:
        since_id = newest_row.max_tweet_id

    # NOTE(review): the meaning of the positional True is defined by the
    # Twitter helper, not visible here.
    if nonprofit.twitter_id is not None:
        fetched = givinggraph.twitter.tweets.get_tweets_by_id(
            nonprofit.twitter_id, True, since_id=since_id)
    elif nonprofit.twitter_name is not None:
        fetched = givinggraph.twitter.tweets.get_tweets_by_name(
            nonprofit.twitter_name, True, since_id=since_id)
    else:
        fetched = []

    def joined(entries, key):
        # Flatten one field of a list of entity dicts into 'a, b, c'.
        return ', '.join([entry[key] for entry in entries])

    for tweet in fetched:
        row = Tweet(
            tweet['user']['screen_name'],
            tweet['id_str'],
            tweet['created_at'],
            tweet['text'].encode('utf-8'),
            tweet['lang'],
            tweet['retweet_count'],
            tweet['favorite_count'],
            joined(tweet['entities']['user_mentions'], 'id_str'),
            joined(tweet['entities']['user_mentions'], 'screen_name'),
            joined(tweet['entities']['hashtags'], 'text'),
            joined(tweet['entities']['urls'], 'expanded_url'),
            tweet['in_reply_to_screen_name'],
            tweet['in_reply_to_user_id_str'],
            tweet['in_reply_to_status_id_str'])
        DBSession.add(row)
    DBSession.commit()
def add_nonprofit_company_news_article_connections(article_ids, companies):
    """Link news articles to the companies they mention supportively.

    Each article is loaded by ID and its text is scanned for every company's
    name. A company is appended to ``article.companies`` as soon as one of
    its mentions passes ``news_parser.contains_supportive_wording``. All
    links are committed once, after every article has been processed.

    Article IDs that do not resolve to a row are logged and skipped.

    Args:
        article_ids: iterable of News_Article primary keys.
        companies: list of Company objects; ``len()`` is taken for the
            progress output.
    """
    logger.debug('Inside add_nonprofit_company_news_article_connections(news_articles, companies)')
    for article_id in article_ids:
        article = DBSession.query(News_Article).get(article_id)
        if article is None:
            # Nothing to scan; carrying on would fail on article.text below.
            logger.warning('News article {0} was not found in the DB; skipping it.'.format(article_id))
            continue
        for position, company in enumerate(companies, 1):
            if position % 100 == 0:
                print('Processing article {0} for company {1} of {2}...'.format(article_id, position, len(companies)))
            mentions = news_parser.get_company_mentions_in_text(
                article.text, company.name.encode('utf-8'))
            # any() stops at the first supportive mention, so a company is
            # linked at most once per article in this pass.
            if any(news_parser.contains_supportive_wording(mention) for mention in mentions):
                article.companies.append(company)
    DBSession.commit()
def add_guidestar_info_to_db(ein):
    """Insert or refresh the nonprofit with the given EIN from GuideStar.

    Returns None without writing when GuideStar has no record for the EIN.
    Otherwise the matching DB row is updated in place, or a new row is added
    when none exists, the change is committed, and the row is returned.
    """
    logger.debug('Inside add_guidestar_info_to_db({0})'.format(ein))
    existing = DBSession.query(Nonprofit).filter(Nonprofit.ein == ein).first()
    fetched = givinggraph.guidestar.search.get_nonprofit(ein)
    if fetched is None:
        return None

    if existing is not None:
        record = existing
        record.name = fetched.name
        record.ntee_code = fetched.ntee_code
        # The GuideStar mission text backs both mission and description.
        record.mission = fetched.mission
        record.description = fetched.mission
        record.city = fetched.city
        record.state = fetched.state
        record.ZIP = fetched.zip
    else:
        # Mission is passed twice and two slots are left as None --
        # presumably description and the Twitter fields; confirm against
        # the Nonprofit constructor.
        record = Nonprofit(
            fetched.name,
            fetched.ein,
            fetched.ntee_code,
            fetched.mission,
            fetched.mission,
            None,
            None,
            fetched.city,
            fetched.state,
            fetched.zip)
        DBSession.add(record)

    DBSession.commit()
    return record
def add_similarity_scores_for_nonprofit_tweets():
    """Recompute pairwise tweet-text similarity between Twitter accounts.

    All stored tweet text is concatenated per ``twitter_name``, every pair
    of accounts is scored, the existing similarity rows are deleted, and one
    row per unordered pair is stored and committed.
    """
    logger.debug('Inside add_similarity_scores_for_nonprofit_tweets()')
    combined_text = func.group_concat(Tweet.text).label('text')
    accounts = DBSession.query(Tweet.twitter_name, combined_text).group_by(Tweet.twitter_name).all()
    scores = similarity.get_similarity_scores_all_pairs([account.text for account in accounts])

    DBSession.query(Nonprofits_Similarity_By_Tweets).delete()

    size = len(scores)
    # Only the upper triangle (first < second) is stored: one row per pair.
    for first in xrange(size - 1):
        for second in xrange(first + 1, size):
            pair = Nonprofits_Similarity_By_Tweets(
                accounts[first].twitter_name,
                accounts[second].twitter_name,
                scores[first][second])
            DBSession.add(pair)
    DBSession.commit()
def add_similarity_scores_for_nonprofit_descriptions():
    """Recompute pairwise description similarity between nonprofits.

    Nonprofits with a non-null description are scored against each other,
    the existing similarity rows are deleted, and one row per unordered
    pair of nonprofit IDs is stored and committed.
    """
    logger.debug('Inside add_similarity_scores_for_nonprofit_descriptions()')
    described = DBSession.query(Nonprofit).filter(Nonprofit.description != None).all()  # nopep8
    texts = [entry.description for entry in described]
    scores = similarity.get_similarity_scores_all_pairs(texts)

    DBSession.query(Nonprofits_Similarity_By_Description).delete()

    size = len(scores)
    # Only the upper triangle (first < second) is stored: one row per pair.
    for first in xrange(size - 1):
        for second in xrange(first + 1, size):
            pair = Nonprofits_Similarity_By_Description(
                described[first].nonprofits_id,
                described[second].nonprofits_id,
                scores[first][second])
            DBSession.add(pair)
    DBSession.commit()
def add_news_articles_to_db_for_nonprofit(nonprofits_id):
    """Find new news articles about a nonprofit and store them.

    URLs of articles already stored for the nonprofit are passed to the
    searcher as ``urls_to_ignore``. Every article the searcher returns is
    saved, and the IDs of the newly stored articles are returned.
    """
    logger.debug('Inside add_news_articles_to_db_for_nonprofit(nonprofit) for nonprofits_id {0}'.format(nonprofits_id))
    nonprofit = DBSession.query(Nonprofit).get(nonprofits_id)

    stored = DBSession.query(News_Article).filter(News_Article.nonprofits_id == nonprofits_id).all()
    known_urls = [row.url for row in stored]

    found = news_searcher.find_news_articles(nonprofit.name, urls_to_ignore=known_urls)
    new_rows = [
        News_Article(nonprofit.nonprofits_id, hit.url, hit.headline, hit.body)
        for hit in found
    ]

    DBSession.add_all(new_rows)
    # IDs are read only after the commit, as in the original ordering.
    DBSession.commit()
    return [row.news_articles_id for row in new_rows]
def update_null_nonprofit_twitter_ids():
    """Fill in missing Twitter user IDs for nonprofits that have a Twitter name.

    Nonprofits with a null ``twitter_id`` and a non-null ``twitter_name``
    are resolved in one call to the Twitter helper. Names missing from the
    returned map are reported on stdout and left unchanged.
    """
    logger.debug('Inside update_null_nonprofit_twitter_ids()')
    without_id = DBSession.query(Nonprofit).filter(Nonprofit.twitter_id == None)  # nopep8
    candidates = without_id.filter(Nonprofit.twitter_name != None).all()  # nopep8

    ids_by_name = givinggraph.twitter.users.get_screen_name_to_id_map(
        [org.twitter_name for org in candidates])

    for org in candidates:
        # The map is looked up by lower-cased screen name.
        key = org.twitter_name.lower()
        if key not in ids_by_name:
            print('"{0}" was not found, the account may have been deleted or the screen name may have changed.'.format(org.twitter_name))
            continue
        org.twitter_id = ids_by_name[key]
    DBSession.commit()
def add_similarity_scores_for_nonprofit_descriptions():
    """Rebuild the description-similarity table for all described nonprofits.

    Scores every pair of nonprofits whose description is not null, deletes
    the existing rows, then stores one row per unordered pair and commits.
    """
    logger.debug('Inside add_similarity_scores_for_nonprofit_descriptions()')
    with_description = DBSession.query(Nonprofit).filter(Nonprofit.description != None)  # nopep8
    nonprofits = with_description.all()
    matrix = similarity.get_similarity_scores_all_pairs(
        [item.description for item in nonprofits])

    DBSession.query(Nonprofits_Similarity_By_Description).delete()

    count = len(matrix)
    for left in xrange(count - 1):
        # Pair each nonprofit only with those after it, so each pair is
        # stored once.
        for right in xrange(left + 1, count):
            DBSession.add(Nonprofits_Similarity_By_Description(
                nonprofits[left].nonprofits_id,
                nonprofits[right].nonprofits_id,
                matrix[left][right]))
    DBSession.commit()
def update_nonprofit_twitter_name(nonprofits_id):
    """Look up a nonprofit's Twitter screen name via Yahoo and store it.

    Searches Yahoo for ``'twitter ' + <nonprofit name>`` and inspects only
    the first result. If that result is a twitter.com URL with a non-empty
    first path segment, that segment is saved as the nonprofit's Twitter
    name and committed. If there are no results, or the first result is not
    such a URL, the nonprofit is left unchanged.

    Args:
        nonprofits_id: primary key of the Nonprofit to update.
    """
    logger.debug('Inside update_nonprofit_twitter_name(nonprofits_id) for nonprofits_id {0}'.format(nonprofits_id))
    nonprofit = DBSession.query(Nonprofit).get(nonprofits_id)
    search_results = givinggraph.yahoo.search.get_search_results('twitter ' + nonprofit.name)
    if not search_results:
        return

    url = search_results[0].replace('http://', '').replace('https://', '')
    if url.startswith('www.'):
        url = url[len('www.'):]

    # Compare the whole host: a bare prefix test would also accept hosts
    # that merely begin with 'twitter.com'.
    host, _, path = url.partition('/')
    if host.lower() != 'twitter.com':
        return

    # Keep only the first path segment, without query string or fragment.
    for separator in ('/', '?', '#'):
        path = path.partition(separator)[0]
    if not path:
        return

    nonprofit.twitter_name = path
    DBSession.commit()
def get_tweets_for_nonprofit(nonprofits_id):
    """Download a nonprofit's tweets and store each one in the DB.

    ``since_id`` is the largest tweet ID already stored for the nonprofit's
    Twitter name, or 1 when there is none. The Twitter user ID is preferred
    for the lookup, then the screen name; with neither, no tweets are
    fetched.
    """
    logger.debug('Inside get_tweets_for_nonprofit(nonprofit) for nonprofits_id {0}'.format(nonprofits_id))
    nonprofit = DBSession.query(Nonprofit).get(nonprofits_id)

    max_id_query = DBSession.query(
        func.max(cast(Tweet.tweet_id, Integer)).label('max_tweet_id'))
    stored_max = max_id_query.filter(
        Tweet.twitter_name == nonprofit.twitter_name).first()
    nothing_stored = stored_max is None or stored_max.max_tweet_id is None
    floor_id = 1 if nothing_stored else stored_max.max_tweet_id

    statuses = []
    # NOTE(review): the meaning of the positional True is defined by the
    # Twitter helper, not visible here.
    if nonprofit.twitter_id is not None:
        statuses = givinggraph.twitter.tweets.get_tweets_by_id(
            nonprofit.twitter_id, True, since_id=floor_id)
    elif nonprofit.twitter_name is not None:
        statuses = givinggraph.twitter.tweets.get_tweets_by_name(
            nonprofit.twitter_name, True, since_id=floor_id)

    for status in statuses:
        # Positional arguments in the order the Tweet constructor is called
        # with; list-valued entities are flattened to 'a, b, c' strings.
        fields = [
            status['user']['screen_name'],
            status['id_str'],
            status['created_at'],
            status['text'].encode('utf-8'),
            status['lang'],
            status['retweet_count'],
            status['favorite_count'],
            ', '.join([m['id_str'] for m in status['entities']['user_mentions']]),
            ', '.join([m['screen_name'] for m in status['entities']['user_mentions']]),
            ', '.join([tag['text'] for tag in status['entities']['hashtags']]),
            ', '.join([link['expanded_url'] for link in status['entities']['urls']]),
            status['in_reply_to_screen_name'],
            status['in_reply_to_user_id_str'],
            status['in_reply_to_status_id_str'],
        ]
        DBSession.add(Tweet(*fields))
    DBSession.commit()
def add_similarity_scores_for_nonprofit_tweets():
    """Rebuild the tweet-similarity table for all Twitter accounts.

    Tweet text is grouped and concatenated per ``twitter_name``, every pair
    of accounts is scored, the existing rows are deleted, and one row per
    unordered pair is stored and committed.
    """
    logger.debug('Inside add_similarity_scores_for_nonprofit_tweets()')
    per_account = DBSession.query(
        Tweet.twitter_name,
        func.group_concat(Tweet.text).label('text'))
    tweets = per_account.group_by(Tweet.twitter_name).all()
    matrix = similarity.get_similarity_scores_all_pairs(
        [entry.text for entry in tweets])

    DBSession.query(Nonprofits_Similarity_By_Tweets).delete()

    count = len(matrix)
    for left in xrange(count - 1):
        # Pair each account only with those after it, so each pair is
        # stored once.
        for right in xrange(left + 1, count):
            DBSession.add(Nonprofits_Similarity_By_Tweets(
                tweets[left].twitter_name,
                tweets[right].twitter_name,
                matrix[left][right]))
    DBSession.commit()
def add_news_articles_to_db_for_nonprofit(nonprofits_id):
    """Search for news about a nonprofit, store the hits, and return their IDs.

    Articles already stored for the nonprofit are excluded by passing their
    URLs to the searcher as ``urls_to_ignore``.
    """
    logger.debug('Inside add_news_articles_to_db_for_nonprofit(nonprofit) for nonprofits_id {0}'.format(nonprofits_id))
    nonprofit = DBSession.query(Nonprofit).get(nonprofits_id)

    existing_articles = DBSession.query(News_Article).filter(
        News_Article.nonprofits_id == nonprofits_id)
    seen_urls = []
    for existing in existing_articles.all():
        seen_urls.append(existing.url)

    additions = [
        News_Article(nonprofit.nonprofits_id, result.url, result.headline, result.body)
        for result in news_searcher.find_news_articles(nonprofit.name, urls_to_ignore=seen_urls)
    ]
    DBSession.add_all(additions)
    # IDs are read only after the commit, as in the original ordering.
    DBSession.commit()

    new_ids = []
    for added in additions:
        new_ids.append(added.news_articles_id)
    return new_ids
def update_null_nonprofit_twitter_ids():
    """Resolve Twitter user IDs for nonprofits that only have a screen name.

    Selects nonprofits whose ``twitter_id`` is null and ``twitter_name`` is
    not, asks the Twitter helper for a screen-name-to-ID map, and stores
    each ID found. Unresolved names are printed and skipped.
    """
    logger.debug('Inside update_null_nonprofit_twitter_ids()')
    pending = (DBSession.query(Nonprofit)
               .filter(Nonprofit.twitter_id == None)  # nopep8
               .filter(Nonprofit.twitter_name != None)  # nopep8
               .all())

    names = []
    for entry in pending:
        names.append(entry.twitter_name)
    name_to_id = givinggraph.twitter.users.get_screen_name_to_id_map(names)

    for entry in pending:
        # The map is looked up by lower-cased screen name.
        lowered = entry.twitter_name.lower()
        if lowered in name_to_id:
            entry.twitter_id = name_to_id[lowered]
            continue
        print('"{0}" was not found, the account may have been deleted or the screen name may have changed.'.format(entry.twitter_name))
    DBSession.commit()
def update_nonprofit_twitter_name(nonprofits_id):
    """Look up a nonprofit's Twitter screen name via Yahoo and store it.

    Searches Yahoo for ``'twitter ' + <nonprofit name>`` and inspects only
    the first result. If that result is a twitter.com URL with a non-empty
    first path segment, that segment is saved as the nonprofit's Twitter
    name and committed. If there are no results, or the first result is not
    such a URL, the nonprofit is left unchanged.

    Args:
        nonprofits_id: primary key of the Nonprofit to update.
    """
    logger.debug('Inside update_nonprofit_twitter_name(nonprofits_id) for nonprofits_id {0}'.format(nonprofits_id))
    nonprofit = DBSession.query(Nonprofit).get(nonprofits_id)
    search_results = givinggraph.yahoo.search.get_search_results('twitter ' + nonprofit.name)
    if not search_results:
        return

    url = search_results[0].replace('http://', '').replace('https://', '')
    if url.startswith('www.'):
        url = url[len('www.'):]

    # Compare the whole host: a bare prefix test would also accept hosts
    # that merely begin with 'twitter.com'.
    host, _, path = url.partition('/')
    if host.lower() != 'twitter.com':
        return

    # Keep only the first path segment, without query string or fragment.
    for separator in ('/', '?', '#'):
        path = path.partition(separator)[0]
    if not path:
        return

    nonprofit.twitter_name = path
    DBSession.commit()
def add_nonprofit_company_news_article_connections(article_ids, companies):
    """Link news articles to the companies they mention supportively.

    Each article is loaded by ID and its text is scanned for every company's
    name. A company is appended to ``article.companies`` as soon as one of
    its mentions passes ``news_parser.contains_supportive_wording``. All
    links are committed once, after every article has been processed.

    Article IDs that do not resolve to a row are logged and skipped.

    Args:
        article_ids: iterable of News_Article primary keys.
        companies: list of Company objects; ``len()`` is taken for the
            progress output.
    """
    logger.debug('Inside add_nonprofit_company_news_article_connections(news_articles, companies)')
    for article_id in article_ids:
        article = DBSession.query(News_Article).get(article_id)
        if article is None:
            # Nothing to scan; carrying on would fail on article.text below.
            logger.warning('News article {0} was not found in the DB; skipping it.'.format(article_id))
            continue
        for position, company in enumerate(companies, 1):
            if position % 100 == 0:
                print('Processing article {0} for company {1} of {2}...'.format(article_id, position, len(companies)))
            mentions = news_parser.get_company_mentions_in_text(
                article.text, company.name.encode('utf-8'))
            # any() stops at the first supportive mention, so a company is
            # linked at most once per article in this pass.
            if any(news_parser.contains_supportive_wording(mention) for mention in mentions):
                article.companies.append(company)
    DBSession.commit()