Example #1
0
 def run(self):
     """Consume articles from the queue and upload each article's thumbnail
     as an (unpublished) photo to the Facebook page, then persist the photo
     id and rendered image urls back to the article row in the database.

     Runs until a POISON sentinel arrives or stop_running() is called.
     """
     print("thread post article to page already started")
     try:
         db_thread = None
         next_article = None
         count = 0
         while self.is_running:
             try:
                 next_article = self.queue.get(True)  # blocking get
                 if (POISON == next_article):
                     self.stop_running()
                     break
                 # Skip articles without a usable thumbnail url. Some feeds
                 # (e.g. bloomberg) produce urls that make the Graph API fail
                 # with "(#100) picture URL is not properly formatted".
                 if next_article.thumbnail_url is None or len(next_article.thumbnail_url) < 8:
                     continue
                 # Protocol-relative url ("//host/...") -> make it absolute.
                 if next_article.thumbnail_url.startswith("//"):
                     next_article.thumbnail_url = "http:" + next_article.thumbnail_url

                 # Upload the photo and fetch its rendered image urls.
                 result_photo = None
                 result_photo_urls = None
                 try:
                     count += 1
                     # Round-robin across tokens to spread API quota usage.
                     api = facebook.GraphAPI(self.tokens[count % len(self.tokens)])
                     result_photo = api.put_photo_url_2_page(next_article.short_description, next_article.thumbnail_url, profile_id="me", published=False)
                     post = api.get_object(id=result_photo['id'], fields="images")
                     result_photo_urls = str(post['images'])
                     print("photo_Id: " + result_photo['id'])
                 except Exception as e:
                     print("Cannot post photo " + next_article.thumbnail_url + " to FB page {}".format(e))

                 if result_photo is None or result_photo['id'] is None:
                     print("Cannot post " + next_article.url + " to FB page")
                 else:
                     # Save the FB photo id and image urls to the database.
                     try:
                         try:
                             db_thread = IIIDatbaseConnection()
                             db_thread.init_database_cont()
                             print("update fb thumnail database result: ")
                             # result_photo is known non-None in this branch.
                             print(db_thread.update_article_fbid_photo(next_article.url, result_photo_urls, result_photo['id']))
                         finally:
                             # Guard: the constructor may have raised before
                             # db_thread was (re)assigned this iteration.
                             if db_thread is not None:
                                 db_thread.close_database_cont()
                     except Exception as e:
                         print("Error save FB id to database: {}".format(e))

             except Exception as e:
                 print("Error when post article to FB pages: {}".format(e))
             sleep(0.8)
         print("thread post article to FB stopped")
     except Exception as db_e:
         print("Error database thread post to Facebook: {}".format(db_e))
 def run(self):
     """Periodically scan recently-updated articles and flag near-duplicates.

     Two articles count as duplicates when they share the same title, were
     published within DUPLICATION_TIME_POSISIBLE of each other, and their
     weighted title/content cosine similarity exceeds 0.56. The older of the
     pair is marked duplicated in the database.
     """
     db_thread = None  # initialized first so the finally-clause is safe
     try:
         db_thread = IIIDatbaseConnection()
         db_thread.init_database_cont()
         running_time = 0
         while self.is_running:
             running_time += 1
             print("======================= Running time" + str(running_time) + "   ==============================")
             cur = db_thread.cursor()
             # Only compare articles whose statistics were refreshed in the
             # last 4 hours (constant query; no user input in the SQL).
             cur.execute("SELECT id,text, updated_time, title, category_id FROM articles WHERE is_duplicated = 0 AND UNIX_TIMESTAMP() - last_update_statistic < " + str(4 * 60 * 60))
             articles_set = cur.fetchall()
             print(len(articles_set))
             for article1 in articles_set:
                 for article2 in articles_set:
                     try:
                         # Cheap filters first: the cosine computation is
                         # expensive, so reject self-pairs, different titles
                         # and far-apart timestamps before computing it.
                         if (article1[0] == article2[0]
                                 or article1[3] != article2[3]
                                 or abs(article1[2] - article2[2]) >= DUPLICATION_TIME_POSISIBLE):
                             continue
                         similarity_content = get_consine_text(article1[1], article2[1])
                         similarity_title = get_consine_text(normalize_text(article1[3]), normalize_text(article2[3]))
                         # Content weighted 3:2 over title.
                         similarity = (3 * similarity_content + 2 * similarity_title) / 5
                         if similarity > 0.56:
                             print(similarity)
                             print("gotta delete")
                             print(article1[3] + "   ======================     " + article2[3])
                             # Mark the older article as the duplicate.
                             if int(article1[2]) < int(article2[2]):
                                 duplicated_id = article1[0]
                             else:
                                 duplicated_id = article2[0]
                             print(db_thread.update_article_duplicated(duplicated_id, True))
                     except Exception as e:
                         print("Error when get cosine and delete: {}".format(e))
             cur.close()
             print("finished: " + str(running_time))
             time.sleep(UPDATE_COUNT_PERIOD)
     except Exception as db_e:
         print("Error database duplicate: {}".format(db_e))
     finally:
         # Close the connection before exit; db_thread is None when the
         # constructor raised (the original code hit a NameError here).
         if db_thread is not None:
             db_thread.close_database_cont()
Example #3
0
    def run(self):
        """Consume URLs from the queue and refresh their Facebook
        like/share/comment counts in the database.

        Stops when a POISON (or None) sentinel is dequeued or when
        stop_running() is called externally.
        """
        db_thread = None  # initialized first so the finally-clause is safe
        try:
            db_thread = IIIDatbaseConnection()
            db_thread.init_database_cont()
            while self.is_running:
                try:
                    next_url = self.queue.get(True)  # blocking get
                    if next_url is None or next_url == POISON:
                        self.stop_running()
                        break
                    if not db_thread.should_update_statisics(next_url):
                        continue
                    # ESPN article urls embed the article id; strip the path
                    # tail after "_/id/<id>" so variants of the same article
                    # aggregate their stats on one canonical url.
                    param_url = next_url
                    if "http://espn.go.com/" in next_url:
                        try:
                            param_url = next_url.split("_/id/")[0] + "_/id/" + next_url.split("_/id/")[1].split("/")[0]
                        except Exception as e:
                            print("cannot split espn url")
                            param_url = next_url
                    fb_param = dict(method='links.getStats',
                                    urls=urllib.parse.quote(param_url, safe=''),
                                    format='json')
                    fb_resp = requests.get(url=FB_REST_API, params=fb_param)
                    data = json.loads(fb_resp.text)[0]
                    print(db_thread.update_article_count(next_url, data['comment_count'],
                                                         data['share_count'], data['like_count'], data['comments_fbid'], 0))
                except Exception as e:
                    print("Error when get FB, TW like comment: {}".format(e))

        except Exception as db_e:
            print("Error database thread get like_share count: {}".format(db_e))
        finally:
            # db_thread is None when construction/init failed (the original
            # code risked a NameError here).
            if db_thread is not None:
                db_thread.close_database_cont()
        print("commentlikeshare thread stoppped")
Example #4
0
                 
             
             
             
             
              
               
# Startup banner for the Washington Post scraper (adjacent string literals
# are concatenated at compile time; output is identical to the original).
print('...................................................\n'
      '...................................................\n'
      '...................................................\n'
      'start get articles from washington post'
      '...................................................\n'
      '...................................................\n')
try:
    db_connect = IIIDatbaseConnection()
    db_connect.init_database_cont()     
                  
                  
                  
    ''' we process homepage'''
    for home_page in washington_post_home_pages:
        print("extracting: " + home_page)
        washington_page = requests.get(home_page)
        html_tree = html.fromstring(washington_page.text)
        article_urls = html_tree.xpath('//a/@href')
        for home_url in article_urls:
            if  home_url is not None and len(home_url) > 16: 
                if ('http://' not in home_url and 'https://' not in home_url):
                    home_url = WASHINGTON_POST + home_url
                try:
Example #5
0
@author: Vu Trong Hoa
'''
from pymysql.err import MySQLError

#from DatabaseConnectionLib import IIIDatbaseConnection
from iiidatabase.DatabaseConnectionLib import IIIDatbaseConnection
# Maps CNN site section names to internal category ids; the catch-all
# "more" section is bucketed under "others".
cnn_category = dict(
    sport='sport',
    world='world',
    tech='tech',
    entertainment='entertainment',
    opinions='opinions',
    more='others',
)
try:
    db_connect = IIIDatbaseConnection()
    db_connect.init_database_cont()
    #start query everything here
    #print(db_connect.insert_category('category3', 'Category3'))
    #     print(db_connect.insert_source('usa_today', 'USA Today', 'http://www.usatoday.com'))
    #print(db_connect.insert_source2('vnexpress', 'VNEpress', 'http://vnexpress.net','http://graph.facebook.com/612208245488345/picture?type=large'))
    print(
        db_connect.insert_source2(
            'huffington_usa', 'The Huffington Post',
            'http://www.huffingtonpost.com/',
            'graph.facebook.com/18468761129/picture?type=large'))
    #print(db_connect.insert_article('http://this_is_first_bbc_url', 'this is funny article',
    #                             '43252', 'bbc_uk', 'entertainment', 10, 10, 10, True, False))

    #print(db_connect.update_article_like('http://this_is_first_bbc_url', 100))
    #print(db_connect.is_url_existed('http://this_is_first_bbc_urldf'))
    def _send_stats_batch(self, urlparams):
        """Send one comma-joined batch of percent-encoded urls to the FB
        links.getStats endpoint and persist each returned count row.
        Per-article database errors are logged, never raised."""
        fb_param = dict(method='links.getStats',
                        urls=urlparams,
                        format='json')
        fb_resp = requests.get(url=FB_REST_API, params=fb_param)
        for data in json.loads(fb_resp.text):
            try:
                save_cont = IIIDatbaseConnection()
                save_cont.init_database_cont()
                print(
                    save_cont.update_article_count(
                        urllib.parse.unquote(data['url']),
                        data['comment_count'],
                        data['share_count'],
                        data['like_count'],
                        data['comments_fbid'], 0))
            except Exception as e:
                print(
                    "Error when update count for article: {}"
                    .format(e))
            finally:
                # Best-effort close; save_cont may be unbound if the
                # constructor itself raised.
                try:
                    save_cont.close_database_cont()
                except Exception:
                    pass

    def run(self):
        """Main loop: every UPDATE_COUNT_PERIOD seconds, load articles whose
        FB statistics are stale, delete articles older than DELETE_PERIOD,
        and refresh like/share/comment counts in batches of 10 urls per
        Graph API call.

        Bug fix vs. the original: a trailing partial batch (fewer than 10
        urls left when the loop ends) is now flushed instead of dropped —
        previously those articles were never updated.
        """
        try:
            running_time = 0
            while self.is_running:
                try:
                    running_time += 1
                    print("======================= Running time" +
                          str(running_time) +
                          "   ==============================")
                    all_urls = {}
                    try:
                        load_db_cont = IIIDatbaseConnection()
                        load_db_cont.init_database_cont()
                        cur = load_db_cont.cursor()
                        # Stale = not refreshed for 25 min, and young enough
                        # (under AGE_GET_UPDATE) to still be worth updating.
                        query = "SELECT id, url, updated_time, UNIX_TIMESTAMP() - updated_time FROM articles "\
                                    "WHERE (UNIX_TIMESTAMP() - last_time_update) > " + str(25 * 60) \
                                    + " AND UNIX_TIMESTAMP() - updated_time < " + str(AGE_GET_UPDATE)
                        print(query)
                        cur.execute(query)
                        all_urls = cur.fetchall()
                    except Exception as e:
                        print("Cannot read database {}".format(e))
                    finally:
                        # cur/load_db_cont may be unbound on early failure.
                        try:
                            cur.close()
                            load_db_cont.close_database_cont()
                        except Exception:
                            pass

                    print(len(all_urls))
                    round_count = 0
                    urlparams = ""
                    for r in all_urls:
                        if not self.is_running:
                            break
                        try:
                            next_url = r[1]
                            age = int(time.time()) - int(r[2])
                            if age > DELETE_PERIOD:
                                # Too old: purge the article entirely.
                                try:
                                    delelte_cont = IIIDatbaseConnection()
                                    delelte_cont.init_database_cont()
                                    delelte_cont.delete_article(r[0])
                                except Exception as e:
                                    print(
                                        "Error when delete old articles: {}"
                                        .format(e))
                                finally:
                                    try:
                                        delelte_cont.close_database_cont()
                                    except Exception:
                                        pass
                                continue
                            if age > AGE_GET_UPDATE:
                                print("do not need to update this")
                                continue
                            round_count += 1
                            urlparams += urllib.parse.quote(next_url,
                                                            safe='') + ","
                            if round_count == 10:
                                round_count = 0  # reset count
                                self._send_stats_batch(urlparams)
                                urlparams = ""
                        except Exception as e:
                            urlparams = ""
                            print("Error when get FB, TW like comment: {}".
                                  format(e))
                    # Flush the trailing partial batch (previously dropped).
                    if urlparams:
                        try:
                            self._send_stats_batch(urlparams)
                        except Exception as e:
                            print("Error when get FB, TW like comment: {}".
                                  format(e))
                    all_urls = None  # release the fetched rows early
                except Exception as db_e:
                    print("Error database thread get like_share count: {}".
                          format(db_e))
                print("finsihed one round, now take sleep")
                time.sleep(UPDATE_COUNT_PERIOD)
        except Exception as db_e:
            print(
                "Error database thread get like_share count: {}".format(db_e))
    def run(self):
        """Scan each source's Facebook page feed, extract article urls from
        the story/link/message text of every post, and record the FB post id
        against the matching article row in the database."""
        print("thread post article to page already started")
        db_thread = None  # initialized first so the finally-clause is safe
        try:
            db_thread = IIIDatbaseConnection()
            while self.is_running:
                try:
                    for source_id in sources_page_id:
                        print("=============== sourceid: " + source_id +
                              "   ===============")
                        api = get_page_api(TOKEN, '1661911627411850')
                        print("get post on each source")
                        posts = api.get_object(id=source_id + '/posts')
                        for post in posts['data']:
                            try:
                                # Concatenate the post's textual fields.
                                string_post_content = ''
                                if 'story' in post:
                                    string_post_content += post['story'] + " "
                                if 'link' in post:
                                    string_post_content += " " + post['link'] + " "
                                    print(post['link'])
                                if 'message' in post:
                                    string_post_content += post['message']
                                    print(post['message'])
                                # string_post_content is always a str here, so
                                # extract urls unconditionally (the original's
                                # "is not None" guard was always true).
                                urls = re.findall(r'(https?://\S+)',
                                                  string_post_content)

                                if urls is None or len(urls) <= 0:
                                    print("not found url")
                                else:  # save id to database
                                    for url in urls:
                                        try:
                                            db_thread.init_database_cont()
                                            url = utils.normalize_url(url)
                                            print("postId: " + post['id'] +
                                                  " : " + url)
                                            print(
                                                db_thread.update_article_fbid(
                                                    url, post['id']))
                                        except Exception as e:
                                            print(
                                                "Error when save facebook post id to db: {}"
                                                .format(e))
                                        finally:
                                            # Always release the connection —
                                            # the original leaked it when
                                            # update_article_fbid raised.
                                            try:
                                                db_thread.close_database_cont()
                                            except Exception:
                                                pass
                            except Exception as e:
                                print(
                                    "Error when get url from facebook post: {}"
                                    .format(e))
                except Exception as e:
                    print(
                        "Error when get url from facebook post: {}".format(e))
                time.sleep(2 * UPDATE_COUNT_PERIOD)
        except Exception as db_e:
            print("Error database thread post to Facebook: {}".format(db_e))
        finally:
            # Close connection db before exit; guard against a failed ctor.
            if db_thread is not None:
                db_thread.close_database_cont()
Example #8
0
    except Exception as dbE:
        print("Error when insert article to db. {}".format(dbE))
    # after insert to database, we put this url to get share, comment, like
    url_sharelikecomment_queue.put(normalized_url, True)



# Startup banner for the VnExpress scraper (adjacent string literals are
# concatenated at compile time; output is identical to the original).
print('...................................................\n'
      '...................................................\n'
      '...................................................\n'
      'start get articles from vnexpress'
      '...................................................\n'
      '...................................................\n')
try:
    db_connect = IIIDatbaseConnection()
    db_connect.init_database_cont()
    
    

    
#     ''' we process homepage news'''
#     VNEXPRESS_HOMPAGE_NEWS = 'http://vnexpress.net/tin-tuc/thoi-su/'
#     vnexpress_homepage_news = requests.get(VNEXPRESS_HOMPAGE_NEWS)
#     html_tree = html.fromstring(vnexpress_homepage_news.text)
#     article_urls = html_tree.xpath('//a/@href')
#     for home_url in article_urls:
#         if  home_url is not None and len(home_url) > 16 : 
#             if ('http://' not in home_url and 'https://' not in home_url):
#                 home_url = VNEXPRESS_HOME + home_url
#             try: