def run_from_sqlite(start=0,antall=5): """ Now this function only targets the links sqlite3 db that have status NULL start parameter should then always be 0 """ con = None try: lite_con = lite.connect('nrk2013_2.db') lite_cur = lite_con.cursor() # fetch links that do not have a status yet lite_sql = 'SELECT * FROM links WHERE status is NULL ORDER BY date DESC LIMIT %s,%s' % (start,antall) #print lite_sql lite_cur.execute(lite_sql) # 'SELECT SQLITE_VERSION()' rows = lite_cur.fetchall() #fetchone() # sjekk om url er scrapet i MYSQL db'n connection, cur = connect() # loop through SQLite set for row in rows: #print row[0], row[1], row[2], row[3] # 1 er url, 2 er tidspunkt for innsamling # check if url is in mysql as url or url_self_link (either is fine) # rewrite to only check for primary url (not self_utl) mysql_query = "SELECT * FROM page WHERE url = '%s' " % (row[1]) # OR url_self_link = '%s' cur.execute(mysql_query) mysql_rows = cur.fetchall() #print len(mysql_rows), row[1] if len(mysql_rows) == 0: # hvis 0 -> sett inn logger.info( "finnes, ikke, må settes inn: %s" % (row[1].encode('utf8')) ) #print row[1] sleep(scraping_request_stagger) # found in settings status = create_dictionary(row[1]) # I aleardy DO commit in rdbms_insertion.py # does this help? connection.commit() # updata sqlite3 with status logger.info( "status: %s" % status ) lite_cur.execute("UPDATE links SET status=? WHERE page=? AND link=?", (status, row[0], row[1])) lite_con.commit() else: # does exist in mysql, but has status NULL in .db # update row in .db lite_cur.execute("UPDATE links SET status=? WHERE page=? AND link=?", ("scraped", row[0], row[1])) lite_con.commit() logger.info( "fantes allarede i mysql, oppdaterer sqlite3: %s" % (row[1]) ) lite_con.close() # close sqlite3 connection.close() # close mysql except lite.Error, e: print "Error %s:" % e.args[0] sys.exit(1)
def ui():
    """Handler for the send button: read the url field and submit it."""
    # Grab whatever the user typed into the url input widget.
    entered_url = inputUrl.get()
    # Hand the value off to the backend for processing
    # (a send1-style function forwarding the message).
    result = connect_mysql.connect(entered_url)
    print(result)
import pandas as pd
import numpy as np
from sqlalchemy.orm import sessionmaker

from connect_mysql import connect

# create a configured "Session" class bound to the project's engine
Session = sessionmaker(bind=connect())


def get_unique_df_values(dataframe, col_name):
    """Return the unique values of *col_name* in *dataframe* as a list."""
    return dataframe[col_name].unique().tolist()


def read_csv(file_name):
    """Read a product csv and map the nutrition grade onto integers.

    '-' cells and NaN cells are normalized to None first; the letter
    grades 'a'..'e' become 1..5 and a missing grade becomes 6.

    :param file_name: path of the csv file to load
    :return: the transformed DataFrame
        (bug fix: the original built the frame but never returned it,
        so callers always got None)
    """
    df_original = pd.read_csv(file_name)
    df_original = df_original.replace({"-": None})
    df_original = df_original.replace({np.nan: None})
    # Here we map the nutrition grade into integers.
    grade = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, None: 6}
    df_original['nutrition_grade_fr'] = [
        grade[item] for item in df_original['nutrition_grade_fr']
    ]
    return df_original
def add_to_db(dict):
    """Insert one scraped NRK article into the MySQL database.

    Writes the article row itself plus its authors, factboxes and
    internal/external links, using the prepared statements read from the
    insertion*.sql files, then commits once at the end.

    NOTE(review): the parameter name shadows the builtin ``dict``; it is
    the article dictionary produced by the scraper (keys such as 'url',
    'authors', 'factbox', 'internal_links', 'external_links', ...).
    """
    # <this is moved to run.py>
    # (A 'tldextract' logger setup used to live here: a DEBUG file handler
    # to spam.log, an ERROR console handler, and a shared formatter.)

    # <added local logger>
    rdbms_logger = logging.getLogger('nrk2013')  # nrk2013.rdbms_insertion

    # Load the prepared SQL statements from disk, one file per table.
    with open('insertion.sql', 'r') as f:
        insertion = f.read()
    with open('insertion_link.sql', 'r') as f:
        insertion_link = f.read()
    with open('insertion_author.sql', 'r') as f:
        insertion_author = f.read()
    with open('insertion_factbox.sql', 'r') as f:
        insertion_factbox = f.read()

    # connection info in connect_mysql.py
    connection, cur = connect()

    # try/except commented out here while tracking down an error...
    #try:
    # We must be careful with authors, because NRK does not always manage
    # to remember them. See e.g.
    # http://www.nrk.no/livsstil/test-av-norges-mest-solgte-brod-1.8352163,
    # which has three authors.
    for author in dict['authors']:
        # Normalize falsy values (empty string etc.) to None so the DB
        # gets real NULLs instead of empty strings.
        authorName = author[0]
        if(authorName):
            authorName = authorName
        else:
            authorName = None
        authorMail = author[1]
        if(authorMail):
            authorMail = authorMail
        else:
            authorMail = None
        authorRole = author[2]
        if(authorRole):
            authorRole = authorRole
        else:
            authorRole = None
        # Skip the row only when all three fields are missing.
        # NOTE(review): '&' is bitwise-and on bools here; it works, but
        # 'and' would be the idiomatic spelling.
        if not ((authorName is None) & (authorMail is None) & (authorRole is None)):
            #print authorName, authorMail, authorRole
            # insert
            cur.execute(insertion_author,
                        (dict['url'],  # dict['url_self_link']
                         authorName,
                         authorMail,
                         authorRole))
            # import sys
            # sys.exit(0)
        # else:
        #     print authorName, authorMail,authorRole

    # One row per factbox: link count, word count and the raw text.
    for box in dict['factbox']:
        cur.execute(insertion_factbox,
                    (dict['url'],  # dict['url_self_link']
                     len(box['links']),
                     box['wordcount'],
                     box['text'].encode('utf-8')))

    # Internal links: split the domain with tldextract
    # (extr = subdomain, domain, suffix); final '1' flags "internal".
    for link in dict['internal_links']:
        extr = tldextract.extract(link)
        cur.execute(insertion_link.encode('utf-8'),
                    (dict['url'].encode('utf-8'),  # dict['url_self_link']
                     link.encode('utf-8'),
                     u"html",
                     extr[0].encode('utf-8'),
                     extr[1].encode('utf-8'),
                     extr[2].encode('utf-8'),
                     '1'.encode('utf-8')))

    # External links: same shape, final '0' flags "external".
    for link in dict['external_links']:
        extr = tldextract.extract(link)
        cur.execute(insertion_link.encode('utf-8'),
                    (dict['url'].encode('utf-8'),  # url_self_link
                     link.encode('utf-8'),
                     u"html",
                     extr[0].encode('utf-8'),
                     extr[1].encode('utf-8'),
                     extr[2].encode('utf-8'),
                     '0'.encode('utf-8')))

    # 'published' is either the literal string "NULL" or a time struct;
    # convert the latter to a MySQL datetime string.
    published = dict['published']
    if(published != "NULL"):
        published = datetime.fromtimestamp(mktime(dict['published'])).strftime("%Y-%m-%d %H:%M:%S")
    #updated = ""
    timestamp = dict['timestamp'].strftime("%Y-%m-%d %H:%M:%S")
    # print dict['updated']
    # if dict['updated']:
    #     updated = (" ".join(elem for elem in dict['updated'])).replace('.', '-') + ":00"
    # else:
    #     pass
    #     #updated = published
    # print updated

    # The main article row; column order must match insertion.sql.
    cur.execute(insertion,
                (dict['url'],
                 dict['url_self_link'],
                 dict['headline'],  #.encode('utf-8'),
                 dict['body'],  #.encode('utf-8'),
                 published,
                 dict['updated'],  #updated,
                 timestamp,
                 dict['fb_like'],
                 dict['fb_share'],
                 dict['googleplus_share'],
                 dict['twitter_share'],
                 dict['others_share'],
                 dict['language'],  #.encode('utf-8'),
                 dict['lesbahet'],
                 dict['news_bureau'],  #"NA", # news bureau not captured yet
                 len(dict['external_links']),
                 len(dict['internal_links']),
                 dict['word_count'],
                 dict['line_count'],
                 dict['char_count'],
                 len(dict['factbox']),
                 dict['comment_fields'],
                 dict['comment_number'],
                 dict['interactive_elements'],  #"interactive_elements IS NOT DONE",
                 dict['poll'],  #"NOT DONE",
                 dict['game'],  #"NOT DONE",
                 dict['video_files'],
                 dict['video_files_nrk'],
                 dict['flash_file'],
                 dict['image_collection'],
                 dict['images'],
                 dict['image_captions'],  # .encode('utf-8'),
                 dict['related_stories'],
                 dict['related_stories_box_thematic'],  #"related_stories_box_thematic IS NOT DONE",
                 dict['related_stories_box_les'],  #"related_stories_box_les IS NOT DONE",
                 dict['map'],  # map IS NOT DONE
                 dict['publiseringssted'],
                 dict['programtilknytning'],
                 dict['hovedkategori'],
                 dict['iframe'],
                 dict['css'],
                 dict['js'],
                 dict['template']))

    connection.commit()
    return
    #except:
    #    print "hva?! SLutten av rdbms_insertion.py. hadde ikke ventet å komme hit. noe "
    #    rdbms_logger.error("DB insert feilet!")
        # Tail of the enclosing class's __init__ (its definition starts
        # before this chunk): plain attribute assignments from the
        # constructor arguments.
        self.url = url
        self.nutrition_grade = nutrition_grade
        self.energy = energy
        self.proteins = proteins


class Favorite(Base):
    """ORM model for the 'favorites' table: a saved product together with
    the store where it was found."""

    __tablename__ = 'favorites'

    id = Column(Integer, primary_key=True)
    product_name = Column(String(500))
    bar_code = Column(String(1500))
    url = Column(String(2500))
    # presumably 1 ('a') .. 6 (missing), matching the csv grade mapping
    # used elsewhere in the project -- TODO confirm
    nutrition_grade = Column(Integer)
    energy = Column(Float)
    proteins = Column(Float)
    store_name = Column(String(200))

    def __init__(self, product_name, bar_code, url, nutrition_grade,
                 energy, proteins, store_name):
        self.product_name = product_name
        self.bar_code = bar_code
        self.url = url
        self.nutrition_grade = nutrition_grade
        self.energy = energy
        self.proteins = proteins
        self.store_name = store_name


# Create every mapped table on the engine returned by connect().
Base.metadata.create_all(connect())
def add_to_db(dict):
    """Insert one scraped NRK article into the MySQL database.

    Writes the article row itself plus its authors, factboxes and
    internal/external links, using the prepared statements read from the
    insertion*.sql files, then commits once at the end.

    NOTE(review): this appears to be a second, reformatted copy of the
    same function found earlier in the file; the parameter name shadows
    the builtin ``dict`` and holds the scraper's article dictionary
    (keys such as 'url', 'authors', 'factbox', 'internal_links', ...).
    """
    # <this is moved to run.py>
    # (A 'tldextract' logger setup used to live here: a DEBUG file handler
    # to spam.log, an ERROR console handler, and a shared formatter.)

    # <added local logger>
    rdbms_logger = logging.getLogger('nrk2013')  # nrk2013.rdbms_insertion

    # Load the prepared SQL statements from disk, one file per table.
    with open('insertion.sql', 'r') as f:
        insertion = f.read()
    with open('insertion_link.sql', 'r') as f:
        insertion_link = f.read()
    with open('insertion_author.sql', 'r') as f:
        insertion_author = f.read()
    with open('insertion_factbox.sql', 'r') as f:
        insertion_factbox = f.read()

    # connection info in connect_mysql.py
    connection, cur = connect()

    # try/except commented out here while tracking down an error...
    #try:
    # We must be careful with authors, because NRK does not always manage
    # to remember them. See e.g.
    # http://www.nrk.no/livsstil/test-av-norges-mest-solgte-brod-1.8352163,
    # which has three authors.
    for author in dict['authors']:
        # Normalize falsy values (empty string etc.) to None so the DB
        # gets real NULLs instead of empty strings.
        authorName = author[0]
        if (authorName):
            authorName = authorName
        else:
            authorName = None
        authorMail = author[1]
        if (authorMail):
            authorMail = authorMail
        else:
            authorMail = None
        authorRole = author[2]
        if (authorRole):
            authorRole = authorRole
        else:
            authorRole = None
        # Skip the row only when all three fields are missing.
        # NOTE(review): '&' is bitwise-and on bools here; it works, but
        # 'and' would be the idiomatic spelling.
        if not ((authorName is None) & (authorMail is None) & (authorRole is None)):
            #print authorName, authorMail, authorRole
            # insert
            cur.execute(
                insertion_author,
                (
                    dict['url'],  # dict['url_self_link']
                    authorName,
                    authorMail,
                    authorRole))
            # import sys
            # sys.exit(0)
        # else:
        #     print authorName, authorMail,authorRole

    # One row per factbox: link count, word count and the raw text.
    for box in dict['factbox']:
        cur.execute(
            insertion_factbox,
            (
                dict['url'],  # dict['url_self_link']
                len(box['links']),
                box['wordcount'],
                box['text'].encode('utf-8')))

    # Internal links: split the domain with tldextract
    # (extr = subdomain, domain, suffix); final '1' flags "internal".
    for link in dict['internal_links']:
        extr = tldextract.extract(link)
        cur.execute(
            insertion_link.encode('utf-8'),
            (
                dict['url'].encode('utf-8'),  # dict['url_self_link']
                link.encode('utf-8'),
                u"html",
                extr[0].encode('utf-8'),
                extr[1].encode('utf-8'),
                extr[2].encode('utf-8'),
                '1'.encode('utf-8')))

    # External links: same shape, final '0' flags "external".
    for link in dict['external_links']:
        extr = tldextract.extract(link)
        cur.execute(
            insertion_link.encode('utf-8'),
            (
                dict['url'].encode('utf-8'),  # url_self_link
                link.encode('utf-8'),
                u"html",
                extr[0].encode('utf-8'),
                extr[1].encode('utf-8'),
                extr[2].encode('utf-8'),
                '0'.encode('utf-8')))

    # 'published' is either the literal string "NULL" or a time struct;
    # convert the latter to a MySQL datetime string.
    published = dict['published']
    if (published != "NULL"):
        published = datetime.fromtimestamp(mktime(
            dict['published'])).strftime("%Y-%m-%d %H:%M:%S")
    #updated = ""
    timestamp = dict['timestamp'].strftime("%Y-%m-%d %H:%M:%S")
    # print dict['updated']
    # if dict['updated']:
    #     updated = (" ".join(elem for elem in dict['updated'])).replace('.', '-') + ":00"
    # else:
    #     pass
    #     #updated = published
    # print updated

    # The main article row; column order must match insertion.sql.
    cur.execute(
        insertion,
        (
            dict['url'],
            dict['url_self_link'],
            dict['headline'],  #.encode('utf-8'),
            dict['body'],  #.encode('utf-8'),
            published,
            dict['updated'],  #updated,
            timestamp,
            dict['fb_like'],
            dict['fb_share'],
            dict['googleplus_share'],
            dict['twitter_share'],
            dict['others_share'],
            dict['language'],  #.encode('utf-8'),
            dict['lesbahet'],
            dict['news_bureau'],  #"NA", # news bureau not captured yet
            len(dict['external_links']),
            len(dict['internal_links']),
            dict['word_count'],
            dict['line_count'],
            dict['char_count'],
            len(dict['factbox']),
            dict['comment_fields'],
            dict['comment_number'],
            dict['interactive_elements'],  #"interactive_elements IS NOT DONE",
            dict['poll'],  #"NOT DONE",
            dict['game'],  #"NOT DONE",
            dict['video_files'],
            dict['video_files_nrk'],
            dict['flash_file'],
            dict['image_collection'],
            dict['images'],
            dict['image_captions'],  # .encode('utf-8'),
            dict['related_stories'],
            dict[
                'related_stories_box_thematic'],  #"related_stories_box_thematic IS NOT DONE",
            dict[
                'related_stories_box_les'],  #"related_stories_box_les IS NOT DONE",
            dict['map'],  # map IS NOT DONE
            dict['publiseringssted'],
            dict['programtilknytning'],
            dict['hovedkategori'],
            dict['iframe'],
            dict['css'],
            dict['js'],
            dict['template']))

    connection.commit()
    return