def get_top(table='secwiki', column='domain', time=2020, top=10):
    """
    Take the top entries for a pie chart.
    :param table:
    :param column:
    :param time:
    :param top:
    :return type: dict
    :return value: percentage (top N domains + 'other')
    """
    so = SQLite("data/secwiki.db")
    sql = "select {column},count(url) as ct from {table} \
           where ts like '%{time}%' \
           group by {column} \
           order by ct DESC".format(column=column, table=table, time=time)
    r = so.query(sql)
    od = OrderedDict()
    for i in r:
        od[i[0]] = i[1]
    od_pec = dict()
    total = sum(od.values())
    i = 0
    for k, v in od.items():
        if i < top:
            od_pec[k] = round(v / total, 4)
        else:
            break
        i = i + 1
    od_pec['other'] = round(1 - sum(od_pec.values()), 4)
    return od_pec
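# A minimal usage sketch for get_top(), assuming data/secwiki.db is populated
# and the SQLite wrapper above is importable; the output formatting is
# illustrative only.
if __name__ == '__main__':
    shares = get_top(table='secwiki', column='domain', time=2020, top=10)
    for domain, pct in shares.items():
        print('{0}: {1:.2%}'.format(domain, pct))  # e.g. "github.com: 12.34%"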
def __init__(self):
    # ==== Required vars ===== #
    self.stdin_path = '/dev/null'
    self.stdout_path = '/dev/null'
    self.stderr_path = '/dev/null'
    # self.pidfile_path = '/var/run/toidaemon/toidaemon.pid'
    self.pidfile_path = PID_FILE_PATH
    self.pidfile_timeout = 5
    # ========================= #
    self.db_name = DB_PATH
    self.db = SQLite(self.db_name)
    self.table = self.db.get(ToiScraper.TABLE_NAME)
    print("Initializing...")
    if not self.table:
        print("No table found with name {0}. Creating it.".format(
            ToiScraper.TABLE_NAME))
        self.table = self.db.create(ToiScraper.TABLE_NAME,
                                    ToiScraper.TABLE_SCHEMA)
    else:
        if not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
            error_str = "Table {0} exists but with incorrect schema".format(
                ToiScraper.TABLE_NAME)
            print(error_str)
            raise Exception(error_str)
    self.iter_date = self._get_init_date_full()
def setup(cls):
    """ Makes sure we have a database """
    if cls.did_setup:
        return

    # init the database if needed
    if not os.path.exists(DB_FILE):
        # make sure the parent directory exists
        if len(os.path.dirname(DB_FILE)) > 0 \
                and not os.path.exists(os.path.dirname(DB_FILE)):
            try:
                os.makedirs(os.path.dirname(DB_FILE))
            except Exception as e:
                print("Failed to create %s: %s" % (os.path.dirname(DB_FILE), e))
                return

    # database init
    sql = SQLite.get(DB_FILE)
    sql.create('record_tokens', '''(
        token_id INTEGER PRIMARY KEY,
        record_id INT,
        on_server VARCHAR,
        cookie VARCHAR,
        token VARCHAR,
        secret VARCHAR,
        added TIMESTAMP,
        CONSTRAINT record_server UNIQUE (record_id, on_server) ON CONFLICT REPLACE
    )''')
    sql.execute("CREATE INDEX IF NOT EXISTS record_index ON record_tokens (record_id)")
    sql.execute("CREATE INDEX IF NOT EXISTS server_index ON record_tokens (on_server)")
    sql.execute("CREATE INDEX IF NOT EXISTS cookie_index ON record_tokens (cookie)")
    sql.execute("CREATE INDEX IF NOT EXISTS token_index ON record_tokens (token)")
def setup_tables(cls):
    """ Creates the SQLite tables we need, not the tables we deserve.
    Does nothing if the tables/indexes already exist """
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get(os.path.join('databases', 'snomed.db'))

    # descriptions
    cls.sqlite_handle.create('descriptions', '''(
        concept_id INTEGER PRIMARY KEY,
        lang TEXT,
        term TEXT,
        isa VARCHAR,
        active INT
    )''')
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

    # relationships
    cls.sqlite_handle.create('relationships', '''(
        relationship_id INTEGER PRIMARY KEY,
        source_id INT,
        destination_id INT,
        rel_type INT,
        rel_text VARCHAR,
        active INT
    )''')
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)")
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)")
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)")
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)")
def setup_tables(cls):
    """ Creates the SQLite tables we need, not the tables we deserve. """
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get("databases/snomed.db")

    # descriptions
    cls.sqlite_handle.create(
        "descriptions",
        """(
            concept_id INTEGER PRIMARY KEY,
            lang TEXT,
            term TEXT,
            isa VARCHAR,
            active INT
        )""",
    )
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

    # relationships
    cls.sqlite_handle.create(
        "relationships",
        """(
            relationship_id INTEGER PRIMARY KEY,
            source_id INT,
            destination_id INT,
            rel_type INT,
            rel_text VARCHAR,
            active INT
        )""",
    )
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)")
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)")
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)")
    cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)")
def setup_tables(cls):
    """ Creates the SQLite tables we need, not the tables we deserve.
    Does nothing if the tables/indexes already exist """
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get(cls.database_path())

    # descriptions
    cls.sqlite_handle.create(
        'descriptions', '''(
            concept_id INTEGER PRIMARY KEY,
            lang TEXT,
            term TEXT,
            isa VARCHAR,
            active INT
        )''')

    # relationships
    cls.sqlite_handle.create(
        'relationships', '''(
            relationship_id INTEGER PRIMARY KEY,
            source_id INT,
            destination_id INT,
            rel_type INT,
            rel_text VARCHAR,
            active INT
        )''')
def setup_tables(cls):
    """ Creates the SQLite tables we need, not the tables we deserve.
    Does nothing if the tables/indexes already exist """
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get(cls.database_path())

    # descriptions
    cls.sqlite_handle.create('descriptions', '''(
        concept_id INTEGER PRIMARY KEY,
        lang TEXT,
        term TEXT,
        isa VARCHAR,
        active INT
    )''')

    # relationships
    cls.sqlite_handle.create('relationships', '''(
        relationship_id INTEGER PRIMARY KEY,
        source_id INT,
        destination_id INT,
        rel_type INT,
        rel_text VARCHAR,
        active INT
    )''')
def setup_tables(cls):
    """ Creates the SQLite tables and imports SNOMED from flat files,
    if not already done """
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get('umls.db')
        cls.sqlite_handle.create('snomed', '''(
            concept_id INTEGER PRIMARY KEY,
            lang TEXT,
            term TEXT
        )''')
        cls.import_snomed_from_csv()
def __init__(self):
    super().__init__()
    from sqlite import SQLite

    absolute = os.path.dirname(os.path.realpath(__file__))
    db_file = os.environ.get("SQLITE_FILE")
    db_file = db_file if db_file else os.path.join(absolute, "databases/rxnorm.db")
    self.db_file = db_file
    self.handled = 0
    self.sqlite = SQLite.get(self.db_file)
    self.sqlite.execute("DROP TABLE IF EXISTS drug_cache")
    self.sqlite.execute(
        """CREATE TABLE drug_cache (rxcui varchar, property text, value text)"""
    )
    self.sqlite.execute("CREATE INDEX i_drug_cache ON drug_cache (rxcui, property)")
    self.sqlite.execute("DROP VIEW IF EXISTS drug_treatments_by_ndc")
    self.sqlite.execute(
        """CREATE VIEW drug_treatments_by_ndc as
            select a.value as ndc, b.value as treatment_intent
            from drug_cache a
            join drug_cache b on a.rxcui=b.rxcui
            where a.property='ndc' and b.property='treatment_intent'
        """
    )
    self.sqlite.execute("DROP VIEW IF EXISTS drug_classes_by_ndc")
    self.sqlite.execute(
        """CREATE VIEW drug_classes_by_ndc as
            select a.value as ndc, b.value as drug_class
            from drug_cache a
            join drug_cache b on a.rxcui=b.rxcui
            where a.property='ndc' and b.property='drug_class'
        """
    )
    self.sqlite.execute("DROP VIEW IF EXISTS drug_ingredients_by_ndc")
    self.sqlite.execute(
        """CREATE VIEW drug_ingredients_by_ndc as
            select a.value as ndc, b.value as drug_ingredient, c.str as ingredient_name
            from drug_cache a
            join drug_cache b on a.rxcui=b.rxcui
            join RXNCONSO c on c.rxcui=b.value
            where a.property='ndc' and b.property='ingredient'
              and c.sab='RXNORM' and c.tty='IN'
        """
    )
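# A hedged sketch of querying the views created above, using the standard
# library's sqlite3 module directly. The view and column names come from the
# CREATE VIEW statements; the database path and the row limit are assumptions,
# and the drug_cache table is assumed to have been populated beforehand.
import sqlite3

conn = sqlite3.connect('databases/rxnorm.db')
for ndc, rxcui, name in conn.execute(
        'SELECT ndc, drug_ingredient, ingredient_name '
        'FROM drug_ingredients_by_ndc LIMIT 5'):
    print(ndc, rxcui, name)
conn.close()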
def setup_tables(cls):
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get('storage.db')

    cls.sqlite_handle.create('studies', '''(
        nct UNIQUE,
        updated TIMESTAMP,
        elig_gender INTEGER,
        elig_min_age INTEGER,
        elig_max_age INTEGER,
        elig_population TEXT,
        elig_sampling TEXT,
        elig_accept_healthy INTEGER DEFAULT 0,
        elig_criteria TEXT
    )''')
    StudyEligibility.setup_tables()
def parse_all(fnames, reparse=False):
    """
    Format records as ts, tag, url, title, root_domain, domain, url_path.
    :param reparse: whether to re-parse everything from scratch
    :return:
    """
    sqldb = SQLite('data/secwiki.db')
    # Decide whether to re-parse everything
    if reparse:
        fnames = []
        gen_file = glob.iglob(r'data/html/secwiki_*.html')
        sql = 'delete from `secwiki`'
        for gfile in gen_file:
            fnames.append(gfile)
        sqldb.execute(sql)
    if fnames is None:
        print('No new secwiki')
        return
    sql = 'insert into `secwiki` (`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`url_path`) values(?,?,?,?,?,?,?);'
    for fname in fnames:
        # Check whether the target file already exists locally
        m = re.search(r'secwiki_(\d+)\.html', fname)
        rname = m.group(1)
        rname = path('data/txt', 'secwiki_' + rname + '.txt')
        if not os.path.exists(path("data/txt")):
            os.mkdir(path("data/txt"))
        if os.path.exists(rname) and os.path.getsize(rname) > 0:
            continue
        # Target file that collected results are written to
        rf = codecs.open(rname, mode='wb')
        # Read and parse the local source file
        with codecs.open(fname, 'rb') as f:
            all_content = {}
            for content in parse_single(f):
                if content:
                    # After parsing, write each record to the target file
                    k = content[0] + content[2]
                    all_content[k] = content
                    line = "\t".join(content)
                    rf.write(line.encode() + b'\r\n')
        # Bulk insert into sqlite3
        if all_content:
            sqldb.executemany(sql, all_content.values())
        rf.close()
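# parse_all() above calls a path() helper whose definition is not included in
# this excerpt. One plausible minimal implementation, assuming it simply joins
# path segments relative to the working directory:
import os

def path(*segments):
    return os.path.join(*segments)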
def load():
    """
    Load the raw exploit-labelled data from nvd.nist.
    """
    # CVE exploit negative samples (no "Exploit" reference tag)
    so = SQLite('data/nvd.db')
    sql = 'select CVE_Items_cve_CVE_data_meta_ID,CVE_Items_cve_description_description_data_value from nvd_cve where CVE_Items_cve_references_reference_data_tags not like "%Exploit%"'
    cve_0 = sql2cve(so, sql)
    cve_0['label'] = 0
    # CVE exploit positive samples (tagged with an "Exploit" reference)
    sql = 'select CVE_Items_cve_CVE_data_meta_ID,CVE_Items_cve_description_description_data_value from nvd_cve where CVE_Items_cve_references_reference_data_tags like "%Exploit%"'
    cve_1 = sql2cve(so, sql)
    cve_1['label'] = 1
    cve = pd.concat([cve_0, cve_1])
    print(cve.head())
    cve.to_csv('cve2.csv', index=False)
    return cve
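# load() depends on a sql2cve() helper that is not shown in this excerpt.
# A minimal sketch under the assumption that the wrapper's query() returns
# (id, description) row tuples, as it does in get_top() above; the column
# names below are illustrative.
import pandas as pd

def sql2cve(so, sql):
    rows = so.query(sql)
    return pd.DataFrame(rows, columns=['cve_id', 'description'])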
def __init__(self):
    super().__init__()
    from sqlite import SQLite

    absolute = os.path.dirname(os.path.realpath(__file__))
    db_file = os.environ.get('SQLITE_FILE')
    db_file = db_file if db_file else os.path.join(absolute, 'databases/rxnorm.db')
    self.db_file = db_file
    self.handled = 0
    self.sqlite = SQLite.get(self.db_file)
    self.sqlite.execute('DROP TABLE IF EXISTS drug_cache')
    self.sqlite.execute('''CREATE TABLE drug_cache (rxcui varchar, property text, value text)''')
    self.sqlite.execute(
        'CREATE INDEX i_drug_cache ON drug_cache (rxcui, property)')
    self.sqlite.execute('DROP VIEW IF EXISTS drug_treatments_by_ndc')
    self.sqlite.execute('''CREATE VIEW drug_treatments_by_ndc as
        select a.value as ndc, b.value as treatment_intent
        from drug_cache a
        join drug_cache b on a.rxcui=b.rxcui
        where a.property='ndc' and b.property='treatment_intent'
    ''')
    self.sqlite.execute('DROP VIEW IF EXISTS drug_classes_by_ndc')
    self.sqlite.execute('''CREATE VIEW drug_classes_by_ndc as
        select a.value as ndc, b.value as drug_class
        from drug_cache a
        join drug_cache b on a.rxcui=b.rxcui
        where a.property='ndc' and b.property='drug_class'
    ''')
    self.sqlite.execute('DROP VIEW IF EXISTS drug_ingredients_by_ndc')
    self.sqlite.execute('''CREATE VIEW drug_ingredients_by_ndc as
        select a.value as ndc, b.value as drug_ingredient, c.str as ingredient_name
        from drug_cache a
        join drug_cache b on a.rxcui=b.rxcui
        join RXNCONSO c on c.rxcui=b.value
        where a.property='ndc' and b.property='ingredient' and c.sab='RXNORM' and c.tty='IN'
    ''')
def setup_tables(cls):
    """ Creates the SQLite tables we need, not the tables we deserve. """
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get('databases/snomed.db')

    # descriptions
    cls.sqlite_handle.create(
        'descriptions', '''(
            concept_id INTEGER PRIMARY KEY,
            lang TEXT,
            term TEXT,
            isa VARCHAR,
            active INT
        )''')
    cls.sqlite_handle.execute(
        "CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

    # relationships
    cls.sqlite_handle.create(
        'relationships', '''(
            relationship_id INTEGER PRIMARY KEY,
            source_id INT,
            destination_id INT,
            rel_type INT,
            rel_text VARCHAR,
            active INT
        )''')
    cls.sqlite_handle.execute(
        "CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)"
    )
    cls.sqlite_handle.execute(
        "CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)"
    )
    cls.sqlite_handle.execute(
        "CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)"
    )
    cls.sqlite_handle.execute(
        "CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)"
    )
def __init__(self):
    # ==== Required vars ===== #
    self.stdin_path = '/dev/null'
    self.stdout_path = '/dev/null'
    self.stderr_path = '/dev/null'
    # self.pidfile_path = '/var/run/toidaemon/toidaemon.pid'
    self.pidfile_path = PID_FILE_PATH
    self.pidfile_timeout = 5
    # ========================= #
    self.db_name = DB_PATH
    self.db = SQLite(self.db_name)
    self.table = self.db.get(ToiScraper.TABLE_NAME)
    logger.info("Initializing...")
    if not self.table:
        logger.info("No table found with name {0}. Creating it.".format(ToiScraper.TABLE_NAME))
        self.table = self.db.create(ToiScraper.TABLE_NAME, ToiScraper.TABLE_SCHEMA)
    else:
        if not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
            error_str = "Table {0} exists but with incorrect schema".format(ToiScraper.TABLE_NAME)
            logger.error(error_str)
            raise Exception(error_str)
    self.iter_date = self._get_init_date_full()
class ToiScraper():
    TABLE_NAME = 'articles'
    TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'), (u'url', u'text')]
    # Manually observed minimum date on TOI
    INIT_DATE = (2020, 1, 1)
    MIN_ENTRIES = 600
    MAX_SLEEP = 3600

    def __init__(self):
        # ==== Required vars ===== #
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/null'
        self.stderr_path = '/dev/null'
        # self.pidfile_path = '/var/run/toidaemon/toidaemon.pid'
        self.pidfile_path = PID_FILE_PATH
        self.pidfile_timeout = 5
        # ========================= #
        self.db_name = DB_PATH
        self.db = SQLite(self.db_name)
        self.table = self.db.get(ToiScraper.TABLE_NAME)
        print("Initializing...")
        if not self.table:
            print("No table found with name {0}. Creating it.".format(
                ToiScraper.TABLE_NAME))
            self.table = self.db.create(ToiScraper.TABLE_NAME,
                                        ToiScraper.TABLE_SCHEMA)
        else:
            if not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
                error_str = "Table {0} exists but with incorrect schema".format(
                    ToiScraper.TABLE_NAME)
                print(error_str)
                raise Exception(error_str)
        self.iter_date = self._get_init_date_full()

    # Get the last date in the database with at least 600 entries in it
    # (enough to tell that it's full)
    def _get_init_date_full(self):
        print("Retrieving last retrieved date from database with at least {0} in it"
              .format(ToiScraper.MIN_ENTRIES))
        first_date = self.db.execute("""
            SELECT a.ds, a.count
            FROM (
                SELECT ds, count(1) AS count
                FROM {0}
                GROUP BY ds
                ORDER BY DATE(ds) DESC
            ) a
            WHERE a.count > {1}
            LIMIT 1;
        """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES), get=True)
        if len(first_date) == 0:
            print("No last date with given minimum entries found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        print("Last date with entries {0} found. {1} entries total.".format(
            first_date[0][0], first_date[0][1]))
        return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))

    # Get the last date in the database with entries in it
    def _get_init_date(self):
        print("Retrieving last retrieved date from database")
        first_date = self.db.execute(
            'SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(
                ToiScraper.TABLE_NAME), get=True)
        if len(first_date) == 0:
            print("No last date found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        print("Last date {0} found.".format(first_date[0]['ds']))
        return self.get_next_day(
            *tuple(map(int, first_date[0]['ds'].split('-'))))

    def get_last_valid_date(self):
        return datetime.utcnow() + timedelta(hours=5, minutes=30)

    # Check if the date is strictly before today in IST
    def is_valid_date(self, year, month, day):
        try:
            datetime(year, month, day)
        except ValueError:
            return False
        cur_time = datetime(year, month, day)
        india_time = self.get_last_valid_date()
        return cur_time + timedelta(days=1) < india_time \
            and cur_time >= datetime(*ToiScraper.INIT_DATE)

    def compute_url_for_day(self, year, month, day):
        if not self.is_valid_date(year, month, day):
            return None
        # Day count used in TOI URL (1st October, 2015 == 42278)
        day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
        return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
            year=year, month=month, day=day, daycount=day_count)

    def get_next_day(self, year, month, day):
        next_day = datetime(year, month, day) + timedelta(days=1)
        return (next_day.year, next_day.month, next_day.day)

    def _retrieve_url_contents(self, url, datetuple):
        print("Request sent to url {0}".format(url))
        req = requests.get(url)
        print("Response retrieved, parsing")
        soup = BeautifulSoup(req.text, 'lxml')
        # Signature of the element we're interested in. We rely on the TOI
        # webpage not to change
        divs = soup.find_all(
            'div',
            style='font-family:arial ;font-size:12;font-weight:bold; color: #006699')
        if not len(divs) == 1:
            error_str = "Found {0} divs matching signature. Aborting.".format(
                len(divs))
            self.error(error_str)
            raise Exception(error_str)
        articles = divs[0].find_all('a')
        print("Found {0} hyperlinks in the archive.".format(len(articles)))
        articles = [a for a in articles if len(a.text) > 0]
        res = []
        titles = set({})
        for art in articles:
            corr_url = self.validate_url(art['href'])
            if corr_url:
                if art.text in titles:
                    continue
                titles.add(art.text)
                res.append([
                    datetime(*datetuple).strftime('%Y-%m-%d'),
                    art.text,
                    corr_url,
                ])
        print("Finished parsing, {0} rows remain".format(len(res)))
        return res

    # TOI specific article URL validation and correction
    def validate_url(self, url):
        URL_CORRECT = 'http://timesofindia.indiatimes.com/'
        URL_STANDARD = 'http://'
        URL_INSIDE = '.indiatimes.com/'
        if not url.startswith(URL_STANDARD) or URL_INSIDE not in url:
            if not url.endswith('.cms') or 'http' in url or ' ' in url:
                return None
            else:
                return URL_CORRECT + url
        return url

    def dedup_insert(self, data, ds):
        date_str = '-'.join(map(str, ds))
        print("Asking to insert {0} articles in {1}".format(
            len(data), date_str))
        rows = self.table.where({'ds': date_str})
        print("Already {0} rows exist in {1}".format(len(rows), date_str))
        titles = set({})
        res = []
        for a in rows:
            if a['title'] not in titles:
                titles.add(a['title'])
                res.append((a['ds'], a['title'], a['url']))
        for r in data:
            if r[1] not in titles:
                titles.add(r[1])
                res.append(r)
        print("{0} rows left after deduplicating".format(len(res)))
        if len(rows) > 0:
            print("Deleting {0} rows from {1}".format(len(rows), date_str))
            self.table.del_where({'ds': date_str})
        if len(res) > 0:
            print("Inserting {0} rows from {1}".format(len(res), date_str))
            self.table.insert(res)

    def get_articles_for_day(self, year, month, day):
        print("Getting articles for the day")
        url = self.compute_url_for_day(year, month, day)
        if not url:
            return 0
        data = self._retrieve_url_contents(url, (year, month, day))
        self.dedup_insert(data, (year, month, day))
        return len(data)

    def run(self):
        while True:
            while not self.is_valid_date(*self.iter_date):
                next_date = datetime(*self.iter_date) + timedelta(days=1)
                sec_to_next_date = (next_date - self.get_last_valid_date()).seconds
                print("Reached the end, {0} seconds until {1}".format(
                    sec_to_next_date,
                    datetime(*self.iter_date).strftime('%Y-%m-%d')))
                if sec_to_next_date <= ToiScraper.MAX_SLEEP:
                    time.sleep(sec_to_next_date)
                else:
                    print('Seconds till next day {0} greater than {1}, so only sleeping for {1}'
                          .format(sec_to_next_date, ToiScraper.MAX_SLEEP))
                    time.sleep(ToiScraper.MAX_SLEEP)
                print('Woken up, getting init date again')
                self.iter_date = self._get_init_date_full()
                print('New date set to {0}'.format(self.iter_date))
            print("Retrieving articles for date {0}".format(self.iter_date))
            num_rows = self.get_articles_for_day(*self.iter_date)
            print("Retrieved {0} rows from TOI".format(num_rows))
            if num_rows == 0:
                print("Sleeping for 10 seconds, no rows retrieved")
                time.sleep(10)
            else:
                self.iter_date = self.get_next_day(*self.iter_date)
                print("Iterated to next day - {0}".format(
                    datetime(*self.iter_date)))
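# The day-count formula in compute_url_for_day() can be sanity-checked against
# the value noted in its own comment (1st October, 2015 == 42278):
from datetime import date

assert (date(2015, 10, 1) - date(1900, 1, 1)).days + 2 == 42278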
def __init__(self):
    absolute = os.path.dirname(os.path.realpath(__file__))
    self.sqlite = SQLite.get(os.path.join(absolute, 'databases/umls.db'))
class ToiScraper():
    TABLE_NAME = 'articles'
    TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'), (u'url', u'text')]
    # Manually observed minimum date on TOI
    INIT_DATE = (2000, 1, 18)
    MIN_ENTRIES = 600
    MAX_SLEEP = 3600

    def __init__(self):
        # ==== Required vars ===== #
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/null'
        self.stderr_path = '/dev/null'
        # self.pidfile_path = '/var/run/toidaemon/toidaemon.pid'
        self.pidfile_path = PID_FILE_PATH
        self.pidfile_timeout = 5
        # ========================= #
        self.db_name = DB_PATH
        self.db = SQLite(self.db_name)
        self.table = self.db.get(ToiScraper.TABLE_NAME)
        logger.info("Initializing...")
        if not self.table:
            logger.info("No table found with name {0}. Creating it.".format(ToiScraper.TABLE_NAME))
            self.table = self.db.create(ToiScraper.TABLE_NAME, ToiScraper.TABLE_SCHEMA)
        else:
            if not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
                error_str = "Table {0} exists but with incorrect schema".format(ToiScraper.TABLE_NAME)
                logger.error(error_str)
                raise Exception(error_str)
        self.iter_date = self._get_init_date_full()

    # Get the last date in the database with at least 600 entries in it
    # (enough to tell that it's full)
    def _get_init_date_full(self):
        logger.info("Retrieving last retrieved date from database with at least {0} in it".format(ToiScraper.MIN_ENTRIES))
        first_date = self.db.execute("""
            SELECT a.ds, a.count
            FROM (
                SELECT ds, count(1) AS count
                FROM {0}
                GROUP BY ds
                ORDER BY DATE(ds) DESC
            ) a
            WHERE a.count > {1}
            LIMIT 1;
        """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES), get=True)
        if len(first_date) == 0:
            logger.info("No last date with given minimum entries found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        logger.info("Last date with entries {0} found. {1} entries total.".format(first_date[0][0], first_date[0][1]))
        return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))

    # Get the last date in the database with entries in it
    def _get_init_date(self):
        logger.info("Retrieving last retrieved date from database")
        first_date = self.db.execute('SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(ToiScraper.TABLE_NAME), get=True)
        if len(first_date) == 0:
            logger.info("No last date found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        logger.info("Last date {0} found.".format(first_date[0]['ds']))
        return self.get_next_day(*tuple(map(int, first_date[0]['ds'].split('-'))))

    def get_last_valid_date(self):
        return datetime.utcnow() + timedelta(hours=5, minutes=30)

    # Check if the date is strictly before today in IST
    def is_valid_date(self, year, month, day):
        try:
            datetime(year, month, day)
        except ValueError:
            return False
        cur_time = datetime(year, month, day)
        india_time = self.get_last_valid_date()
        return cur_time + timedelta(days=1) < india_time and cur_time >= datetime(*ToiScraper.INIT_DATE)

    def compute_url_for_day(self, year, month, day):
        if not self.is_valid_date(year, month, day):
            return None
        # Day count used in TOI URL (1st October, 2015 == 42278)
        day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
        return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
            year=year, month=month, day=day, daycount=day_count)

    def get_next_day(self, year, month, day):
        next_day = datetime(year, month, day) + timedelta(days=1)
        return (next_day.year, next_day.month, next_day.day)

    def _retrieve_url_contents(self, url, datetuple):
        logger.debug("Request sent to url {0}".format(url))
        req = requests.get(url)
        logger.debug("Response retrieved, parsing")
        soup = BeautifulSoup(req.text, 'lxml')
        # Signature of the element we're interested in. We rely on the TOI
        # webpage not to change
        divs = soup.find_all('div', style='font-family:arial ;font-size:12;font-weight:bold; color: #006699')
        if not len(divs) == 1:
            error_str = "Found {0} divs matching signature. Aborting.".format(len(divs))
            self.error(error_str)
            raise Exception(error_str)
        articles = divs[0].find_all('a')
        logger.debug("Found {0} hyperlinks in the archive.".format(len(articles)))
        articles = [a for a in articles if len(a.text) > 0]
        res = []
        titles = set({})
        for art in articles:
            corr_url = self.validate_url(art['href'])
            if corr_url:
                if art.text in titles:
                    continue
                titles.add(art.text)
                res.append([
                    datetime(*datetuple).strftime('%Y-%m-%d'),
                    art.text,
                    corr_url,
                ])
        logger.debug("Finished parsing, {0} rows remain".format(len(res)))
        return res

    # TOI specific article URL validation and correction
    def validate_url(self, url):
        URL_CORRECT = 'http://timesofindia.indiatimes.com/'
        URL_STANDARD = 'http://'
        URL_INSIDE = '.indiatimes.com/'
        if not url.startswith(URL_STANDARD) or URL_INSIDE not in url:
            if not url.endswith('.cms') or 'http' in url or ' ' in url:
                return None
            else:
                return URL_CORRECT + url
        return url

    def dedup_insert(self, data, ds):
        date_str = '-'.join(map(str, ds))
        logger.debug("Asking to insert {0} articles in {1}".format(len(data), date_str))
        rows = self.table.where({'ds': date_str})
        logger.debug("Already {0} rows exist in {1}".format(len(rows), date_str))
        titles = set({})
        res = []
        for a in rows:
            if a['title'] not in titles:
                titles.add(a['title'])
                res.append((a['ds'], a['title'], a['url']))
        for r in data:
            if r[1] not in titles:
                titles.add(r[1])
                res.append(r)
        logger.debug("{0} rows left after deduplicating".format(len(res)))
        if len(rows) > 0:
            logger.info("Deleting {0} rows from {1}".format(len(rows), date_str))
            self.table.del_where({'ds': date_str})
        if len(res) > 0:
            logger.info("Inserting {0} rows from {1}".format(len(res), date_str))
            self.table.insert(res)

    def get_articles_for_day(self, year, month, day):
        logger.debug("Getting articles for the day")
        url = self.compute_url_for_day(year, month, day)
        if not url:
            return 0
        data = self._retrieve_url_contents(url, (year, month, day))
        self.dedup_insert(data, (year, month, day))
        return len(data)

    def run(self):
        while True:
            while not self.is_valid_date(*self.iter_date):
                next_date = datetime(*self.iter_date) + timedelta(days=1)
                sec_to_next_date = (next_date - self.get_last_valid_date()).seconds
                logger.info("Reached the end, {0} seconds until {1}".format(sec_to_next_date, datetime(*self.iter_date).strftime('%Y-%m-%d')))
                if sec_to_next_date <= ToiScraper.MAX_SLEEP:
                    time.sleep(sec_to_next_date)
                else:
                    logger.info('Seconds till next day {0} greater than {1}, so only sleeping for {1}'.format(sec_to_next_date, ToiScraper.MAX_SLEEP))
                    time.sleep(ToiScraper.MAX_SLEEP)
                logger.info('Woken up, getting init date again')
                self.iter_date = self._get_init_date_full()
                logger.info('New date set to {0}'.format(self.iter_date))
            logger.info("Retrieving articles for date {0}".format(self.iter_date))
            num_rows = self.get_articles_for_day(*self.iter_date)
            logger.info("Retrieved {0} rows from TOI".format(num_rows))
            if num_rows == 0:
                logger.debug("Sleeping for 10 seconds, no rows retrieved")
                time.sleep(10)
            else:
                self.iter_date = self.get_next_day(*self.iter_date)
                logger.debug("Iterated to next day - {0}".format(datetime(*self.iter_date)))
def __init__(self): self.sqlite = SQLite.get("databases/umls.db")
def __init__(self): self.sqlite = SQLite.get("databases/rxnorm.db")
def __init__(self):
    self.sqlite = SQLite.get('databases/snomed.db')
__license__ = 'MIT'
__copyright__ = 'Copyright (c) 2015 Muntashir Al-Islam'

"""
Sample Database Test
Tested on v0.2.0
Note: Error handling will not work on v0.1.0
Date: 22 Oct, 2015
"""

began = datetime.now()
print("Program starts at:", began)
print("Connecting to Database...", end=' ')
sqlite = SQLite(":memory:")
print("...")
if sqlite.connect_error:
    raise sqlite.connect_errno(sqlite.connect_error)
else:
    print("Connected to Database.")
print("Creating a table...", end=' ')
stmt = sqlite.prepare("""CREATE TABLE sample (
    ID integer PRIMARY KEY AUTOINCREMENT NOT NULL,
    Name text
)""")
stmt.execute()
print("...")
if stmt.error:
    raise stmt.errno(stmt.error)
else:
    print("Table created.")
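# Continuing in the same style, a sketch of inserting a row through the same
# prepare()/execute()/error API demonstrated above; the row data is illustrative.
print("Inserting a row...", end=' ')
stmt = sqlite.prepare("INSERT INTO sample (Name) VALUES ('Alice')")
stmt.execute()
print("...")
if stmt.error:
    raise stmt.errno(stmt.error)
else:
    print("Row inserted.")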
def ts_to_str(ts):  # function name assumed; the header is truncated in the original
    ts = float(ts)
    ts_str = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S.%f')
    return ts_str


def get_md5(path):
    return hashlib.md5(open(path, 'rb').read()).hexdigest()


def get_sha1(path):
    return hashlib.sha1(open(path, 'rb').read()).hexdigest()


if __name__ == '__main__':
    sqlite = SQLite('data.db')
    # sqlite.insert()
    while True:
        input_dir = input('Enter folder path: ')
        if isinstance(input_dir, str):
            work_dir = input_dir
            break
        else:
            pass
    # work_dir = 'D:\\共享区'
    for parent, dirnames, filenames in os.walk(work_dir, followlinks=True):
        for filename in filenames:
            file_path = os.path.join(parent, filename)
            file_attr = stat(file_path)
            attr_list = [
                file_attr.st_mode,
                file_attr.st_uid,
                file_attr.st_gid,
def __init__(self):
    self.sqlite = SQLite.get(SNOMED.database_path())
def __init__(self):
    self.sqlite = SQLite.get(DB_FILE)
def __init__(self):
    absolute = os.path.dirname(os.path.realpath(__file__))
    self.sqlite = SQLite.get("../../databases/umls.db")
def __init__(self):
    self.sqlite = SQLite.get('databases/rxnorm.db')
def __init__(self):
    absolute = os.path.dirname(os.path.realpath(__file__))
    self.sqlite = SQLite.get(os.path.join(absolute, "databases/rxnorm.db"))
def __init__(self):
    self.sqlite = SQLite.get('databases/umls.db')
def __init__(self): self.sqlite = SQLite.get("databases/snomed.db")
def sqlite_assure_handle(cls):
    if cls.sqlite_handle is None:
        cls.sqlite_handle = SQLite.get(cls.sqlite_default_db)
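# Illustrative wiring for the lazy class-level handle above. The class name,
# default path, and fetch method are assumptions; only sqlite_handle,
# sqlite_default_db, and SQLite.get() come from the snippet itself.
class Store(object):
    sqlite_handle = None
    sqlite_default_db = 'databases/store.db'  # hypothetical default

    @classmethod
    def sqlite_assure_handle(cls):
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get(cls.sqlite_default_db)

    @classmethod
    def fetch(cls, sql):
        cls.sqlite_assure_handle()  # open the shared handle on first use
        return cls.sqlite_handle.execute(sql)  # execute() as used in the snippets above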