def parse(fcode):
    url = "http://www.clcindex.com/category/%s/" % fcode
    ip = {"http": random_ip()}
    try:
        res = requests.get(url, proxies=ip, timeout=3)
        if res.status_code == 200:
            print(url)
            print(ip)
            html = Selector(res.text, 'html')
            clc_nos = html.xpath("//tr[@name='item-row']//td[2]/text()").extract()
            for i, clc_no in enumerate(clc_nos):
                clc_no = clc_no.replace("\t", "").replace("\n", "")
                clc_name = html.xpath("//tr[@name='item-row']//td[3]//text()"
                                      ).extract()[i].replace("\t", "").replace("\n", "")
                sql = "insert or replace into clc(fcode,info) values ('%s','%s')" % (
                    clc_no, clc_name)
                cursor = conn.cursor()
                cursor.execute(sql)
                conn.commit()
                utils.printf("%s inserted successfully" % clc_no)
            sql_up = "update clc set stat = 1 where fcode = '%s'" % fcode
            cursor = conn.cursor()
            cursor.execute(sql_up)
            conn.commit()
        else:
            print("ip err")
    except Exception as e:
        print(e)
def startdown_list(self, message):
    utils.printf('%s: start downloading list pages...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'apsjournal')
    cur = conn.cursor()
    current_year = time.strftime('%Y')
    cur.execute(
        "select url,stat from issue where stat=0 or year=%s or year=%s" %
        (current_year, int(current_year) - 1))
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s: no new issues, nothing to update' % self.provider)
        else:
            self.sendwork('parse_list')
    for url, _ in rows:
        fdir = self.list_path + '/' + url.split('/')[-4]
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        fname = fdir + '/' + url.split('/')[-2] + '_' + url.split('/')[-1] + '.html'
        self.sendwork('down_list', (url, fname))
def down_detail(self):
    utils.printf("Start downloading detail pages...")
    super().down_detail()
    conn = utils.init_db('mysql', 'cqjtu_kingbook')
    cur = conn.cursor()
    while True:
        cur.execute('select bookid,stat from book where stat=0 limit 10000')
        rows = cur.fetchall()
        conn.commit()
        if len(rows) == 0:
            break
        for bookid, _ in rows:
            print(bookid)
            url = 'http://123.56.143.23/kingbookwaiwen/book/info.aspx?id={}'.format(bookid)
            dirname = '%s/%s' % (self.detail_path, bookid[:3])
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            filename = '%s/%s.html' % (dirname, bookid)
            if os.path.exists(filename):
                sql = 'update book set stat=1 where bookid="{}"'.format(bookid)
                cur.execute(sql)
                conn.commit()
                continue
            resp = utils.get_html(url, proxies=self.proxy)
            if not resp:
                continue
            with open(filename, mode='w', encoding='utf8') as f:
                f.write(resp.content.decode())
            sql = 'update book set stat=1 where bookid="{}"'.format(bookid)
            cur.execute(sql)
            conn.commit()
            utils.printf("Downloaded", bookid, "successfully...")
def _summary_tvalues(self):
    '''
    Summary t-values
    :return:
    '''
    print('')
    utils.printf('=== Summary T-Values ===')
    cols = ['MIN', 'MEDIAN', 'MEAN', 'MAX', 'STD. DEV.', 'T-TEST', 'P(+)', 'P(-)']
    df = pd.DataFrame(index=self.tvalues.keys(), columns=cols)
    for k, v in self.tvalues.items():
        tstats = self.model.tvalues[k]
        df.loc[k, 'MIN'] = round(min(v), 4)
        df.loc[k, 'MEDIAN'] = round(np.median(v), 4)
        df.loc[k, 'MEAN'] = round(np.mean(v), 4)
        df.loc[k, 'MAX'] = round(max(v), 4)
        df.loc[k, 'STD. DEV.'] = round(np.std(v), 4)
        df.loc[k, 'T-TEST'] = round(tstats, 4)
        # empirical p-values: share of sampled t-values at least as extreme
        df.loc[k, 'P(+)'] = round(np.mean([int(c >= tstats) for c in v]), 4)
        df.loc[k, 'P(-)'] = round(np.mean([int(c <= tstats) for c in v]), 4)
    print(df)
def parse_index(self, message):
    try:
        utils.printf('%s: start parsing index pages...' % self.provider)
        conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
        self.sqlList.clear()
        cur = conn.cursor()
        for filename, fullname in utils.file_list(self.index_path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            dic = json.loads(text)  # the 'encoding' kwarg was removed in Python 3.9
            gch = filename.replace('.json', '')
            dicitem = dic['resultValue']
            issn = dicitem['issnNm']
            cnno = dicitem['cnNm']
            sql = 'update journal set issn="%s",cnno="%s" where journal_id="%s"' % (
                issn, cnno, gch)
            cur.execute(sql)
            conn.commit()
        cur.close()
        conn.close()
        utils.printf('%s: finished parsing index pages...' % self.provider)
        # self.sendwork('down_cover')
        self.senddistributefinish('get_issuelist')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def startdown_detail(self, message):
    if not self.detail_path:
        self.initpath()
    self.sqlList.clear()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    cur = conn.cursor()
    cur.execute('select article_id,journal_id from article where stat=0 and failcount<3')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        utils.printf('%s: finished downloading detail pages' % self.provider)
        # self.sendwork('parse_detail_meta')
        self.sendwork('parse_detail')
        # self.sendwork('down_cover')
        return
    messagelist = []
    for article_id, journal_id in rows:
        fdir = '%s/%s' % (self.detail_path, journal_id)
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        messagelist.append((article_id, journal_id))
        # dispatch work in batches of 30
        if len(messagelist) == 30:
            blist = messagelist.copy()
            self.sendwork('down_detail', blist)
            messagelist.clear()
    if len(messagelist) > 0:
        self.sendwork('down_detail', messagelist)
def run(self):
    conn = utils.init_db('mysql', 'aipjournal')
    cur = conn.cursor()
    sql = "select url,stat from issue where stat=0 limit 1000;"
    time_last = time.time()
    cnt = 0
    while True:
        if url_queue.empty():
            cur.execute(sql)
            rows = cur.fetchall()
            conn.commit()
            if rows:
                for row in rows:
                    url_queue.put(row)
            elif sql_queue.empty():
                break
        time_now = time.time()
        # flush pending status updates in batches, or at least once a minute
        if (sql_queue.qsize() > 100) or (time_now - time_last > 60):
            num = sql_queue.qsize()
            while num > 0:
                url, flag = sql_queue.get()
                cur.execute("update issue set stat={} where url='{}'".format(flag, url))
                cnt += 1
                num -= 1
            conn.commit()
            utils.printf('succeeded: %d' % cnt)
            time_last = time.time()
        time.sleep(1)
def down_cover():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    now_time = time.strftime('%Y%m%d')
    dirpath = cover_path + '/' + now_time
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    sql_up = "update video set cover_stat = 1 where rawid = %s"
    result = []
    while True:
        sql = "select rawid,cover_url from video where cover_stat=0 limit 1000"
        cur = conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        for rawid, cover_url in rows:
            path = dirpath + '/%s.jpg' % rawid
            res = utils.get_html(cover_url, proxies=proxy, timeout=50)
            if res:
                if os.path.exists(path):
                    result.append((rawid,))  # one-element tuple for the %s placeholder
                    utils.printf(rawid, "already exists...")
                else:
                    if utils.Img2Jpg(res.content, path):
                        result.append((rawid,))
                        utils.printf("Downloaded", rawid, "successfully...")
                    else:
                        print('%s -- down cover error' % rawid)
            if utils.parse_results_to_sql(conn, sql_up, result, 100):
                total = len(result)
                result.clear()
                print('Updated', total, 'results to the database')
    utils.parse_results_to_sql(conn, sql_up, result)
    print('Updated', len(result), 'results to the database')
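# The batch-flush helper used above comes from this project's own `utils`
# module and its implementation is not shown here. A minimal sketch of the
# contract implied by the call sites (flush via executemany once `num` rows
# have accumulated, or unconditionally when `num` is omitted; return True on
# flush, with the caller clearing the list) — the name and signature below
# are inferred, not authoritative:
def parse_results_to_sql_sketch(conn, sql, results, num=None):
    # Below the batch threshold: nothing to do yet.
    if num is not None and len(results) < num:
        return False
    if results:
        cur = conn.cursor()
        cur.executemany(sql, results)  # results: sequence of parameter tuples
        conn.commit()
    return True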
def down_index(self, message):
    try:
        journalname = message[0]
        year = message[1]
        fname = self.index_path + '/' + journalname + '_' + str(year) + '.html'
        utils.printf('Start downloading %s' % fname)
        if os.path.exists(fname):
            self.senddistributefinish('process_index')
            return
        feature = 'issue-month-detail'
        url = 'http://{}.sciencemag.org/content/by/year/{}'.format(journalname, str(year))
        resp = self.gethtml(url, feature)
        if not resp:
            # retry by re-queueing the same message
            self.sendwork('down_index', (journalname, year))
            return
        with open(fname, mode='w', encoding='utf8') as f:
            f.write(resp.content.decode('utf8'))
        utils.printf('Downloaded %s successfully' % fname)
        self.senddistributefinish('process_index')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def handleMenuItem_(self, ns_item):
    '''ObjC callback to handle and dispatch C{NSMenuItem} clicks and shortcuts.

    All clicks and shortcuts are dispatched to the I{action} method of this
    I{NSDelegate}'s L{App} instance. Unhandled clicks, shortcuts and dispatch
    errors are silently ignored, unless the L{App} C{raiser} keyword argument
    was C{True}.
    '''
    item = ns2Item(ns_item)
    act = item._action
    for t, i in ((self.app, item), (self, ns_item)):
        m = getattr(t, act, None)
        if m and callable(m):
            try:
                m(i)
                break
            except Exception:
                if _Globals.raiser:
                    printf('%s(%r): %r method %s ...', _handleMenuItem_name, i, t, act)
                    raise
    else:
        if _Globals.raiser:
            raise RuntimeError('%s(%r): %s' % ('unhandled', item, act))
def parse_detal():
    for file, fullpath in utils.file_list(detailpath):
        j_id = file.replace(".html", '')
        with open(fullpath, encoding='utf8') as f:
            text = f.read()
        html = Selector(text, 'html')
        title = html.xpath("//h3/text()").extract_first("")
        title_en = html.xpath("//h4/text()").extract_first("").replace("'", "''")
        div = html.xpath("//div[@class='perinfo']/text()").extract()
        zbdw = dq = issn = cn = shijian = ""
        for item in div:
            if item.startswith("主办单位:"):    # host organization
                zbdw = item.replace("主办单位:", "")
            if item.startswith("地区:"):        # region
                dq = item.replace("地区:", "")
            if item.startswith("国际刊号:"):    # ISSN
                issn = item.replace("国际刊号:", "")
            if item.startswith("国内刊号:"):    # CN number
                cn = item.replace('国内刊号:', '')
            if item.startswith("出版周期:"):    # publication frequency
                shijian = item.replace("出版周期:", "")
        # utils.printf(title, title_en, zbdw, dq, issn, cn, shijian)
        sql = ("update journal set 期刊名称_外文 = '%s' , 主办单位 = '%s' , 地区 = '%s' , "
               "国际刊号 = '%s' , 国内刊号 = '%s' , 出版周期 = '%s' where 期刊id = '%s'"
               % (title_en, zbdw, dq, issn, cn, shijian, j_id))
        cursor = db.cursor()
        cursor.execute(sql)
        db.commit()  # commit belongs to the connection, not the cursor
        utils.printf("Updated info for %s" % title)
def startdown_list(self, message):
    utils.printf('%s: start downloading list pages...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'science')
    cur = conn.cursor()
    cur.execute('select url,stat from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.index_path)) == 0:
            utils.logerror('%s: no new issues, nothing to update' % self.provider)
            utils.msg2weixin('%s: no new issues, nothing to update' % self.provider)
        else:
            self.sendwork('parse_list')
    for url, _ in rows:
        fdir = self.list_path + '/' + url.split('.')[0]
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        fname = fdir + '/' + url.split('/')[-2] + '_' + url.split('/')[-1] + '.html'
        url = 'http://' + url
        self.sendwork('down_list', (url, fname))
def get_year2que():
    for year in range(1949, 2021):  # 1949 through 2020
        base_url = dic_journal['Pharmacological Reviews']
        url = base_url + '/%s' % str(year)
        message = (str(year), url)
        year_que.put(message)
        utils.printf("URL for year %s added successfully" % year)
def parse_detail(self, message):
    conn = utils.init_db('mysql', 'hepjournal', 4)
    cur = conn.cursor()
    cur.execute('select journal_id,journal_name,issn,eissn,cnno from journal')
    rows = cur.fetchall()
    for journal_id, journal_name, issn, eissn, cnno in rows:
        self.dic[journal_id] = (journal_name, issn, eissn, cnno)
    cur.close()
    conn.close()
    self.predb3()
    self.sqlList.clear()
    stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title,
        volume, issue, page, beginpage, endpage, publisher, subject, date,
        creator_institution, date_created, source, identifier_pissn, identifier_eissn,
        identifier_cnno, description, identifier_doi, language, country, provider,
        provider_url, provider_id, type, medium, batch, gch)
        values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname)
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: inserted %d rows into db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: inserted %d rows into db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: parsing finished, output file is %s' % (self.provider, self.template_file))
def down_cover(self, message):
    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    bookid = message[0]
    cover_url = message[1]
    filename = self.cover_path + '/' + bookid + '.jpg'
    if os.path.exists(filename):
        self.senddistributefinish('process_cover', bookid)
        return
    try:
        proxy = self.getproxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(cover_url, headers=HEADER, timeout=20, proxies=proxies)
        # resp = requests.get(cover_url, headers=HEADER, timeout=20)
    except:
        self.sendwork('down_cover', message)
        return
    if utils.Img2Jpg(resp.content, filename):
        utils.printf('Downloaded image %s successfully' % filename)
        self.senddistributefinish('process_cover', bookid)
    else:
        self.sendwork('down_cover', message)
        return
def startdown_list(self, message):
    utils.printf('%s: start downloading list pages...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.sqlList.clear()
    self.count = 0
    conn = utils.init_db('mysql', 'hepjournal', 4)
    cur = conn.cursor()
    cur.execute('select url,journal_id from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s: no new issues, nothing to update' % self.provider)
        else:
            # self.sendwork('down_cover')
            self.sendwork('parse_list')
    for url, journal_id in rows:
        fdir = self.list_path + '/' + journal_id
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        fname = fdir + '/' + journal_id + '_' + url.split('/')[-2] + '_' + \
            url.split('/')[-1].replace('.shtml', '.html')
        self.sendwork('down_list', (url, fname))
def down_detail():
    utils.printf("Start downloading detail pages...")
    now_time = datetime.datetime.now().strftime("%Y%m%d")
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    cur = conn.cursor()
    while True:
        cur.execute('select bookid,stat from book where stat=0 limit 10000')
        rows = cur.fetchall()
        conn.commit()
        if len(rows) == 0:
            break
        for bookid, _ in rows:
            print(bookid)
            url = 'http://10.5.23.18:8079/book/bookinfo.aspx?id={}'.format(bookid)
            dir_path = detail_path + '/' + now_time
            dirname = '%s/%s' % (dir_path, bookid[:3])
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            filename = '%s/%s.html' % (dirname, bookid)
            if os.path.exists(filename):
                sql = 'update book set stat=1 where bookid="{}"'.format(bookid)
                cur.execute(sql)
                conn.commit()
                continue
            resp = utils.get_html(url, proxies=proxy)
            if not resp:
                continue
            with open(filename, mode='w', encoding='gb18030') as f:
                f.write(resp.content.decode())
            sql = 'update book set stat=1 where bookid="{}"'.format(bookid)
            cur.execute(sql)
            conn.commit()
            utils.printf("Downloaded", bookid, "successfully...")
def parse_detail_meta(self, message):
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    cur = conn.cursor()
    cur.execute('select gch,journal_name,journal_name_en,pissn,eissn from journal')
    rows = cur.fetchall()
    for gch, journal_name, journal_name_en, pissn, eissn in rows:
        self.dic[gch] = (journal_name, journal_name_en, pissn, eissn)
    cur.close()
    conn.close()
    self.predb3('base_obj_meta_a_template_qk.db3', 'base_obj_meta_a_qk.aiaajournal')
    self.sqlList.clear()
    stmt = """insert into base_obj_meta_a (author,author_1st,organ,organ_1st,title,
        title_alt,keyword,pub_year,pub_date,vol,num,journal_raw_id,journal_name,
        journal_name_alt,page_info,begin_page,end_page,subject,is_oa,down_cnt,lngid,
        rawid,product,sub_db,provider,sub_db_id,source_type,provider_url,country,
        language,batch,down_date,publisher,issn,eissn,abstract,abstract_alt,doi,fund,
        ref_cnt,fulltext_type) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,
        ?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname, 'meta')
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: inserted %d rows into db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: inserted %d rows into db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: parsing finished, output file is %s' % (self.provider, self.template_file))
def parse_detail(self, message):
    conn = utils.init_db('mysql', 'aiaabook', 2)
    cur = conn.cursor()
    cur.execute('select url,pub_year from book')
    rows = cur.fetchall()
    for url, pub_year in rows:
        doi = '10.2514/' + url.split('/')[-1]
        self.dic[doi] = (pub_year)
    cur.close()
    conn.close()
    self.predb3()
    self.sqlList.clear()
    stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title,
        identifier_pisbn, identifier_eisbn, description, publisher, cover, title_series,
        date, date_created, price, language, country, provider, provider_url,
        identifier_doi, provider_id, type, medium, batch)
        values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname, 'zt')
        # print(onemessage)
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: inserted %d rows into db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: inserted %d rows into db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: parsing finished, output file is %s' % (self.provider, self.template_file))
def down_cover(self, message):
    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    url = message[0]
    cover_url = message[1]
    filename = '%s/10.2514/%s.jpg' % (self.cover_path, url.split('/')[-1])
    if os.path.exists(filename):
        self.senddistributefinish('process_cover', url)
        return
    try:
        proxy = self.getproxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get('https://arc.aiaa.org' + cover_url,
                            headers=HEADER, timeout=20, proxies=proxies)
        # resp = requests.get(cover_url, headers=HEADER, timeout=20)
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        self.sendwork('down_cover', message)
        return
    if utils.Img2Jpg(resp.content, filename):
        utils.printf('Downloaded image %s successfully' % filename)
        self.senddistributefinish('process_cover', url)
    else:
        self.sendwork('down_cover', message)
        return
def downpage(url, proxy):
    urlsplist = url.split('/')
    index_dir = r'E:\lqx\AIP\issue' + '/' + urlsplist[-3]
    index_file = index_dir + '/' + urlsplist[-2] + '_' + urlsplist[-1] + '.html'
    if os.path.exists(index_file):
        return True
    url = url + '?size=all'
    proxies = {
        "http": "http://{}".format(proxy),
        "https": "https://{}".format(proxy)
    }
    feature = 'Table of Contents'
    try:
        resp = utils.get_html(url, feature=feature, proxies=proxies)
    except:
        return -1
    if not resp:
        return -1
    if resp.text.find(feature) < 0:
        return -1
    if resp.text.find('</html>') < 0:
        return -1
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    with open(index_file, mode='w', encoding='utf8') as f:
        f.write(resp.content.decode('utf8'))
    utils.printf('Downloaded', url, 'successfully...')
    return True
def startdown_html(self, message):
    infotype = message
    if not self.html_path:
        self.initpath()
    self.refreshproxypool()
    url = 'https://www.ydylcn.com/%s/index.shtml' % infotype
    feature = 'class="list-link-1"'
    fdir = '%s/%s' % (self.html_path, infotype)
    if not os.path.exists(fdir):
        os.makedirs(fdir)
    fname = '%s/1.html' % fdir
    utils.printf(fname)
    if not os.path.exists(fname):
        while True:
            resp = self.gethtml(url, feature)
            if resp:
                break
        text = resp.content.decode('utf8')
        with open(fname, mode='w', encoding='utf8') as f:
            f.write(text)
    else:
        with open(fname, encoding='utf8') as f:
            text = f.read()
    sel = Selector(text=text)
    # total page count, taken from the pager cell (e.g. "1/25页")
    pagetotalnum = sel.xpath('//table//tr/td/text()')[1].re(r'\s*/(.*)页')[0]
    self.count = 0
    self.totalcount = int(pagetotalnum) - 1
    for page in range(2, int(pagetotalnum) + 1):
        self.sendwork('down_html', (page, infotype))
def downcover():
    url = 'https://aip.scitation.org'
    cover_dir_fullpath = os.path.dirname(os.path.abspath(__file__)) + '/cover'
    if not os.path.exists(cover_dir_fullpath):
        os.makedirs(cover_dir_fullpath)
    try:
        resp = utils.get_html(url)
    except:
        return False
    if not resp:
        return False
    if resp.text.find('</html>') < 0:
        return False
    soup = BeautifulSoup(resp.content.decode('utf8'), 'lxml')
    divList = soup.select('div.publicationCoverImage')
    for divtag in divList:
        coverurl = url + divtag.a.img['src']
        covername = cover_dir_fullpath + '/' + divtag.a['href'].split('/')[-1].lower() + '.jpg'
        if os.path.exists(covername):
            continue
        resp = utils.get_html(coverurl)
        if utils.Img2Jpg(resp.content, covername):
            utils.printf('Downloaded', covername, 'successfully...')
        time.sleep(3)
    # apburl = 'https://aip.scitation.org/pb-assets/images/publications/apb/apl-bioeng-1483023557097.jpg'
    # apbname = cover_dir_fullpath + '/' + 'apb.jpg'
    # resp = utils.get_html(apburl)
    # if utils.Img2Jpg(resp.content, apbname):
    #     utils.printf('Downloaded', apbname, 'successfully...')
    return True
def gethtml(self, url, feature=None, coverflag=False):
    try:
        resp = self.session.get(url, headers=self.headers, timeout=20,
                                proxies=self.proxies)
        if not coverflag:
            if resp.content.decode('utf-8').find('Just a moment...') > 0:
                # Cloudflare interstitial: let one worker refresh the session,
                # others wait until it is done, then retry the request
                utils.printf('Just a moment...')
                if not self.refreshflag:
                    self.refreshflag = True
                    self.refreshsession()
                else:
                    while self.refreshflag:
                        time.sleep(1)
                resp = self.session.get(url, headers=self.headers, timeout=20,
                                        proxies=self.proxies)
            if resp.status_code != 200:
                print('code != 200')
                return False
            if resp.content.decode('utf-8').find('</html>') < 0:
                print('does not end with </html>')
                return False
            if feature:
                if resp.content.decode('utf-8').find(feature) < 0:
                    print('cannot find feature')
                    utils.logerror(url)
                    return False
    except:
        return False
    return resp
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    result = []
    sql_in = ("insert ignore into detail(provider_subject,title,url,add_time,look_time) "
              "values (%s,%s,%s,%s,%s)")
    for _, filedir in utils.file_list(list_path):
        # e.g. E:\work\美星外文\list\日文图书;随笔\2.html
        utils.printf(filedir)
        regex = r"E:\\work\\美星外文\\list\\(.*?)\\"
        provider_subject = re.findall(regex, filedir)[0]
        with open(filedir, mode='r', encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, 'html')
        list_urls = html.xpath("//tr[@class='tdbg_leftall']/td/strong/a/@href").extract()
        for i, item in enumerate(list_urls):
            title = html.xpath("//tr[@class='tdbg_leftall']/td/strong/a/text()"
                               ).extract()[i].split(" ")[0]
            url = "http://202.207.22.13:100/" + item
            add_time = html.xpath("//tr[@class='tdbg_leftall']/td[3]/text()").extract()[i]
            look_time = html.xpath("//tr[@class='tdbg_leftall']/td[4]/text()").extract()[i]
            result.append((provider_subject, title, url, add_time, look_time))
        utils.parse_results_to_sql(conn, sql_in, result)
        print('Inserted', len(result), 'results into the database')
        result.clear()
def startdown_list(self, message):
    utils.printf('%s: start downloading list pages...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.sqlList.clear()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'cambridgejournal')
    cur = conn.cursor()
    cur.execute('select url,stat from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s: no new issues, nothing to update' % self.provider)
        else:
            # self.sendwork('down_cover')
            self.sendwork('parse_list')
        return
    self.refreshsession()
    for url, _ in rows:
        fdir = self.list_path + '/' + url.split('/')[-3]
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        flast = url.split('/')[-1]
        if flast.find('?pageNum=') > 0:
            flast = flast.split('?')[0] + '_' + flast.split('=')[-1]
        fname = fdir + '/' + flast + '.html'
        self.sendwork('down_list', (url, fname))
def actTryTargetLogin(self, objBrowser, tryUsername, tryPassword, currentTry):
    try:
        # Fill login form fields
        objBrowser.select_form(nr=self.formLoginID)
        objBrowser.form[self.formUsernameField] = tryUsername
        objBrowser.form[self.formPasswordField] = tryPassword
        # Print progress bar
        utils.prints("%10s : %20s%12s%10s / %10s" % (tryUsername, tryPassword,
                     '=' * 6, currentTry, self.sizePasslist))
        # Send request
        objBrowser.submit()
        # Refresh page, useful for redirects after login
        objBrowser.reload()
        # If the result has no login form -> success **NEEDS IMPROVEMENT**
        # Record the credentials and return True
        if not actions.action_getFormInformation(objBrowser.forms()):
            utils.printf("Found: %s:%s" % (tryUsername, tryPassword), "good")
            self.credentials.append([tryUsername, tryPassword])
            return True
        return False
    except mechanize.HTTPError as error:
        utils.printf(error, "bad")
        sys.exit(1)
def startdown_list(self, message):
    utils.printf('%s: start downloading list pages...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.sqlList.clear()
    self.count = 0
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    cur = conn.cursor()
    cur.execute('select url,stat from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s: no new issues, nothing to update' % self.provider)
        else:
            self.sendwork('parse_list')
    for url, _ in rows:
        urlsp = url.split('/')
        base_name = '%s_%s.html' % (urlsp[-2], urlsp[-1])
        fdir = '%s/%s' % (self.list_path, urlsp[-3])
        fname = '%s/%s' % (fdir, base_name)
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        self.sendwork('down_list', (url, fname))
def process_list(self, message):
    self.count = self.count + 1
    sql = "update issue set stat=1 where url='{}'".format(message)
    self.sqlList.append(sql)
    if self.count % 40 == 1:
        utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
        conn = utils.init_db('mysql', 'aiaajournal', 2)
        cur = conn.cursor()
        for sql in self.sqlList:
            cur.execute(sql)
        conn.commit()
        conn.close()
        self.sqlList.clear()
    if self.count % 100 == 0:
        self.refreshproxypool()
    if self.count == self.totalcount:
        conn = utils.init_db('mysql', 'aiaajournal', 2)
        cur = conn.cursor()
        for sql in self.sqlList:
            cur.execute(sql)
        conn.commit()
        conn.close()
        self.sqlList.clear()
        utils.printf('downloadlist finish')
        self.sendwork('parse_list')
def _continue(self, smallest):
    c1 = self._too_many_intervals()
    c2 = self._more_merges(smallest)
    utils.printf('- Too many intervals?: {}'.format(c1))
    utils.printf('- Can we merge more?: {}'.format(c2))
    return c1 or c2
def chi2(self):
    if self.data is None:
        utils.printf('ERROR: Your data matrix should be loaded!')
        return
    # Phase 1: define sigLevel values for every numeric attribute and run
    # chimerge on every attribute column
    sigLevel0 = self._phase1()
    # Phase 2: merge attributes if needed (column-wise)
    self._phase2(sigLevel0)
def loadFrequencyMatrix(self, frequency_matrix, unique_attribute_values):
    '''
    :param frequency_matrix: numpy array
    :return: void
    '''
    # isinstance check: np.array is a factory function, not a type, so
    # comparing type() against it would never match
    if not isinstance(frequency_matrix, np.ndarray):
        utils.printf('ERROR: data must be a numpy.array')
        return
    self.frequency_matrix = frequency_matrix
    self.frequency_matrix_intervals = unique_attribute_values
    self.nclasses = self.frequency_matrix.shape[1]
    self.degrees_freedom = self.nclasses - 1
    self.printInitialSummary()
def joinscandirlbname(lbnamemap, scandir_list):
    out = {}
    for (date, aid, lid, file_path) in scandir_list:
        if lid not in lbnamemap:
            utils.printf("Warning lid[%i] was not found in lbmap\n", lid)
            continue
        name = lbnamemap[lid][2]
        if aid not in out:
            out[aid] = []
        dict_row = {"file_path": file_path, "aid": aid, "lid": lid,
                    "lname": name, "date": date}
        out[aid].append(dict_row)
    return out
def loadData(self, data, issorted=False):
    '''
    :param data: numpy matrix
    :param issorted: boolean; if data is already sorted (by attribute column),
                     there is no need to sort again
    :return:
    '''
    if not isinstance(data, (np.matrix, np.ndarray)):
        utils.printf('ERROR: data must be a numpy.matrix or numpy.array')
        return
    # numpy matrix (x,2): column 0 holds attribute values, column 1 class labels
    self.data = data
    if not issorted:
        # always sort by column 0 (the attribute column)
        self.sorted_data = np.array(
            np.sort(data.view('i8,i8'), order=['f0'], axis=0).view(np.float64))
    else:
        self.sorted_data = np.array(data)
    utils.printf('Sorted data: matrix {}x{}'.format(self.sorted_data.shape[0],
                                                    self.sorted_data.shape[1]))
def loadData(self, data):
    '''
    :param data: numpy matrix
    :return:
    '''
    if not isinstance(data, (np.matrix, np.ndarray)):
        utils.printf('ERROR: data must be a numpy.matrix or numpy.array')
        return
    self.data = np.array(data)  # no need to sort at this point
    self.nattributes = self.data.shape[1] - 1  # last column holds the class label
    self.nclasses = np.unique(self.data[:, self.nattributes]).shape[0]
    self.degrees_freedom = self.nclasses - 1
    self.chimerge_per_column = {colid: None for colid in range(self.nattributes)}
    self.alpha_per_column = {colid: None for colid in range(self.nattributes)}
    self.attribute_can_be_merged = {colid: True for colid in range(self.nattributes)}
    utils.printf('Data: matrix {}x{} ({} numeric attributes)'.format(
        self.data.shape[0], self.data.shape[1], self.nattributes))
    self._loadChiDistribution()
def generateFrequencyMatrix(self):
    if self.sorted_data is None:
        utils.printf('ERROR: Your (sorted) data should be loaded!')
        return
    if self.sorted_data.shape[1] != 2:
        utils.printf('ERROR: Your (sorted) matrix should have 2 columns only (attribute, class)')
        return
    # first intervals: unique attribute values
    unique_attribute_values, indices = np.unique(self.sorted_data[:, 0], return_inverse=True)
    unique_class_values = np.unique(self.sorted_data[:, 1])  # classes (column index 1)
    # init frequency matrix and intervals (unique attribute values)
    self.frequency_matrix = np.zeros((len(unique_attribute_values), len(unique_class_values)))
    self.frequency_matrix_intervals = unique_attribute_values
    self.nclasses = len(unique_class_values)
    self.degrees_freedom = self.nclasses - 1  # degrees of freedom (see chi-square table)
    # Generate the initial frequencies (contingency table): number of instances
    # found in the data for each attribute-class pair
    for row in np.unique(indices):
        for col, clase in enumerate(unique_class_values):
            self.frequency_matrix[row, col] += np.where(
                self.sorted_data[np.where(indices == row)][:, 1] == clase)[0].shape[0]
    self.printInitialSummary()
def _inConsistency(self):
    # 1. matrix with all attribute columns (except the class column)
    # 2. find duplicates (register indexes)
    # 3. for every duplicated instance:
    #    3.1. inconsistency_count = n - ck, where n is the number of times the
    #         instance is duplicated and ck the largest number of duplicates of
    #         that instance within any single class
    # 4. inconsistency rate: sum of all inconsistency_counts divided by the
    #    total number of instances
    #
    # NOTE: this should not run over the raw data but over the merged data.
    # To be fixed!
    if self.data is None:
        utils.printf('ERROR: Your data matrix should be loaded!')
        return
    # 1. matrix with only attribute values
    # 2. identify duplicates
    unique_values, unique_indexes = np.unique(
        self.data[:, :self.nattributes - 1], return_inverse=True)
    unique_counts = np.bincount(unique_indexes)
    matching_instances = unique_values[unique_counts > 1]
    sum_inconsistencies = 0
    total_instances = unique_indexes.shape[0]
    # 3. calculate inconsistency_count for every duplicated instance
    for matching_instance in matching_instances:
        c = {}
        for colid in range(self.nclasses):
            c[colid] = (self.data[self.data[:, self.nattributes] == colid]
                        == matching_instance).sum()
        n = sum(c.values())
        cmax = max(c.values())
        inconsistency_count = n - cmax
        sum_inconsistencies += inconsistency_count
    # 4. inconsistency rate
    inconsistency_rate = sum_inconsistencies / float(total_instances)
    return inconsistency_rate
def main(args):
    prog = args[0]
    if len(args) < 3:
        usage(prog)
        sys.exit()
    conf_file_a = args[1]
    conf_file_b = args[2]
    # The credential prompts were masked in the source; raw_input/getpass
    # are assumed reconstructions.
    user = raw_input("Enter username: ")
    passwd = getpass.getpass("Enter password: ")
    printf("Scanning files %s and %s\n", conf_file_a, conf_file_b)
    ldap_a = utils.LdapConnecter(user, passwd, conf_file=conf_file_a)
    ldap_b = utils.LdapConnecter(user, passwd, conf_file=conf_file_b)
    printf("binding to ldap from conf %s\n", conf_file_a)
    ldap_a.bind()
    printf("binding to ldap from conf %s\n", conf_file_b)
    ldap_b.bind()
    printf("\n")
    ldap_groups = [v for (k, v) in ldap_a.conf["roles"].items()]
    for ldap_group in ldap_groups:
        printf("ldap group %s\n", ldap_group)
        printf("----------------------------------------------\n")
        printf("\n")
        members_a = ldap_a.get_group_members_ssos(ldap_group)
        members_b = ldap_b.get_group_members_ssos(ldap_group)
        a_ssos = set(members_a.keys())
        b_ssos = set(members_b.keys())
        n_common = len(a_ssos & b_ssos)
        printf("common member count: %d\n", n_common)
        printf("  missing from %s:\n", conf_file_a)
        missing_from_a = sorted(list(b_ssos - a_ssos))
        if len(missing_from_a) > 0:
            max_col_len = max([len(sso) for sso in missing_from_a])
            for sso in missing_from_a:
                lsso = "%s" % ((sso + ":").ljust(max_col_len + 2),)
                printf("    %s%s\n", lsso, members_b[sso])
        else:
            printf("    None\n")
        printf("\n")
        printf("  missing from %s:\n", conf_file_b)
        missing_from_b = sorted(list(a_ssos - b_ssos))
        if len(missing_from_b) > 0:
            max_col_len = max([len(sso) for sso in missing_from_b])
            for sso in missing_from_b:
                lsso = "%s" % ((sso + ":").ljust(max_col_len + 2),)
                printf("    %s%s\n", lsso, members_a[sso])
        else:
            printf("    None\n")
        printf("\n")
    printf("\n")
    ldap_a.unbind()
    ldap_b.unbind()
def usage(prog):
    printf("usage is %s <conf_file1> <conf_file2>\n", prog)
    printf("\n")
    printf("list the members of the specified group name\n")
    printf("\n")
import sys
import os
import getpass
from utils import printf  # assumed import: printf is used bare below

def usage(prog):
    printf("usage is %s <conf_file>\n", prog)
    printf("get the user group and roles based on the conf file\n")
    printf("You will be prompted for your SSO user and passwd\n")
    printf("\n")

if __name__ == "__main__":
    prog = sys.argv[0]
    if len(sys.argv) < 2:
        usage(prog)
        sys.exit()
    conf_file = sys.argv[1]
    # The following block was masked in the source. The prompts are assumed
    # reconstructions; the code that binds to LDAP and populates `groups`
    # and `roles` is missing from this excerpt.
    user = raw_input("Enter username: ")
    passwd = getpass.getpass("Enter password: ")
    printf("User %s is a member of %d groups\n", user, len(groups))
    printf("--------------------------------------------------------\n")
    for (group_cn, group_dn) in groups:
        printf("%s\n", group_dn)
    printf("\n")
    printf("User %s has %d roles\n", user, len(roles))
    printf("--------------------------------------------------------\n")
    for role in roles:
        printf("%s\n", role)
def chimerge(self):
    if self.frequency_matrix is None:
        utils.printf('ERROR: Your frequency matrix should be loaded!')
        return
    chitest = {}
    counter = 0
    smallest = -1
    while self._too_many_intervals():
        ###
        # CHI2 TEST
        ###
        chitest = {}
        shape = self.frequency_matrix.shape
        for r in range(shape[0] - 1):
            interval = r, r + 1
            chi2 = self.chisqrtest(self.frequency_matrix[[interval], :][0])
            if chi2 not in chitest:
                chitest[chi2] = []
            chitest[chi2].append((interval))
        smallest = min(chitest.keys())
        biggest = max(chitest.keys())
        ###
        # SUMMARY
        ###
        counter += 1
        utils.printf('')
        utils.printf('ROUND {}: {} intervals. Chi min:{}, Chi max:{}'.format(
            counter, self.frequency_matrix.shape[0], smallest, biggest))
        utils.printf('CHI2 VALUES: {}'.format(chitest.keys()))
        ###
        # MERGE
        ###
        if self._more_merges(smallest):
            utils.printf('MERGING INTERVALS: chi {} -> {}'.format(smallest, chitest[smallest]))
            # reversed, to be able to remove rows on the fly
            for (lower, upper) in list(reversed(chitest[smallest])):
                # append frequencies from row `upper` (to be removed) to row `lower`
                for col in range(shape[1]):
                    self.frequency_matrix[lower, col] += self.frequency_matrix[upper, col]
                # remove the merged interval row and its boundary value
                self.frequency_matrix = np.delete(self.frequency_matrix, upper, 0)
                self.frequency_matrix_intervals = np.delete(self.frequency_matrix_intervals, upper, 0)
            utils.printf('NEW INTERVALS: ({}):{}'.format(
                len(self.frequency_matrix_intervals), self.frequency_matrix_intervals))
        else:
            break
    self.chitestvalues = chitest
    utils.printf('END (chi {} > {})\n'.format(smallest, self.threshold))
def printInitialSummary(self):
    utils.printf('')
    utils.printf('ROUND 0: Initial values:')
    utils.printf('- Number of classes: {}'.format(self.nclasses))
    utils.printf('- Degrees of Freedom: {} (deprecated)'.format(self.degrees_freedom))
    utils.printf('- Threshold: {}'.format(self.threshold))
    utils.printf('- Max number of intervals: {}'.format(self.max_number_intervals))
    utils.printf('- Number of (unique) intervals: {}'.format(len(self.frequency_matrix_intervals)))
    utils.printf('- Frequency matrix: {}x{} (sum {})'.format(
        self.frequency_matrix.shape[0], self.frequency_matrix.shape[1],
        self.frequency_matrix.sum()))
    utils.printf('- Intervals: {}'.format(self.frequency_matrix_intervals))
def printInitialSummary(self):
    utils.printf('')
    utils.printf('ROUND 0: Initial values:')
    utils.printf('- Number of attributes: {}'.format(self.nattributes))
    utils.printf('- Number of classes: {}'.format(self.nclasses))
    utils.printf('- Degrees of Freedom: {} (deprecated)'.format(self.degrees_freedom))
    utils.printf('- alpha (initial value of sigLevel): {}'.format(self.alpha))
    utils.printf('- delta (inConsistency level): {}'.format(self.delta))
def flush_row(self, row):
    printf("flush-row: %d", row)
    return
def put_shape(self, location, shape_vector):
    printf("put-shape: location: [row: %d, col: %d], shape: %s",
           location[0], location[1], shape_vector)
    return
def put_center_string(self, row, display_string):
    printf("put-center-string: [row: %d, display-string: %s]", row, display_string)
    return
def printFinalSummary(self):
    utils.printf('FINAL SUMMARY')
    utils.printf('{}{}'.format('Intervals: ', self.frequency_matrix_intervals))
    utils.printf('{}{}'.format('Chi2: ', ', '.join(
        ['[{}-{}):{:5.1f}'.format(v[0][0], v[0][1], k)
         for k, v in utils.sortDictByValue(self.chitestvalues, False)])))
    utils.printf('{} ({}x{})\n{}'.format(
        'Interval-Class Frequencies', self.frequency_matrix.shape[0],
        self.frequency_matrix.shape[1], self.frequency_matrix))
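# A minimal end-to-end sketch of how the ChiMerge methods above appear to fit
# together, assuming they live on a class (here called `ChiMerge`) whose
# constructor accepts the `threshold` and `max_number_intervals` attributes
# referenced in the summaries. The class name and constructor signature are
# assumptions; the method calls are the ones defined above:
import numpy as np

# toy dataset: column 0 = numeric attribute, column 1 = class label
data = np.matrix([[1, 0], [3, 0], [7, 1], [8, 1], [9, 0], [11, 1], [23, 1], [37, 0]])

cm = ChiMerge(threshold=3.84, max_number_intervals=4)  # assumed constructor
cm.loadData(data)             # sorts rows by the attribute column
cm.generateFrequencyMatrix()  # builds the interval-class contingency table
cm.chimerge()                 # repeatedly merges the adjacent pair with lowest chi2
cm.printFinalSummary()        # prints the resulting intervals and frequencies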
    # Tail of a function whose `def` line is missing from this excerpt;
    # `main` below calls it as getCfFiles(**kw).
    lbRows = getLidAidNameFromLbs(con, cur)
    lb2name = getLbid2name(lbRows)
    cur.close()
    con.close()
    scan_list = scandir_filter(scandir(cache_dir), **kw)
    cfFiles = joinscandirlbname(lb2name, scan_list)
    return (scan_list, cfFiles)

def main(args):
    kw = {}
    for arg in args[1:]:
        (k, v) = arg.split("=")
        try:
            kw[k.strip()] = int(v.strip())
        except ValueError:
            kw[k.strip()] = v.strip()  # if it's not an int, treat it like a string
    return getCfFiles(**kw)

if __name__ == "__main__":
    (scan_list, cfFiles) = main(sys.argv)
    for i in xrange(0, len(scan_list)):
        utils.printf("scan_list[%i]=%s\n", i, scan_list[i])
    for (aid, files) in cfFiles.items():
        for file_info in files:
            utils.printf("%s: %s\n", aid, file_info)

# example invocations
# ./cfupload.py aid=452605 date_gte=2013060600
# ./cfupload.py aid=452605
# ./cfupload.py aid=682644 date_gte=2013052500 date_lte=2013052523
import sys
import os
import getpass
from utils import printf  # assumed import: printf is used bare below

def usage(prog):
    printf("usage is %s <conf_file> <group_name>\n", prog)
    printf("\n")
    printf("list the members of the specified group name\n")
    printf("\n")

if __name__ == "__main__":
    prog = sys.argv[0]
    if len(sys.argv) < 3:
        usage(prog)
        sys.exit()
    conf_file = sys.argv[1]
    group_name = sys.argv[2]
    # The following block was masked in the source. The prompts are assumed
    # reconstructions; the code that binds to LDAP and populates `ssos` and
    # `n_groups` is missing from this excerpt.
    user = raw_input("Enter username: ")
    passwd = getpass.getpass("Enter password: ")
    printf("group %s has %d members\n", group_name, n_groups)
    printf("-----------------------------------------------------------\n")
    llen = max([len(sso) for sso in ssos])
    for sso in ssos:
        lsso = "%s" % ((sso + ":").ljust(llen + 2),)
def put_string(self, location, display_string):
    printf("put-string: location: [row: %d, col: %d], display-string: [fmt: %s, str: %s]",
           location[0], location[1], display_string[0], display_string[1])
    return
def execute(frame, bc):
    code = bc.code
    pc = 0
    while True:
        # required hint indicating this is the top of the opcode dispatch
        driver.jit_merge_point(pc=pc, code=code, bc=bc, frame=frame)
        if pc >= len(code):
            return W_Null()
        # each instruction is two bytes: opcode followed by its argument
        c = ord(code[pc])
        arg = ord(code[pc + 1])
        pc += 2
        if c == bytecode.LOAD_CONSTANT:
            w_constant = bc.constants[arg]
            frame.push(w_constant)
        elif c == bytecode.LOAD_VAR:
            frame.push(frame.vars[arg])
        elif c == bytecode.LOAD_NULL:
            frame.push(W_Null())
        elif c == bytecode.LOAD_BOOLEAN:
            frame.push(W_Boolean(bool(arg)))
        elif c == bytecode.LOAD_PARAM:
            frame.push_arg(frame.pop())  # push to the argument stack
        elif c == bytecode.DISCARD_TOP:
            frame.pop()
        elif c == bytecode.RETURN:
            if frame.valuestack_pos > 0:
                return frame.pop()
            else:
                return W_Null()
        elif c == bytecode.BINARY_ADD:
            right = frame.pop()
            left = frame.pop()
            w_res = left.add(right)
            frame.push(w_res)
        elif c == bytecode.BINARY_LT:
            right = frame.pop()
            left = frame.pop()
            frame.push(left.lt(right))
        elif c == bytecode.BINARY_GE:
            right = frame.pop()
            left = frame.pop()
            frame.push(left.ge(right))
        elif c == bytecode.BINARY_EQ:
            right = frame.pop()
            left = frame.pop()
            frame.push(left.eq(right))
        elif c == bytecode.BINARY_SUB:
            right = frame.pop()
            left = frame.pop()
            frame.push(left.sub(right))
        elif c == bytecode.BINARY_STRINGJOIN:
            right = frame.pop()
            left = frame.pop()
            frame.push(left.append(right))
        elif c == bytecode.JUMP_IF_FALSE:
            if not frame.pop().is_true():
                pc = arg
        elif c == bytecode.JUMP_BACKWARD:
            pc = arg
            # required hint indicating this is the end of a loop
            driver.can_enter_jit(pc=pc, code=code, bc=bc, frame=frame)
        elif c == bytecode.CALL:
            method = bc.functions[arg]
            method.body.globals = [None] * bc.numvars  # XXX
            new_bc = method.body
            new_frame = Frame(new_bc)
            # reverse args index to preserve order
            for i in range(len(method.params)):
                index = len(method.params) - 1 - i
                new_frame.vars[index] = frame.pop_arg()
            res = execute(new_frame, new_bc)
            frame.push(res)
        elif c == bytecode.PRINT:
            item = frame.pop()
            printf(item.str())
        elif c == bytecode.ASSIGN:
            frame.vars[arg] = frame.pop()
        else:
            raise Exception("Unknown operation %s" % bytecode.bytecodes[c])
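# For reference, a self-contained toy (not part of the interpreter above)
# illustrating the same two-byte instruction layout the loop decodes —
# opcode at code[pc], argument at code[pc + 1] — using a made-up opcode set:
PUSH, ADD, HALT = 0, 1, 2

def run_toy(code):
    stack, pc = [], 0
    while pc < len(code):
        op, arg = code[pc], code[pc + 1]  # opcode, then its one-byte argument
        pc += 2
        if op == PUSH:
            stack.append(arg)
        elif op == ADD:
            right, left = stack.pop(), stack.pop()
            stack.append(left + right)
        elif op == HALT:
            return stack.pop()

print(run_toy([PUSH, 2, PUSH, 3, ADD, 0, HALT, 0]))  # -> 5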
def _loadChiDistribution(self):
    with open('data/chisquare_distribution.data', 'r') as f:
        data = json.load(f)
    self.chidistribution = {float(k): v for k, v in data.items()}
    # list() so indexing the values view also works on Python 3
    utils.printf('ChiSquare distribution table loaded. {} sigLevel and {} degrees of freedom.'.format(
        len(self.chidistribution.keys()),
        len(list(self.chidistribution.values())[0]) - 1))