def processInNewDriver(href): ''' 处理文章链接 ''' print "处理链接", href driverContent = webdriver.Chrome(executable_path=executable_path) driverContent.get(href) retry(lambda: driver.find_element_by_class_name("rprt_all")) contentEle = driverContent.find_element_by_class_name("rprt_all") content = contentEle.text for key in keys: if content.find(key) != -1: print "find" reusltData.append([href, key]) write(reusltData, FILENAME_RESULT) break recoreDate.append([href]) driverContent.quit()
#!/usr/bin/python
# -*-coding:UTF-8-*-
# encoding=utf8
from com.office.util.excelUtil import write
from com.office.util.wordUtil import readDocx

# Read the Word document, cut its text at every CJK full stop and store each
# piece as a one-element list before handing the list of lists to write().
content = readDocx("/Users/yangjie/Downloads/qq/(最完整版)胡希恕讲伤寒论.docx")
lines = content.split(u"。")
resultData = [[line] for line in lines]
write(resultData, "/Users/yangjie/Downloads/qq/(最完整版)胡希恕讲伤寒论.xls")
#!/usr/bin/python # -*-coding:UTF-8-*- # encoding=utf8 import matplotlib.pyplot as plt import numpy as np from com.office.util.excelUtil import write plt.figure(figsize=(9, 6), dpi=100) n = 100 #rand 均匀分布和 randn高斯分布 r = [] for i in range(6): x = np.random.randn(1, n) r.append(x[0]) write(r, "scatterplot.xls") print x y = np.random.randn(1, n) print y T = np.arctan2(x, y) print T plt.scatter(x, y, c=T, s=25, alpha=0.4, marker='o') #T:散点的颜色 #s:散点的大小 #alpha:是透明程度 plt.show()
# NOTE(review): this chunk starts in the middle of a statement; the opening of
# the enclosing routine and the exact nesting of the try/except/else branches
# are not visible here, so only the visible statements are described.
# Visible behaviour: on success [key, ID2, Molecular2] is appended to `result`,
# [key] is appended to pubChem.recorddata, `drivercontent` is quit and the
# routine returns; the except branch appends [key, "NA", "NA"] instead and
# also records the key.  An else branch reloads the pccompound page in `driver`.
# The trailing loop reverses `mylists`, skips entries already present in
# pubChem.recorddata, and for the others calls getchemID(key[0]) and then
# writes `result` and pubChem.recorddata to their configured paths.
# Exceptions in that loop are printed with a traceback and not re-raised.
) Molecular2 = Moleculars1[1].text result.append([key, ID2, Molecular2]) pubChem.recorddata.append([key]) drivercontent.quit() return except Exception, e: result.append([key, "NA", "NA"]) pubChem.recorddata.append([key]) break else: driver.get("http://www.ncbi.nlm.nih.gov/pccompound") mylists.reverse() for key in mylists: if key in pubChem.recorddata: print "chuliguo" continue else: try: key = key[0] print "处理", key getchemID(key) write(result, pubChem.resultPathname) write(pubChem.recorddata, pubChem.recordPathname) except Exception, e: import traceback print traceback.format_exc() print "3", e
# NOTE(review): this chunk starts inside a loop whose header is not visible;
# the nesting of the first statements is therefore not certain.
# Visible behaviour, part 1: for a row whose column 2 is non-empty, a
# `cacheline` is started with that value; a non-empty column 3 updates
# `lastSrc`, which is appended to the cacheline before it is stored in `cache`.
# Part 2: every row of `srcDate` yields a new line starting with column 0.
# If column 1 is non-empty it is appended followed by "TCMID"; otherwise the
# first remaining `cache` entry (if any) is removed from the cache and its
# values are appended.  Cache entries still left afterwards are emitted as
# lines whose first cell is "".  The result is printed and passed to write().
if rowDate[2] != "": cacheline = [] cacheline.append(rowDate[2]) if rowDate[3] != "": lastSrc = rowDate[3] cacheline.append(lastSrc) cache.append(cacheline) resultDate = [] for rowDate in srcDate: newLine = [] newLine.append(rowDate[0]) if rowDate[1] != "": newLine.append(rowDate[1]) newLine.append("TCMID") else: if len(cache) != 0: takeCache = cache[0] cache.remove(takeCache) newLine.extend(takeCache) resultDate.append(newLine) for cacheData in cache: newLine = [] newLine.append("") newLine.extend(cacheData) resultDate.append(newLine) print resultDate write(resultDate, "六神曲_res.xls")
#!/usr/bin/python #-*-coding:UTF-8-*- # encoding=utf8 from com.office.util.excelUtil import write resultDate = [] for i in range(3): resultDate.append(range(3)) print resultDate write(resultDate, "num.xls")
# NOTE(review): the first two statements (recoreDate.append / driverContent.quit)
# appear to be the tail of processInNewDriver, whose header is not part of
# this chunk; the paging loop follows them.
# Paging loop, as visible: wait (via retry) for the result-title links, collect
# the href of every "//p[@class='title']/a" element, and skip those already in
# `matchRecordData`.  Remaining links are handed to
# doInThread(processInNewDriver, hrefs, poolNum=4), and `recoreDate` is written
# to FILENAME_RECORD.  The loop then sends ENTER to the pager link with
# sid='3' to move to the next page and sleeps one second; any exception there
# is treated as "last page reached" and ends the loop.
# NOTE(review): processed links are appended to `recoreDate` as [href] lists
# but looked up in `matchRecordData` as plain strings -- confirm that
# `matchRecordData` is refreshed from the record file elsewhere.
recoreDate.append([href]) driverContent.quit() while True: retry(lambda: driver.find_elements_by_xpath("//p[@class='title']/a")) aEles = driver.find_elements_by_xpath("//p[@class='title']/a") hrefs = [] for aEle in aEles: href = aEle.get_attribute("href") if href in matchRecordData: print "processd ", href else: hrefs.append(href) if 0 == len(hrefs): print "no new href" else: doInThread(processInNewDriver, hrefs, poolNum=4) write(recoreDate, FILENAME_RECORD) try: driver.find_element_by_xpath( "//a[@id='EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page' and @sid='3']" ).send_keys(Keys.ENTER) print "下一页" import time time.sleep(1) except Exception, e: print "已到尾页" break