def downloadZIP(self):
    """Download every ZIP archive that is not already present on disk.

    Asks ``SourceParser`` for the download links of each format and stores
    each archive as ``<cwd>/ZIP_G/<format>/<basename of link>``.  A link
    whose target file already exists is skipped, so files that are already
    on disk are not fetched again.

    Compared with the previous version, the four copy-pasted loops are
    folded into one data-driven loop, and the target directory is created
    when it is missing (``urlretrieve`` fails if it does not exist).

    NOTE(review): a file left behind by an interrupted download also counts
    as "already exists" and will not be re-fetched -- confirm whether that
    is acceptable.
    """
    print('- Starting download ZIP files.')
    st = time.time()
    sp = SourceParser.SourceParser()
    sp.getALLFormats()

    # (sub-directory, links) pairs, in the original download order.  The
    # trailing numbers are the original author's notes (presumably link
    # counts at the time of writing).
    downloads = [
        ('XML4', sp.links_G_XML4),     # 396 412
        ('XML2', sp.links_G_XML2),     # 157
        ('XML24', sp.links_G_XML2_4),  # 52
        ('APS', sp.links_G_APS),       # 252 final 26
    ]

    for subdir, links in downloads:
        # Normalise to forward slashes so the path looks the same on Windows.
        target_dir = (os.getcwd() + '/ZIP_G/' + subdir + '/').replace('\\', '/')
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        for dLink in links:
            exist_path = (target_dir + os.path.basename(dLink)).replace('\\', '/')
            if not os.path.exists(exist_path):
                urllib.urlretrieve(dLink, exist_path)
                # Drop urllib's temporary-file cache after each download.
                urllib.urlcleanup()

    print('[Downloaded ZIP files. Cost: {0}]'.format(time.time() - st))
def newfuncXML2(filePath):
    """Parse one XML2 file, write its CSV output and log the outcome.

    The file content is obtained through
    ``SourceParser.getXML2Content_DPL``, handed to
    ``GrantsParser.extractXML2`` and written out with ``writeCSV``.  On
    success ``"<file name>, Processed"`` is written to the G log; if any
    step raises, the exception is printed and ``"<file name>, Failed"`` is
    logged instead (best effort: the exception is not re-raised).

    Args:
        filePath: Path of the XML2 file to process.
    """
    # Bound before the ``try`` so the failure handler can always use them.
    # Previously both were assigned inside the ``try``; an early failure left
    # them unbound and the handler itself raised, hiding the real error.
    log = LogProcessor.LogProcess()
    fileName = os.path.basename(filePath)
    try:
        sp = SourceParser.SourceParser()
        xmlStr = sp.getXML2Content_DPL(filePath)
        g = GrantsParser()
        g.extractXML2(xmlStr)
        g.writeCSV()
        log.write(log.logPath_G, fileName + ', Processed')
    except Exception as e:
        print(e)
        log.write(log.logPath_G, fileName + ', Failed')
def checkAll(self):
    """Rebuild the processed / unprocessed file lists from the log files.

    Reads the G and P logs (``self.logPath_G`` / ``self.logPath_P``).  Each
    log line is split on tabs: field 4 is the status and field 1 the file
    name.  File names whose status is accepted are appended, without
    duplicates, to ``self.sprocessedG`` (status ``Processed`` or
    ``Passed``) and ``self.sprocessedP`` (status ``Processed``).

    ``SourceParser`` is then queried to fill ``self.allG``, ``self.allP``,
    ``self.allG_XML4`` and ``self.allP_XML4``; ``self.unprocessedG`` and
    ``self.unprocessedP`` are the XML4 names that are not yet processed
    (set difference, so their order is unspecified).

    PAIR handling is disabled; only the PAIR log file is still opened.

    Compared with the previous version, the log files are now closed,
    each line is split once, lines with fewer than five tab-separated
    fields are skipped instead of raising ``IndexError``, and duplicate
    detection uses a set instead of scanning the list for every line.
    """
    # NOTE(review): unused here; kept because constructing it may have side
    # effects -- confirm and remove if it has none.
    log = LogProcessor.LogProcess()

    def collect(lines, accepted, known):
        # Append to ``known`` (in place) every not-yet-seen file name whose
        # status field is one of ``accepted``.
        seen = set(known)
        for line in lines:
            fields = line.split('\t')
            if len(fields) < 5:
                continue  # blank or malformed log line
            fileName = fields[1]
            if fields[4].strip() in accepted and fileName not in seen:
                seen.add(fileName)
                known.append(fileName)

    # All three logs are opened before anything is read, as before, so a
    # missing log file still fails before any attribute is modified.
    with open(self.logPath_G, 'rb') as f_log_g:
        with open(self.logPath_P, 'rb') as f_log_p:
            with open(self.logPath_PAIR, 'rb'):
                self.processedG = f_log_g.readlines()
                self.processedP = f_log_p.readlines()

    collect(self.processedG, ('Processed', 'Passed'), self.sprocessedG)
    collect(self.processedP, ('Processed',), self.sprocessedP)

    sp = SourceParser.SourceParser()
    self.allG = sp.getdLinksPG()
    self.allP = sp.getdLinksPP()
    self.allG_XML4 = sp.getFileNamesPG_XML4()
    self.allP_XML4 = sp.getFileNamesPP_XML4()
    self.unprocessedG = list(set(self.allG_XML4) - set(self.sprocessedG))
    self.unprocessedP = list(set(self.allP_XML4) - set(self.sprocessedP))
bList[9 * m / n:10 * m / n] ] return sList def multiProcess(allLinkList, numStep): for i in range(0, len(allLinkList), numStep): st = time.time() linkList = allLinkList[i:i + numStep] linkListProcesses = partTen(linkList) processes = [] for linkList in linkListProcesses: processes.append( multiprocessing.Process(target=mainProcess, args=(linkList, ))) for ps in processes: ps.start() for ps in processes: ps.join() print '[multiProcess finished! Time:{time}]'.format(time=time.time() - st) if __name__ == "__main__": st = time.time() sp = SourceParser.SourceParser() allLinkList = sp.getdLinksPAIR() multiProcess(allLinkList, 1000) print 'All PAIR data have been POPULATED successfully! Cost time:{0}'.format( time.time() - st)