def mainFunction():
    """Extract topics from every file under ``files_path`` and store them.

    Each path returned by ``readFiles(files_path)`` is read with ``readTXT``
    and its id is obtained with ``getID``. When the id is positive,
    ``extractTopic`` is run; a non-empty result is handed to ``insertTopic``,
    an empty one sets ``status=2`` on the ``dlurl1`` row with that id. Files
    whose id is not positive are only reported on stdout.

    The module-level ``cur`` and ``conn`` are closed before returning, also
    when processing is aborted by an exception.
    """
    try:
        # Read the list of files to process.
        filePathList = readFiles(files_path)
        print('read is ready')
        for fileP in filePathList:
            html = readTXT(fileP)
            eid = getID(html)
            if not (eid > 0):
                print('eid error: ' + fileP)
                continue

            topicID = extractTopic(eid, html)
            if len(topicID) > 0:
                insertTopic(eid, topicID, fileP)
                continue

            # Built before the ``try`` so the error message below can always
            # refer to it.
            updateSQL = 'update dlurl1 set status=2 where id=' + str(eid)
            try:
                print('empty: ' + fileP)
                cur.execute(updateSQL)  # mark the row as extracted
                conn.commit()
            except Exception:
                # Best effort: report the failed statement and keep going.
                print('error: ' + updateSQL)
            # Add a ``break`` here to process only the first file.
    finally:
        cur.close()
        conn.close()
def mainFunction():
    """Load institution lists from csv files into ``experience1``.

    For every path returned by ``readFiles(files_path)`` the id is taken from
    the text after ``'inst_'`` in the path with ``'.csv'`` removed. The
    entries returned by ``readList`` are cleaned with ``cleanInstit`` and
    inserted one by one; reading of a file stops at the first entry whose
    length is below 4. After a file is handled, ``tem=5`` is set on the
    ``dlurl1`` row with that id.

    The module-level ``cur`` and ``conn`` are closed before returning, also
    when processing is aborted by an exception.
    """
    try:
        for fileP in readFiles(files_path):
            eid = fileP.split('inst_')[1].replace('.csv', '')
            for inst in readList(fileP):
                if len(inst) < 4:
                    # The original stops at the first short entry rather than
                    # skipping it; kept as is.
                    break
                inst = cleanInstit(inst)
                # NOTE(review): the statement is built by concatenation, so a
                # double quote left in ``inst`` breaks it (the insert is then
                # reported as an error). Consider the driver's parameter
                # binding once its paramstyle is confirmed.
                insertSQL = ('insert into experience1 (eid,institution) values('
                             + str(eid) + ',"' + inst + '")')
                try:
                    cur.execute(insertSQL)
                    conn.commit()
                except Exception:
                    # Best effort: report and continue with the next entry.
                    print('insert error' + str(eid))

            print('complete:' + str(eid))
            cur.execute('update dlurl1 set tem=5 where id=' + str(eid))
            conn.commit()
            # Add a ``break`` here to process only the first file.
    finally:
        cur.close()
        conn.close()
def mainFunction():
    """Extract institutions from every file under ``files_path`` and store them.

    Each path returned by ``readFiles(files_path)`` is read with ``readTXT``
    and its id is obtained with ``getID``. When the id is positive,
    ``extractInstitut`` is run; a non-empty result is handed to
    ``insertInstitution``, an empty one sets ``tem=5`` on the ``dlurl1`` row
    with that id. Files whose id is not positive are only reported on stdout.

    The module-level ``cur`` and ``conn`` are closed before returning, also
    when processing is aborted by an exception.
    """
    try:
        # Read the list of files to process.
        filePathList = readFiles(files_path)
        print('read is ready')
        for fileP in filePathList:
            html = readTXT(fileP)
            eid = getID(html)
            if not (eid > 0):
                print('eid error: ' + fileP)
                continue

            instit = extractInstitut(html)
            if len(instit) > 0:
                insertInstitution(eid, instit, fileP)
                continue

            # Built before the ``try`` so the error message below can always
            # refer to it.
            updateSQL = 'update dlurl1 set tem=5 where id=' + str(eid)
            try:
                print('empty: ' + fileP)
                cur.execute(updateSQL)  # mark the row as extracted
                conn.commit()
            except Exception:
                # Best effort: report the failed statement and keep going.
                print('error: ' + updateSQL)
            # Add a ``break`` here to process only the first file.
    finally:
        cur.close()
        conn.close()
def insertNull(maxPID=4263215,
               singleDir='E:/Code/Pickle/samesingle/single'):
    """Give every id in the 'single' lists a fresh, consecutive ``paperid``.

    Each file returned by ``readFiles(singleDir)`` is loaded with
    ``readSeriz``; for every id in it the counter is incremented and written
    to ``paper.paperid`` for the row with that id. Every update is committed
    individually through the module-level ``cur`` / ``conn``.

    Args:
        maxPID: Value the counter starts from; the first id assigned is
            ``maxPID + 1``. Presumably the highest paperid already in use --
            confirm against the table before re-running.
        singleDir: Directory holding the serialized lists of ids.

    Returns:
        The last paperid that was assigned (``maxPID`` unchanged when there
        was nothing to update).
    """
    # Disabled earlier pass (a string literal in the original): for each pair
    # in the 'same' lists it copies the paperid of the second id onto the
    # first.
    #
    # fileList = readFiles('E:/Code/Pickle/samesingle/same')
    # for fp in fileList:
    #     sameList = readSeriz(fp)
    #     for sl in sameList:
    #         selectResult = getResult('select paperid from paper where id='+str(sl[1]),cur)
    #         updateSQL = 'update paper set paperid='+str(selectResult[0]['paperid'])+' where id='+str(sl[0])
    #         print(updateSQL)
    #         cur.execute(updateSQL)
    #         conn.commit()
    #         print('completed: '+str(selectResult[0]['paperid'])+' '+str(sl[0]))
    #     print('update: '+fp)

    for fp in readFiles(singleDir):
        print('update: ' + fp)
        for s in readSeriz(fp):
            maxPID += 1
            updateSQL = ('update paper set paperid=' + str(maxPID)
                         + ' where id=' + str(s))
            cur.execute(updateSQL)
            conn.commit()
            # NOTE(review): the original indentation was lost; the progress
            # line is assumed to be printed once per updated row.
            print('now is ' + str(maxPID))
    return maxPID
def mainFunction():
    """Run ``insertPaperSQL`` on every file under ``files_path``.

    A connection/cursor pair is obtained from ``getCursor()`` and is closed
    before returning, also when listing or inserting raises.
    """
    conn, cur = getCursor()
    try:
        for fileP in readFiles(files_path):
            insertPaperSQL(fileP)
            # Add a ``break`` here to process only the first file.
    finally:
        cur.close()
        conn.close()
def compareNull():
    """Split the entries of each 'null' file into matched and unmatched ids.

    For every file returned by ``readFiles(nullDict)``:

    1. ``extractYearTitle`` yields two indices for the file; the first selects
       a year from ``yearListNull``, which is then looked up in ``yearList``.
       Files whose index is out of range or whose year is missing from
       ``yearList`` are reported and skipped.
    2. The reference file ``<yidx>_<nidx_null>.pickle`` under
       ``idyeartitle_path`` is loaded and compared with the null file on
       element ``[2]`` of each entry (presumably the title -- confirm against
       the code that writes these pickles).
    3. ``[null_id, reference_id]`` pairs for every match are written under
       ``sameList_pickle``; element ``[0]`` of each unmatched null entry is
       written under ``single_pickle``.
    """
    yearListNull = readSeriz(yearList_pickle_null)
    yearList = readSeriz(yearList_pickle)

    for fp in readFiles(nullDict):
        print('now begin: ' + str(fp))
        nullYearTitle = readSeriz(fp)
        yidx_null, nidx_null = extractYearTitle(fp)

        # ``>=`` rather than ``>``: an index equal to the list length is
        # already out of range and would raise IndexError below.
        if yidx_null >= len(yearListNull):
            print('error!!!!!!!!!!????')
            continue
        year = yearListNull[yidx_null]
        if year not in yearList:
            print('error!!!!!!!!!!')
            continue
        yidx = yearList.index(year)

        suffix = str(yidx) + '_' + str(nidx_null) + '.pickle'
        yearTitle = readSeriz(idyeartitle_path + suffix)

        sameList = []
        single = []
        # One pass covers the empty-reference case as well: with nothing to
        # match against, every null entry ends up in ``single`` exactly once.
        # (The original pre-filled ``single`` in that case and then appended
        # every id a second time.)
        for nullEntry in nullYearTitle:
            matched = False
            for refEntry in yearTitle:
                if nullEntry[2] == refEntry[2]:
                    # Every matching reference entry is recorded, not just
                    # the first one.
                    sameList.append([nullEntry[0], refEntry[0]])
                    matched = True
            if not matched:
                single.append(nullEntry[0])

        constructSeriz(sameList_pickle + suffix, sameList)
        constructSeriz(single_pickle + suffix, single)
def mainFunction():
    """Run ``insertPaperSQL`` for every file listed under ``dict_path``.

    The raw id is the text following ``'topicsupply_'`` in the path with
    ``'.csv'`` removed; it is passed through ``cleanID`` and handed to
    ``insertPaperSQL`` together with the path.
    """
    for csvPath in readFiles(dict_path):
        rawID = csvPath.split('topicsupply_')[1].replace('.csv', '')
        insertPaperSQL(cleanID(rawID), csvPath)