def readTSV(filename, name):
    """Load a tab-separated article dump into the articles_<name> table.

    Each line is expected to contain: id <TAB> url <TAB> title <TAB> content.

    Args:
        filename: path of the UTF-8 encoded TSV file.
        name: table-name suffix; rows go to articles_<name>.
    """
    conn = get_conn()
    cur = conn.cursor()
    # `with` guarantees the file handle is released even if an insert fails
    # (the original opened the file and never closed it). Iterating the file
    # object streams line by line instead of loading everything via readlines().
    with open(filename, "r", encoding="utf8") as tsv_file:
        for line in tsv_file:
            fields = line.split("\t")
            if len(fields) < 4:
                # Skip malformed/short rows instead of raising IndexError.
                continue
            # NOTE: renamed from `dict`/`list` — never shadow builtins.
            article = {
                "id": fields[0],
                "url": fields[1],
                "title": fields[2],
                "content": fields[3],
            }
            insertArticleInDB(article, name, conn, cur)
    cur.close()
    conn.close()
def combineMeniton(tablename):
    """Pair up mention1/mention2 rows that share a document and sentence,
    inserting every pair whose mentions lie close enough as a candidate.

    Args:
        tablename: suffix of the mention1_/mention2_ tables to combine.
    """
    conn = get_conn()
    cur = conn.cursor()
    sql = ("select * from mention1_" + tablename + " as A, mention2_" +
           tablename + " as B "
           "WHERE A.doc_id=B.doc_id and A.sentence_index=B.sentence_index")
    cur.execute(sql)
    rows = cur.fetchall()
    total = len(rows)
    label = "%s:N=%d|" % ("candidate", total)
    progress = pb.ProgressBar(
        widgets=[label, pb.Percentage(), pb.Bar(), pb.ETA()], maxval=total)
    progress.start()
    finished = 0
    for row in rows:
        m1_id, m1_text, _doc, _sen, m1_begin, m1_end = row[:6]
        m2_id, m2_text, _doc2, _sen2, m2_begin, m2_end = row[6:]
        # Skip pairs whose mentions are more than 25 tokens apart.
        if m2_begin - m1_end > 25 or m2_end - m1_begin > 25:
            continue
        insertCandidationDB(m1_id, m1_text, m2_id, m2_text,
                            tablename, conn, cur)
        finished += 1
        progress.update(finished)
    progress.finish()
    cur.close()
    conn.close()
def selectMentionDB(filePath, index, tablename, mode="dict"):
    """Extract mentions from every article not yet present in mention<index>_<tablename>.

    For each unprocessed article, builds the per-document key list from the
    NER file, then scans each of its sentences for mentions.

    Args:
        filePath: path of the NER dictionary/model file passed to loadNERFile.
        index: "1" or "2" — which mention table to fill.
        tablename: table-name suffix shared by articles_/sentences_/mention tables.
        mode: NER lookup mode, "dict" by default.
    """
    conn = get_conn()
    cur = conn.cursor()
    sql = ("select id,title,content from articles_" + tablename +
           " where id not in (select distinct doc_id from mention" + index +
           "_" + tablename + ")")
    cur.execute(sql)
    datarows = cur.fetchall()
    taskNum = len(datarows)
    string = "%s:N=%d|" % ("findNer", taskNum)
    pbar = pb.ProgressBar(widgets=[string, pb.Percentage(), pb.Bar(), pb.ETA()],
                          maxval=taskNum)
    pbar.start()
    num_completed = 0
    # BUGFIX: guard the division — the original raised ZeroDivisionError
    # whenever every article was already processed (taskNum == 0).
    percent = (num_completed / taskNum) * 100 if taskNum else 100.0
    data.append(string + "|" + "已完成" + str(percent) + "%")
    nerData = loadNERFile(filePath, mode)
    for docId, title, content in datarows:
        keyList = getKeyListPerDoc(title, content, nerData, mode)
        # Fetch all sentences of this article at once. BUGFIX: docId is now
        # passed as a query parameter instead of %-interpolated into the SQL,
        # which broke on ids containing quotes (and was injectable).
        cur.execute("select sentence_index,sentence_text,tokens from sentences_" +
                    tablename + " where doc_id = %s", (docId,))
        senrows = cur.fetchall()
        for sentence_index, sentence_text, tokens in senrows:
            senLength = len(sentence_text)
            # Only sentences of reasonable length (10..100 chars) are scanned.
            if senLength > 100 or senLength < 10:
                continue
            getMentionFromSen(docId, sentence_index, sentence_text, tokens,
                              keyList, index, tablename, conn, cur)
        num_completed += 1
        pbar.update(num_completed)
    pbar.finish()
    cur.close()
    conn.close()
def selectMentionDB2(filePath, index, tablename, mode="dict"):
    """Scan every sentence of sentences_<tablename> and extract mentions
    into mention<index>_<tablename> using the NER data at *filePath*.

    Args:
        filePath: path of the NER dictionary/model file for loadNERFile.
        index: "1" or "2" — which mention table to fill.
        tablename: table-name suffix.
        mode: NER lookup mode; only "dict" is currently handled.
    """
    conn = get_conn()
    cur = conn.cursor()
    query = ("select doc_id,sentence_index,sentence_text,tokens from sentences_"
             + tablename)
    cur.execute(query)
    data.append(query)
    rows = cur.fetchall()
    total = len(rows)
    label = "%s:N=%d|" % ("findNer", total)
    progress = pb.ProgressBar(
        widgets=[label, pb.Percentage(), pb.Bar(), pb.ETA()], maxval=total)
    progress.start()
    finished = 0
    ner_data = loadNERFile(filePath, mode)
    for doc_id, sen_index, sen_text, tokens in rows:
        # Keep only sentences between 10 and 100 characters long.
        if not 10 <= len(sen_text) <= 100:
            continue
        if mode == "dict":
            mentionDict(doc_id, sen_index, sen_text, tokens, ner_data,
                        index, tablename, conn, cur)
        finished += 1
        progress.update(finished)
    progress.finish()
    cur.close()
    conn.close()
def selectfeatureMentionDB(tablename):
    """Compute feature rows for every candidate pair that has none yet.

    Joins each candidate in candidate_<tablename> with its two mentions and
    the sentence annotations they share, then delegates to featureMention.

    Args:
        tablename: table-name suffix shared by the candidate/mention/
            sentences/feature tables.
    """
    conn = get_conn()
    cur = conn.cursor()
    # The NOT IN clause skips candidates whose features were already built.
    sql = ("select A.id,A.p1_id,A.p2_id,B.begin_index,B.end_index,"
           "C.begin_index, C.end_index,D.tokens,D.lemmas,D.pos_tags,"
           "D.ner_tags,D.dep_types,D.dep_tokens "
           "from candidate_" + tablename + " as A, mention1_" + tablename +
           " as B,mention2_" + tablename + " as C,sentences_" + tablename +
           " as D where A.p1_id=B.mention_id and A.p2_id=C.mention_id "
           "and B.doc_id=D.doc_id and B.sentence_index=D.sentence_index "
           "and A.id not in (select distinct cid from feature_" + tablename + ")")
    cur.execute(sql)
    data.append("实体关系选取中......")
    data.append(sql)
    rows = cur.fetchall()
    total = len(rows)
    label = "%s:N=%d|" % ("featureGet", total)
    progress = pb.ProgressBar(
        widgets=[label, pb.Percentage(), pb.Bar(), pb.ETA()], maxval=total)
    progress.start()
    finished = 0
    for row in rows:
        featureMention(row, tablename, conn, cur)
        finished += 1
        progress.update(finished)
    progress.finish()
    cur.close()
    conn.close()
def insertCandidationAll(tablename):
    """Rebuild candidate_<tablename> in one bulk SQL statement.

    Truncates the table, resets its SERIAL id to 1, then inserts every
    mention1/mention2 pair that shares a document and sentence and whose
    mentions lie within 25 tokens of each other — the same filter
    combineMeniton applies row by row in Python.

    Args:
        tablename: table-name suffix of the candidate/mention tables.
    """
    conn = get_conn()
    cur = conn.cursor()
    cur.execute("TRUNCATE TABLE candidate_" + tablename)  # drop old candidates
    # Reset the auto-increment id back to 1 (false => next nextval returns 1).
    cur.execute("select setval( 'candidate_" + tablename + "_id_seq',1,false);")
    sql = ("insert into candidate_" + tablename +
           " (p1_id, p1_name, p2_id, p2_name) " +
           "select p1_id, p1_name, p2_id, p2_name from (" +
           "select A.mention_id as p1_id, A.mention_text as p1_name, A.doc_id, A.sentence_index, A.begin_index,A.end_index," +
           "B.mention_id as p2_id, B.mention_text as p2_name, B.doc_id, B.sentence_index, B.begin_index,B.end_index " +
           "from mention1_" + tablename + " as A, mention2_" + tablename + " as B " +
           "WHERE A.doc_id=B.doc_id and A.sentence_index=B.sentence_index " +
           # BUGFIX: was `or`, which kept nearly every pair (one of the two
           # differences is normally negative, so one side was almost always
           # <= 25). `and` is the De Morgan complement of combineMeniton's
           # skip condition (> 25 or > 25), making the two paths agree.
           "and (B.begin_index - A.end_index <= 25 and B.end_index - A.begin_index <= 25)" +
           ") as C")
    data.append(sql)
    cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()
def getSensNLP(tablename, nlp):
    """Run the NLP parser over every distinct unprocessed candidate sentence.

    Selects each (doc_id, sentence_index) still flagged unprocessed
    (flag = 0) among the candidate sentences, parses it with *nlp*, and
    stores the result via buildSentenceDB. Failures on individual
    sentences are logged and skipped (best effort).

    Args:
        tablename: table-name suffix.
        nlp: parser handle passed through to parsertext.
    """
    conn = get_conn()
    cur = conn.cursor()
    # GROUP BY collapses duplicate candidate rows to one row per sentence.
    sql = ("select doc_id, sen_id,max(txt),max(tokens) from ( "
           "select D.doc_id as doc_id, D.sentence_index as sen_id, "
           "D.sentence_text as txt, D.tokens as tokens "
           "from candidate_" + tablename + " as A, mention1_" + tablename +
           " as B,mention2_" + tablename + " as C,sentences_" + tablename +
           " as D  where A.p1_id=B.mention_id and A.p2_id=C.mention_id "
           "and B.doc_id=D.doc_id and B.sentence_index=D.sentence_index "
           "and D.flag = 0 ) as E group by doc_id,sen_id")
    data.append("实体关系选取中......")
    data.append(sql)
    cur.execute(sql)
    rows = cur.fetchall()
    total = len(rows)
    label = "%s:N=%d|" % ("senNLP", total)
    progress = pb.ProgressBar(
        widgets=[label, pb.Percentage(), pb.Bar(), pb.ETA()], maxval=total)
    progress.start()
    finished = 0
    for doc_id, sen_id, sen_txt, tokens in rows:
        try:
            print("processing doc id:" + doc_id)
            parsed = parsertext(sen_txt, tokens, doc_id, sen_id, nlp)
            buildSentenceDB(parsed, tablename, conn, cur)
        except Exception as err:
            # Deliberate best-effort: log the failure and move on.
            print("1", err)
            continue
        finished += 1
        progress.update(finished)
    progress.finish()
    cur.close()
    conn.close()
def _createTableAndLog(cur, conn, sql, fullTableName):
    """Execute one CREATE TABLE statement, commit it, and append the same
    progress messages to the global `data` log that the UI already expects."""
    cur.execute(sql)
    conn.commit()
    data.append(sql)
    data.append("表" + fullTableName + "创建完成")
    data.append("=========================================")


def createAllDB(name):
    """Create (if missing) the six working tables for corpus *name*:
    articles_, sentences_, mention1_, mention2_, candidate_ and feature_.

    Args:
        name: table-name suffix appended to every table.
    """
    conn = get_conn()
    cur = conn.cursor()
    # articles: id unique article id, url source link, title, content.
    sql = """CREATE TABLE if not exists articles_""" + name + """ (
        id text COLLATE "default",
        url text COLLATE "default",
        title text COLLATE "default",
        content text COLLATE "default"
    ) WITH (OIDS=FALSE);
    ALTER TABLE articles_""" + name + """ OWNER TO postgres"""
    _createTableAndLog(cur, conn, sql, "articles_" + name)
    # sentences: per-sentence NLP annotations — tokens/lemmas/POS/NER arrays,
    # doc_offsets word positions, dependency types and heads, processed flag.
    sql = """CREATE TABLE if not exists sentences_""" + name + """ (
        id SERIAL primary key,
        doc_id text COLLATE "default",
        sentence_index int4,
        sentence_text text COLLATE "default",
        tokens text[] COLLATE "default",
        lemmas text[] COLLATE "default",
        pos_tags text[] COLLATE "default",
        ner_tags text[] COLLATE "default",
        doc_offsets int4[],
        dep_types text[] COLLATE "default",
        dep_tokens int4[],
        flag int4
    ) WITH (OIDS=FALSE);
    ALTER TABLE sentences_""" + name + """ OWNER TO postgres"""
    _createTableAndLog(cur, conn, sql, "sentences_" + name)
    # mention1 and mention2 share one schema: mention id/text plus the
    # doc/sentence position and the begin/end token indexes in the sentence.
    for prefix in ("mention1_", "mention2_"):
        sql = """CREATE TABLE if not exists """ + prefix + name + """ (
        mention_id text COLLATE "default",
        mention_text text COLLATE "default",
        doc_id text COLLATE "default",
        sentence_index int4,
        begin_index int4,
        end_index int4
    ) WITH (OIDS=FALSE);
    ALTER TABLE """ + prefix + name + """ OWNER TO postgres"""
        _createTableAndLog(cur, conn, sql, prefix + name)
    # candidate: relation-pair candidates (ids and surface texts of both mentions).
    sql = """CREATE TABLE if not exists candidate_""" + name + """ (
        id SERIAL primary key,
        p1_id text COLLATE "default",
        p1_name text COLLATE "default",
        p2_id text COLLATE "default",
        p2_name text COLLATE "default"
    ) WITH (OIDS=FALSE);
    ALTER TABLE candidate_""" + name + """ OWNER TO postgres"""
    _createTableAndLog(cur, conn, sql, "candidate_" + name)
    # feature: cid refers to candidate.id, feature is the extracted feature text.
    sql = """CREATE TABLE if not exists feature_""" + name + """ (
        cid int NOT NULL,
        feature text COLLATE "default"
    ) WITH (OIDS=FALSE);
    ALTER TABLE feature_""" + name + """ OWNER TO postgres"""
    _createTableAndLog(cur, conn, sql, "feature_" + name)
    # BUGFIX: the original closed only the connection and leaked the cursor.
    cur.close()
    conn.close()