def fill_new_table(sql, name, ins1, ins2):
    """Copy rows whose author list contains *name* into that author's own table.

    Parameters
    ----------
    sql : str
        SELECT statement, executed via ``getdata``, returning candidate rows
        shaped ``(?, title, authors, mname, index_terms, keywords, abstract,
        pdate)`` (indices 1-7 are read).
    name : str
        The ambiguous author name being disambiguated.
    ins1 : str
        INSERT template used when the matched author carries institution
        info (an extra ``^c``-separated field).
    ins2 : str
        INSERT template used when no institution info is present.

    NOTE(review): the INSERT statements are built with ``str.format`` from raw
    row data — vulnerable to SQL injection / quoting breakage; switch to
    parameterized queries if the DB driver supports them.
    """
    data = getdata(sql)
    row_id = 0  # renamed from ``id`` to avoid shadowing the builtin
    for row in data:
        # str.strip() already removes '\n' and '\r'; one call suffices.
        title = row[1].strip()
        authors = row[2]
        mname = row[3]
        index_terms = row[4]
        keywords = row[5].strip() if row[5] is not None else None
        abstract = row[6].strip()
        pdate = row[7]
        try:
            # Find the author entries whose name equals *name*.
            for author in authors.split(";"):
                parts = re.split(r'\^c', author.strip())
                if parts[0] != name:
                    continue
                row_id += 1
                if len(parts) > 1:
                    # Author already annotated with institution info.
                    insert = ins1.format(row_id, title, parts[1], authors,
                                         mname, index_terms, keywords,
                                         abstract, pdate, name)
                else:
                    insert = ins2.format(row_id, title, authors, mname,
                                         index_terms, keywords, abstract,
                                         pdate, name)
                handledata(insert)
        except Exception as e:
            # Best-effort: report the failed row and continue with the next.
            print(e)
    print("填充新表" + "\"" + name + "\"" + "完成!")
def update_departs(sql, lis, name):
    """Reset ``author_unique`` for rows whose department is not whitelisted.

    ``sql`` is executed via ``getdata`` and must yield rows whose first two
    columns are ``(id, department)``.  Every row whose department is absent
    from *lis* gets its ``author_unique`` column set to NULL in table *name*.
    """
    for row in getdata(sql):
        if row[1] not in lis:
            handledata(
                "update %s set author_unique=NULL where id =%d" % (name, row[0]))
def insert_into_mysql(access_source, insert):
    """Insert cleaned article rows into MySQL and count papers per author.

    Parameters
    ----------
    access_source : iterable
        Rows shaped ``(title, authors, mname, index_terms, keywords,
        abstract, pdate)`` pulled from the Access source database.
    insert : str
        INSERT-statement template filled via ``str.format``.

    Returns
    -------
    dict
        Mapping ``author -> number of papers`` seen in the source data.

    NOTE(review): statements are built with ``str.format`` from raw field
    values — prefer parameterized queries to avoid quoting/injection issues.
    """
    author_paper_num = {}  # per-author paper counts over the raw data

    def _count(author_key):
        # Tally one occurrence of an author (manual Counter).
        author_paper_num[author_key] = author_paper_num.get(author_key, 0) + 1

    for row in access_source:
        title = row[0].strip()
        mname = row[2]
        index_terms = row[3]
        keywords = row[4]
        abstract = row[5]
        pdate = row[6]
        if abstract is None or abstract.strip() == "":
            continue  # skip articles without an abstract
        try:
            authors = row[1]
            # BUG FIX: str.replace returns a new string; the original code
            # discarded the result, so these markers were never stripped.
            # "0800^a" precedes a collective-contributor name and "0070^a"
            # precedes a partial author name — neither acts as a separator.
            authors = authors.replace("0800^a", "").replace("0070^a", "")
            # "2300^a" precedes an author name (mostly in co-authored papers
            # with foreign authors) and does act as a separator.
            authorlist = re.split(r';|2300\^a', authors.strip())
            if len(authorlist) < 2:
                # Single-author article.
                single = authorlist[0].strip()
                handledata(insert.format(title, single, mname, index_terms,
                                         keywords, abstract, pdate))
                _count(single)
            else:
                # Multi-author article: keep only name + institution info.
                new_authors = []
                for author in authorlist:
                    parts = re.split(r'\^c|\^d|\^e|\^f', author.strip())
                    if len(parts) > 2:
                        temp = parts[0].strip() + "^c" + parts[1].strip()
                    else:
                        temp = parts[0].strip()
                    new_authors.append(temp)
                    _count(temp)
                handledata(insert.format(title, ";".join(new_authors), mname,
                                         index_terms, keywords, abstract,
                                         pdate))
        except Exception as e:
            # Best-effort: report the failed row and move on.
            print(e)
    return author_paper_num
# NOTE(review): this is a verbatim duplicate definition of
# insert_into_mysql() — an identical copy appears earlier in this file and
# this one silently shadows it.  Keep only one copy.
def insert_into_mysql(access_source, insert):
    """Insert cleaned article rows into MySQL and count papers per author.

    Parameters
    ----------
    access_source : iterable
        Rows shaped ``(title, authors, mname, index_terms, keywords,
        abstract, pdate)`` pulled from the Access source database.
    insert : str
        INSERT-statement template filled via ``str.format``.

    Returns
    -------
    dict
        Mapping ``author -> number of papers`` seen in the source data.

    NOTE(review): statements are built with ``str.format`` from raw field
    values — prefer parameterized queries to avoid quoting/injection issues.
    """
    author_paper_num = {}  # per-author paper counts over the raw data

    def _count(author_key):
        # Tally one occurrence of an author (manual Counter).
        author_paper_num[author_key] = author_paper_num.get(author_key, 0) + 1

    for row in access_source:
        title = row[0].strip()
        mname = row[2]
        index_terms = row[3]
        keywords = row[4]
        abstract = row[5]
        pdate = row[6]
        if abstract is None or abstract.strip() == "":
            continue  # skip articles without an abstract
        try:
            authors = row[1]
            # BUG FIX: str.replace returns a new string; the original code
            # discarded the result, so these markers were never stripped.
            # "0800^a" precedes a collective-contributor name and "0070^a"
            # precedes a partial author name — neither acts as a separator.
            authors = authors.replace("0800^a", "").replace("0070^a", "")
            # "2300^a" precedes an author name (mostly in co-authored papers
            # with foreign authors) and does act as a separator.
            authorlist = re.split(r';|2300\^a', authors.strip())
            if len(authorlist) < 2:
                # Single-author article.
                single = authorlist[0].strip()
                handledata(insert.format(title, single, mname, index_terms,
                                         keywords, abstract, pdate))
                _count(single)
            else:
                # Multi-author article: keep only name + institution info.
                new_authors = []
                for author in authorlist:
                    parts = re.split(r'\^c|\^d|\^e|\^f', author.strip())
                    if len(parts) > 2:
                        temp = parts[0].strip() + "^c" + parts[1].strip()
                    else:
                        temp = parts[0].strip()
                    new_authors.append(temp)
                    _count(temp)
                handledata(insert.format(title, ";".join(new_authors), mname,
                                         index_terms, keywords, abstract,
                                         pdate))
        except Exception as e:
            # Best-effort: report the failed row and move on.
            print(e)
    return author_paper_num
def create_name_table(name):
    """Create the per-author paper table *name* (InnoDB, utf8) and report it.

    NOTE(review): the ``AUTO_INCREMENT=372`` clause looks copied from a
    ``SHOW CREATE TABLE`` dump — confirm a fresh table should start there.
    """
    ddl = '''CREATE TABLE `{0}` ( `id` int(11) NOT NULL AUTO_INCREMENT, `title` tinytext, `author_unique` varchar(200) DEFAULT NULL, `authors` mediumtext, `mname` tinytext, `index_terms` tinytext, `keywords` tinytext, `abstract` mediumtext, `pdate` varchar(45) DEFAULT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=372 DEFAULT CHARSET=utf8;'''.format(name)
    handledata(ddl)
    print("生成新表:" + name)
def coauthor_dict_match(sql, dictionary, name):
    """Guess the institution of institution-less papers via their co-authors.

    Parameters
    ----------
    sql : str
        SELECT, executed via ``getdata``, returning ``(authors, paper_id)``
        rows for papers of the ambiguous author *name* lacking institution
        info.
    dictionary : dict
        Mapping ``institution -> collection of co-author names``.
    name : str
        The ambiguous author; their own entry in the author list is skipped.

    When a paper's co-authors match exactly one institution, supported by at
    least two co-author hits, the match is printed, appended to
    ``./data/depart_coauthor_match.txt`` and written back with an UPDATE on
    table *name*.

    Cleanup vs. original: removed a commented-out debugging block and the
    unused ``paper_get_type`` / ``get_more_type`` counters.
    """
    data = getdata(sql)  # papers without institution info
    only_one_type = 0  # papers resolved to exactly one institution
    for line in data:
        departs = {}  # institution -> number of matching co-authors
        paper_id = line[1]
        for coauthor in line[0].split(";"):  # the paper's author list
            author = coauthor.split("^c")
            if author[0] == name:
                continue  # skip the ambiguous author themself
            for depart, members in dictionary.items():
                if author[0] in members:  # co-author belongs to this institution
                    departs[depart] = departs.get(depart, 0) + 1
        if len(departs) == 1:
            for key in departs:
                if departs[key] > 1:  # require at least two co-author hits
                    print("%d:%s\n" % (paper_id, key))
                    with open('./data/depart_coauthor_match.txt', 'a', encoding='utf-8') as f:
                        f.write("%d:%s\n" % (paper_id, key))
                    only_one_type += 1
                    handledata("update {0} set author_unique = '{1}' where id ={2};".format(name, key, int(paper_id)))
    print("匹配到一个机构信息且至少两个合著者的文章个数:" + str(only_one_type))
# NOTE(review): this region originally held two overlapping, syntactically
# broken duplicate pastes of the tail of insert_into_mysql() plus two copies
# of the __main__ guard (bad paste/merge); consolidated into one valid
# entry point with unchanged statements.
if __name__ == '__main__':
    # Pull article records that carry an abstract from the Access digest
    # database; insert_into_mysql filters/normalizes the author field.
    sql1 = 'select title,author,mname,keyword,NN_AKEYWORD_S,abstract,pdate from 查询;'
    sources = link_access(sql1)
    insert1 = "insert into geopaper(title,author,mname,index_terms,keywords,abstract,pdate) " \
        "values(\'{0}\',\'{1}\',\'{2}\',\'{3}\',\'{4}\',\'{5}\',\'{6}\');"
    # ``counter`` maps author name -> number of papers.
    counter = insert_into_mysql(sources, insert1)
    for name in counter:
        number = counter[name]
        sql2 = 'insert into geocount(name,numofpapers) value(\'{0}\',{1})'.format(name, number)
        handledata(sql2)
    print('Well done!!!')