def fill_new_table(sql, name, ins1, ins2):
    """Copy rows whose author list contains *name* into that author's own table.

    Parameters
    ----------
    sql : str
        SELECT statement, executed via ``getdata``, returning candidate rows
        shaped ``(?, title, authors, mname, index_terms, keywords, abstract,
        pdate)`` (indices 1-7 are read).
    name : str
        The ambiguous author name being disambiguated.
    ins1 : str
        INSERT template used when the matched author carries institution
        info (an extra ``^c``-separated field).
    ins2 : str
        INSERT template used when no institution info is present.

    NOTE(review): the INSERT statements are built with ``str.format`` from raw
    row data — vulnerable to SQL injection / quoting breakage; switch to
    parameterized queries if the DB driver supports them.
    """
    data = getdata(sql)
    row_id = 0  # renamed from ``id`` to avoid shadowing the builtin
    for row in data:
        # str.strip() already removes '\n' and '\r'; one call suffices.
        title = row[1].strip()
        authors = row[2]
        mname = row[3]
        index_terms = row[4]
        keywords = row[5].strip() if row[5] is not None else None
        abstract = row[6].strip()
        pdate = row[7]
        try:
            # Find the author entries whose name equals *name*.
            for author in authors.split(";"):
                parts = re.split(r'\^c', author.strip())
                if parts[0] != name:
                    continue
                row_id += 1
                if len(parts) > 1:
                    # Author already annotated with institution info.
                    insert = ins1.format(row_id, title, parts[1], authors,
                                         mname, index_terms, keywords,
                                         abstract, pdate, name)
                else:
                    insert = ins2.format(row_id, title, authors, mname,
                                         index_terms, keywords, abstract,
                                         pdate, name)
                handledata(insert)
        except Exception as e:
            # Best-effort: report the failed row and continue with the next.
            print(e)
    print("填充新表" + "\"" + name + "\"" + "完成!")
def update_departs(sql, lis, name):
    """Reset ``author_unique`` for rows whose department is not whitelisted.

    ``sql`` is executed via ``getdata`` and must yield rows whose first two
    columns are ``(id, department)``.  Every row whose department is absent
    from *lis* gets its ``author_unique`` column set to NULL in table *name*.
    """
    for row in getdata(sql):
        if row[1] not in lis:
            handledata(
                "update %s set author_unique=NULL where id =%d" % (name, row[0]))
def insert_into_mysql(access_source, insert):
    """Insert cleaned article rows into MySQL and count papers per author.

    Parameters
    ----------
    access_source : iterable
        Rows shaped ``(title, authors, mname, index_terms, keywords,
        abstract, pdate)`` pulled from the Access source database.
    insert : str
        INSERT-statement template filled via ``str.format``.

    Returns
    -------
    dict
        Mapping ``author -> number of papers`` seen in the source data.

    NOTE(review): statements are built with ``str.format`` from raw field
    values — prefer parameterized queries to avoid quoting/injection issues.
    """
    author_paper_num = {}  # per-author paper counts over the raw data

    def _count(author_key):
        # Tally one occurrence of an author (manual Counter).
        author_paper_num[author_key] = author_paper_num.get(author_key, 0) + 1

    for row in access_source:
        title = row[0].strip()
        mname = row[2]
        index_terms = row[3]
        keywords = row[4]
        abstract = row[5]
        pdate = row[6]
        if abstract is None or abstract.strip() == "":
            continue  # skip articles without an abstract
        try:
            authors = row[1]
            # BUG FIX: str.replace returns a new string; the original code
            # discarded the result, so these markers were never stripped.
            # "0800^a" precedes a collective-contributor name and "0070^a"
            # precedes a partial author name — neither acts as a separator.
            authors = authors.replace("0800^a", "").replace("0070^a", "")
            # "2300^a" precedes an author name (mostly in co-authored papers
            # with foreign authors) and does act as a separator.
            authorlist = re.split(r';|2300\^a', authors.strip())
            if len(authorlist) < 2:
                # Single-author article.
                single = authorlist[0].strip()
                handledata(insert.format(title, single, mname, index_terms,
                                         keywords, abstract, pdate))
                _count(single)
            else:
                # Multi-author article: keep only name + institution info.
                new_authors = []
                for author in authorlist:
                    parts = re.split(r'\^c|\^d|\^e|\^f', author.strip())
                    if len(parts) > 2:
                        temp = parts[0].strip() + "^c" + parts[1].strip()
                    else:
                        temp = parts[0].strip()
                    new_authors.append(temp)
                    _count(temp)
                handledata(insert.format(title, ";".join(new_authors), mname,
                                         index_terms, keywords, abstract,
                                         pdate))
        except Exception as e:
            # Best-effort: report the failed row and move on.
            print(e)
    return author_paper_num
# NOTE(review): this is a verbatim duplicate definition of
# insert_into_mysql() — an identical copy appears earlier in this file and
# this one silently shadows it.  Keep only one copy.
def insert_into_mysql(access_source, insert):
    """Insert cleaned article rows into MySQL and count papers per author.

    Parameters
    ----------
    access_source : iterable
        Rows shaped ``(title, authors, mname, index_terms, keywords,
        abstract, pdate)`` pulled from the Access source database.
    insert : str
        INSERT-statement template filled via ``str.format``.

    Returns
    -------
    dict
        Mapping ``author -> number of papers`` seen in the source data.

    NOTE(review): statements are built with ``str.format`` from raw field
    values — prefer parameterized queries to avoid quoting/injection issues.
    """
    author_paper_num = {}  # per-author paper counts over the raw data

    def _count(author_key):
        # Tally one occurrence of an author (manual Counter).
        author_paper_num[author_key] = author_paper_num.get(author_key, 0) + 1

    for row in access_source:
        title = row[0].strip()
        mname = row[2]
        index_terms = row[3]
        keywords = row[4]
        abstract = row[5]
        pdate = row[6]
        if abstract is None or abstract.strip() == "":
            continue  # skip articles without an abstract
        try:
            authors = row[1]
            # BUG FIX: str.replace returns a new string; the original code
            # discarded the result, so these markers were never stripped.
            # "0800^a" precedes a collective-contributor name and "0070^a"
            # precedes a partial author name — neither acts as a separator.
            authors = authors.replace("0800^a", "").replace("0070^a", "")
            # "2300^a" precedes an author name (mostly in co-authored papers
            # with foreign authors) and does act as a separator.
            authorlist = re.split(r';|2300\^a', authors.strip())
            if len(authorlist) < 2:
                # Single-author article.
                single = authorlist[0].strip()
                handledata(insert.format(title, single, mname, index_terms,
                                         keywords, abstract, pdate))
                _count(single)
            else:
                # Multi-author article: keep only name + institution info.
                new_authors = []
                for author in authorlist:
                    parts = re.split(r'\^c|\^d|\^e|\^f', author.strip())
                    if len(parts) > 2:
                        temp = parts[0].strip() + "^c" + parts[1].strip()
                    else:
                        temp = parts[0].strip()
                    new_authors.append(temp)
                    _count(temp)
                handledata(insert.format(title, ";".join(new_authors), mname,
                                         index_terms, keywords, abstract,
                                         pdate))
        except Exception as e:
            # Best-effort: report the failed row and move on.
            print(e)
    return author_paper_num
def create_name_table(name):
    """Create the per-author paper table *name* (InnoDB, utf8) and report it.

    NOTE(review): the ``AUTO_INCREMENT=372`` clause looks copied from a
    ``SHOW CREATE TABLE`` dump — confirm a fresh table should start there.
    """
    ddl = '''CREATE TABLE `{0}` ( `id` int(11) NOT NULL AUTO_INCREMENT, `title` tinytext, `author_unique` varchar(200) DEFAULT NULL, `authors` mediumtext, `mname` tinytext, `index_terms` tinytext, `keywords` tinytext, `abstract` mediumtext, `pdate` varchar(45) DEFAULT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=372 DEFAULT CHARSET=utf8;'''.format(name)
    handledata(ddl)
    print("生成新表:" + name)
def coauthor_dict_match(sql, dictionary, name):
    """Guess the institution of institution-less papers via their co-authors.

    Parameters
    ----------
    sql : str
        SELECT, executed via ``getdata``, returning ``(authors, paper_id)``
        rows for papers of the ambiguous author *name* lacking institution
        info.
    dictionary : dict
        Mapping ``institution -> collection of co-author names``.
    name : str
        The ambiguous author; their own entry in the author list is skipped.

    When a paper's co-authors match exactly one institution, supported by at
    least two co-author hits, the match is printed, appended to
    ``./data/depart_coauthor_match.txt`` and written back with an UPDATE on
    table *name*.

    Cleanup vs. original: removed a commented-out debugging block and the
    unused ``paper_get_type`` / ``get_more_type`` counters.
    """
    data = getdata(sql)  # papers without institution info
    only_one_type = 0  # papers resolved to exactly one institution
    for line in data:
        departs = {}  # institution -> number of matching co-authors
        paper_id = line[1]
        for coauthor in line[0].split(";"):  # the paper's author list
            author = coauthor.split("^c")
            if author[0] == name:
                continue  # skip the ambiguous author themself
            for depart, members in dictionary.items():
                if author[0] in members:  # co-author belongs to this institution
                    departs[depart] = departs.get(depart, 0) + 1
        if len(departs) == 1:
            for key in departs:
                if departs[key] > 1:  # require at least two co-author hits
                    print("%d:%s\n" % (paper_id, key))
                    with open('./data/depart_coauthor_match.txt', 'a', encoding='utf-8') as f:
                        f.write("%d:%s\n" % (paper_id, key))
                    only_one_type += 1
                    handledata("update {0} set author_unique = '{1}' where id ={2};".format(name, key, int(paper_id)))
    print("匹配到一个机构信息且至少两个合著者的文章个数:" + str(only_one_type))
# NOTE(review): this region originally held two overlapping, syntactically
# broken duplicate pastes of the tail of insert_into_mysql() plus two copies
# of the __main__ guard (bad paste/merge); consolidated into one valid
# entry point with unchanged statements.
if __name__ == '__main__':
    # Pull article records that carry an abstract from the Access digest
    # database; insert_into_mysql filters/normalizes the author field.
    sql1 = 'select title,author,mname,keyword,NN_AKEYWORD_S,abstract,pdate from 查询;'
    sources = link_access(sql1)
    insert1 = "insert into geopaper(title,author,mname,index_terms,keywords,abstract,pdate) " \
        "values(\'{0}\',\'{1}\',\'{2}\',\'{3}\',\'{4}\',\'{5}\',\'{6}\');"
    # ``counter`` maps author name -> number of papers.
    counter = insert_into_mysql(sources, insert1)
    for name in counter:
        number = counter[name]
        sql2 = 'insert into geocount(name,numofpapers) value(\'{0}\',{1})'.format(name, number)
        handledata(sql2)
    print('Well done!!!')