Example #1
0
def generate_pub_node_from_file(driver, file_name, sheet_name, column_name):
    """
    Extract bibliographic records from a file and write them to neo4j as nodes.

    :param driver: database handle, forwarded to create_or_match_nodes
    :param file_name: path of the bib or Excel file to read
    :param sheet_name: sheet name, forwarded to extract_bib_info_from_file
    :param column_name: column name, forwarded to extract_bib_info_from_file
    :return: result dict; RESULT_CODE is 1200 on success, -1200 when the
             extraction did not succeed (nothing is written), -1201 when the
             database write did not succeed
    """
    result = ini_result()
    extract_result = extract_bib_info_from_file(file_name, sheet_name, column_name)

    # Guard clause: anything other than 1003 means there is nothing to write.
    if extract_result[RESULT_CODE] != 1003:
        print("不写入数据库:" + extract_result[RESULT_MSG])
        result[RESULT_CODE] = -1200
        result[RESULT_MSG] = extract_result[RESULT_MSG]
        return result

    extracted_data = extract_result[RESULT_DATA]  # list of publications
    db_result = create_or_match_nodes(extracted_data, driver)
    if db_result[RESULT_CODE] != 1303:
        result[RESULT_CODE] = -1201
        # Report why the *database* step failed; the extraction message
        # describes a step that succeeded, so it is only used as a fallback.
        result[RESULT_MSG] = db_result.get(RESULT_MSG) or extract_result[RESULT_MSG]
    else:
        result[RESULT_CODE] = 1200
        result[RESULT_MSG] = "success"

    return result
Example #2
0
def upload_bib_add_record(request):
    """
    Handle an upload of one or more bib files and write their publications to neo4j.

    Each uploaded file is saved to the ``upload_file`` directory next to this
    module, parsed, written to the database as Publication nodes, and then
    removed from disk again.

    :param request: request object; the files are read from ``request.FILES`` under the key 'file'
    :return: wrap_result(result); RESULT_CODE is -701 when the files cannot be
             read from the request, -702 when no file was sent, -201 when at
             least one file could not be saved or written to the database
             (RESULT_DATA then lists the names of those files), 200 otherwise
    """
    result = ini_result()
    try:
        files = request.FILES.getlist('file')
    except Exception:
        result[RESULT_MSG] = 'failed to retrieve file in the request'
        result[RESULT_CODE] = -701
        return wrap_result(result)

    if not files:
        result[RESULT_MSG] = 'No file in the request'
        result[RESULT_CODE] = -702
        return wrap_result(result)

    driver = initialize_neo4j_driver()
    upload_dir = os.path.join(os.path.dirname(__file__), 'upload_file')
    today = str(datetime.date.today())

    # Names of the files that could not be fully processed. Names rather than
    # the upload objects are stored so the result stays plain data.
    # NOTE(review): assumes wrap_result serialises the result (e.g. to JSON) - confirm.
    file_not_processed = []
    for file in files:
        # Prefix the original name with today's date, separated by an underscore.
        filename = today + '_' + file.name

        file_path = save_file_stream_on_disk(file, upload_dir, filename)
        if file_path is None:
            file_not_processed.append(file.name)
            continue

        try:
            publication_info = extract_bib_info_from_file(file_path)

            pubs = []
            for entry in publication_info:
                tmp_result_pub = extract_publication_from_bib_info(entry)
                if tmp_result_pub[RESULT_CODE] == 1001:
                    pubs.append(tmp_result_pub[RESULT_DATA])

            # An empty list is passed on as None, as before.
            db_pub_result = create_or_match_nodes(driver,
                                                  pubs or None,
                                                  return_type="class",
                                                  to_create=True)
        finally:
            # Always clean up the temporary copy, even when parsing or the
            # database write raises.
            os.remove(file_path)

        # A failed database write must surface in the final result instead of
        # being overwritten by the unconditional "success" below.
        if db_pub_result[RESULT_CODE] != 1303:
            file_not_processed.append(file.name)

    if file_not_processed:
        result[RESULT_CODE] = -201
        result[RESULT_MSG] = "not all files are written into database"
        result[RESULT_DATA] = file_not_processed
    else:
        result[RESULT_CODE] = 200
        result[RESULT_MSG] = "success"

    return wrap_result(result)
Example #3
0
def add_venue(request):
    """
    Create a Venue node from the JSON body of an AJAX POST request.

    :param request: request object; must be a POST carrying the
                    ``X-Requested-With: XMLHttpRequest`` header and a JSON body
    :return: a response wrapping the result dict. RESULT_CODE is -10 for an
             unsupported request form, -301 for an empty body, -1 when the body
             is not valid JSON, 0 when the Venue node could not be created and
             1 when it was created
    """
    result = ini_result()
    is_ajax = request.META.get('HTTP_X_REQUESTED_WITH') == 'XMLHttpRequest'
    if not (is_ajax and request.method == 'POST'):
        result[RESULT_CODE] = -10
        result[RESULT_MSG] = "not supported request form"
        return wrap_result(result)

    node_info = request.body
    # The body is bytes, so a comparison with "" never matches; test emptiness instead.
    if not node_info:
        result[RESULT_CODE] = -301
        result[RESULT_MSG] = "no data is given"
        return wrap_result(result)

    try:
        node_info = json.loads(bytes.decode(node_info))
    except (json.JSONDecodeError, TypeError, UnicodeDecodeError):
        # A tuple is required here: "except A or B" only ever catches A.
        result[RESULT_CODE] = -1
        result[RESULT_MSG] = "given data is not a json string"
        return HttpResponse(json.dumps(result, ensure_ascii=False),
                            content_type='application/json',
                            charset='utf-8')

    driver = initialize_neo4j_driver()

    # Build the Venue model from the decoded data before writing it.
    venue = [Venue("", node_info)]

    db_ven_result = create_or_match_nodes(driver,
                                          venue,
                                          return_type="class",
                                          to_create=True)
    if db_ven_result[RESULT_CODE] != 1303:
        result[RESULT_CODE] = 0
        result[RESULT_MSG] = "Venue节点生成失败"
    else:
        result[RESULT_CODE] = 1
        result[RESULT_MSG] = "Venue节点生成成功"

    return wrap_result(result)
Example #4
0
def build(driver, file_name, sheet_name=None, column_name=None):
    """
    Build the publication graph from a file.

    Extracts bibliographic entries from the file, creates Publication, Venue
    and Person nodes, and then creates the PUBLISHED_IN and AUTHORED_BY
    relations between them.

    :param driver: database driver; must not be None
    :param file_name: path of the file to read; must exist
    :param sheet_name: forwarded to extract_bib_info_from_file
    :param column_name: forwarded to extract_bib_info_from_file
    :return: result dict; RESULT_CODE is -504 when the file does not exist,
             -500 when no driver is given, -1205 when any of the node-creation
             steps fails, -1400 when at least one relation could not be
             created, and 1400 when no relation failed
    """
    result = ini_result()
    if file_name is None or not os.path.exists(file_name):
        result[RESULT_CODE] = -504
        result[RESULT_MSG] = "file does not exist"
        return result
    if driver is None:
        result[RESULT_CODE] = -500
        result[RESULT_MSG] = "the database is not configured!"
        return result

    publication_info = extract_bib_info_from_file(file_name, sheet_name, column_name)

    # Parse every entry into a publication, a venue and a person; entries
    # that fail to parse for one kind are simply left out of that list.
    pubs, venues, persons = [], [], []
    for entry in publication_info:
        tmp_result_pub = extract_publication_from_bib_info(entry)
        if tmp_result_pub[RESULT_CODE] == 1001:
            pubs.append(tmp_result_pub[RESULT_DATA])

        tmp_result_venue = extract_venue_from_bib_info(entry)
        if tmp_result_venue[RESULT_CODE] == 1005:
            venues.append(tmp_result_venue[RESULT_DATA])

        tmp_result_person = extract_person_from_bib_info(entry)
        if tmp_result_person[RESULT_CODE] == 1006:
            persons.append(tmp_result_person[RESULT_DATA])

    # Empty lists are passed on as None.
    pubs = pubs or None
    venues = venues or None
    persons = persons or None

    # Write the nodes; all three writes are attempted so the message can name
    # every step that failed.
    db_pub_result = create_or_match_nodes(driver, pubs, return_type="class", to_create=True)
    if db_pub_result[RESULT_CODE] != 1303:
        result[RESULT_CODE] = -1205
        result[RESULT_MSG] = "Publication节点生成失败"
    db_ven_result = create_or_match_nodes(driver, venues, return_type="class", to_create=True)
    if db_ven_result[RESULT_CODE] != 1303:
        result[RESULT_CODE] = -1205
        result[RESULT_MSG] += "。Venue节点生成失败"
    db_ppl_result = create_or_match_nodes(driver, persons, return_type="class", to_create=True)
    if db_ppl_result[RESULT_CODE] != 1303:
        result[RESULT_CODE] = -1205
        result[RESULT_MSG] += "。Person节点生成失败"

    if result[RESULT_CODE] == -1205:
        return result

    failed_pair = []

    # PUBLISHED_IN edges
    publish_in_info_result = extract_rel_publish_in_from_pub_info(db_pub_result[RESULT_DATA])
    if publish_in_info_result[RESULT_CODE] in [1009, 1010]:
        failed_pair += _build_relations_for_pairs(
            driver, publish_in_info_result[RESULT_DATA]["success"], "VENUE", "PUBLISHED_IN")

    # AUTHORED_BY edges
    author_by_info_result = extract_rel_author_by_from_pub_info(db_pub_result[RESULT_DATA])
    if author_by_info_result[RESULT_CODE] in [1009, 1010]:
        failed_pair += _build_relations_for_pairs(
            driver, author_by_info_result[RESULT_DATA]["success"], "PERSON", "AUTHORED_BY")

    # Any failed pair means the build was not fully successful. (The original
    # condition was inverted and reported success exactly when pairs failed.)
    if failed_pair:
        result[RESULT_CODE] = -1400
        result[RESULT_MSG] = "partially or fully failed"
    else:
        result[RESULT_CODE] = 1400
        result[RESULT_MSG] = "success"

    return result


def _build_relations_for_pairs(driver, pairs, target_type, rel_type):
    """
    Link each pair's publication to its matched target node; return the pairs that failed.

    :param driver: database driver
    :param pairs: entries holding a "pub" object with a ``uuid`` attribute
    :param target_type: node type of the edge target, e.g. "VENUE" or "PERSON"
    :param rel_type: relation type to query or create
    :return: list of the entries whose node lookup or relation step failed
    """
    failed = []
    for entry in pairs:
        node_result = create_or_match_nodes(driver, [entry], target_type)
        if node_result[RESULT_CODE] != 904:
            failed.append(entry)
            continue
        rel_result = query_or_create_relation(driver, "PUBLICATION", entry["pub"].uuid, target_type,
                                              node_result[RESULT_DATA][0].uuid, rel_type)
        if rel_result[RESULT_CODE] not in [1304, 1306]:
            failed.append(entry)
    return failed
Example #5
0
def build_relation_from_node_attribute(driver, source_node_type="PUBLICATION", target_node_type="VENUE",
                                       rel_type="PUBLISHED_IN", filters={"node_type": "ARTICLE"},
                                       info_field='JOURNAL', use_source=1, do_split=False):
    """
    Derive nodes and relations from one attribute of existing nodes.

    Nodes of one end of the relation are queried (optionally filtered), the
    value of ``info_field`` is read from each of them, nodes of the other end
    are created/matched from those values, and the relation is created between
    each queried node and its derived node. Splitting one field value into
    several nodes is only supported when the derived node type is
    ``NODE_TYPES[2]`` (person).

    :param driver: neo4j driver
    :param source_node_type: node type at the start of the edge
    :param target_node_type: node type at the end of the edge
    :param rel_type: edge type
    :param filters: dict or None; property name -> required value used to
                    filter the queried nodes. The default dict is only read,
                    never modified, by this function.
    :param info_field: name of the property to analyse
    :param use_source: truthy: query source nodes and derive target nodes;
                       falsy: query target nodes and derive source nodes
    :param do_split: whether the field value should be split into several nodes
    :return: result dict. RESULT_CODE is
             -500 (no driver), -1202 (invalid node type), -1203 (invalid edge
             type), -1204 (invalid field), -901 (invalid arguments), -910
             (query failed), 125 (no node matched), 126 (no matched node has
             content in the field), -128 (creating/matching the derived nodes
             failed), -131 (some edges were not created), 127 (all edges created)
    """
    result = ini_result()

    if driver is None:
        result[RESULT_CODE] = -500
        result[RESULT_MSG] = "driver is not given"
        return result

    if source_node_type not in NODE_TYPES or target_node_type not in NODE_TYPES:
        result[RESULT_CODE] = -1202
        result[RESULT_MSG] = "node type is not valid"
        return result

    if rel_type not in EDGE_TYPES:
        result[RESULT_CODE] = -1203
        result[RESULT_MSG] = "edge type is not valid"
        return result

    if info_field not in FIELD_NAMES_PUB:
        result[RESULT_CODE] = -1204
        result[RESULT_MSG] = "field is not valid"
        return result

    if (filters is not None and not isinstance(filters, dict)) or \
            (filters is not None and len(set(filters.keys()) & set(FIELD_NAMES_PUB)) == 0) or \
            use_source is None or do_split is None:
        result[RESULT_CODE] = -901
        result[RESULT_MSG] = "invalid arguments"
        return result

    identifier = 'node'
    # queried_type: the nodes we read from; new_type: the nodes we derive.
    if use_source:
        queried_type, new_type = source_node_type, target_node_type
        own_key, new_key = "source", "target"
    else:
        queried_type, new_type = target_node_type, source_node_type
        own_key, new_key = "target", "source"

    # Build the query. Filter values are passed as query parameters rather
    # than concatenated into the text, so quotes and non-string values are safe.
    # NOTE(review): filter keys are still interpolated; only one key is
    # required to be a known field by the validation above.
    cypher_params = {}
    if filters is not None:
        conditions = []
        for position, (key, value) in enumerate(filters.items()):
            param_name = "filter_" + str(position)
            conditions.append(identifier + "." + key + "=$" + param_name)
            cypher_params[param_name] = value
        cypher = "match ({IF}:{NODE}) where {FILTER} return {IF}".format(
            IF=identifier, NODE=queried_type, FILTER=" and ".join(conditions))
    else:
        cypher = "match ({IF}:{NODE}) return {IF}".format(IF=identifier, NODE=queried_type)

    try:
        with driver.session() as session:
            # Materialise the records while the session is still open.
            records = list(session.run(cypher, cypher_params))
    except Exception:
        result[RESULT_CODE] = -910
        result[RESULT_MSG] = "数据库连接失败"
        return result

    # Collect (queried node uuid, derived node key, edge parameters) triples.
    data_pair = []
    counter_has_content = 0
    counter_all = 0
    # TODO: records with content are not checked for successful processing.
    for record in records:
        record_id = record[identifier][FIELD_NAMES_PUB[31]]  # uuid
        print("提取{NODE}与{NODE2}之间关系{REL}过程:查询到节点:".format(NODE=source_node_type, NODE2=target_node_type,
                                                          REL=rel_type) + str(record_id))
        counter_all += 1
        record_field = record[identifier][info_field]
        if not string_util(record_field):
            print("{ID} has empty {FIELD} field".format(ID=record_id, FIELD=info_field))
            continue

        if do_split and new_type.upper() == NODE_TYPES[2]:  # person
            # process_person_names returns a dict keyed by the original
            # string; each value is a list of {name, index} dicts.
            names = process_person_names([record_field])[record_field]
            for name in names:
                data_pair.append({own_key: record_id, new_key: name["name"],
                                  "parameter": {"index": name["index"]}})
        else:
            # Every pair carries a "parameter" key so the edge-creation step
            # can read it regardless of direction.
            data_pair.append({own_key: record_id, new_key: record_field, "parameter": None})
            if do_split:
                result[RESULT_MSG] += "暂不支持针对【" + new_type + "】的拆分"
                print("暂不支持针对【" + new_type + "】的拆分")
        counter_has_content += 1

    # NOTE(review): NODE_Q is filled with the derived type although the
    # message speaks about the queried nodes - confirm the intended wording.
    if counter_all == 0:
        result[RESULT_CODE] = 125
        result[RESULT_MSG] += "\n 提取{NODE}与{NODE2}之间关系{REL}过程:未查询到{NODE_Q}节点中满足条件{FILTER}的节点".format(
            NODE=source_node_type, FILTER=str(filters), NODE2=target_node_type, REL=rel_type, NODE_Q=new_type)
        print(result[RESULT_MSG])
        return result
    if counter_has_content == 0:
        result[RESULT_CODE] = 126
        result[RESULT_MSG] += "\n 提取{NODE}与{NODE2}之间关系{REL}过程:在满足条件{FILTER}的{NODE_Q}节点的字段{FIELD}中没有有效信息".format(
            NODE=source_node_type, FILTER=str(filters), NODE2=target_node_type, REL=rel_type, FIELD=info_field,
            NODE_Q=new_type)
        print(result[RESULT_MSG])
        return result

    # Distinct keys of the nodes to create/match.
    # TODO: not all records are checked for successful processing.
    data_switched = list({pair[new_key] for pair in data_pair})
    info = {"name": data_switched, "type": new_type}
    nodes = wrap_info_to_model(info, filters)  # wrapped model instances

    # NOTE(review): `database_info` is not defined in this function; it must
    # exist at module level, or `driver` was intended - confirm.
    create_result = create_or_match_nodes(nodes, database_info, to_create=True, return_type="class")
    if create_result[RESULT_CODE] < 0:
        result[RESULT_CODE] = -128
        result[RESULT_MSG] = create_result[RESULT_MSG] + "\t 查询/新建节点信息失败,停止创建关系"
        return result

    # Map each derived node's key to its uuid.
    mapping = {}
    for datum in create_result[RESULT_DATA]:
        if new_type == NODE_TYPES[0]:  # Publication
            mapping[datum.id] = datum.uuid
        elif new_type == NODE_TYPES[1]:  # Venue
            mapping[datum.venue_name] = datum.uuid
        elif new_type == NODE_TYPES[2]:  # Person
            mapping[datum.full_name] = datum.uuid

    # Replace the key in every pair by the uuid. A pair whose key has no
    # uuid cannot be linked; it is skipped and counts as a failed edge.
    linkable = []
    for pair in data_pair:
        node_uuid = mapping.get(pair[new_key])
        if node_uuid is None:
            continue
        pair[new_key] = node_uuid
        linkable.append(pair)

    # Query/create the edges.
    created = 0
    with driver.session() as session:
        for pair in linkable:
            tmp = query_or_create_relation(session, source_node_type, pair["source"], target_node_type,
                                           pair["target"], rel_type,
                                           to_create=True, parameters=pair["parameter"])
            if tmp[RESULT_CODE] == 170 or tmp[RESULT_CODE] == 171:
                created += 1

    if created < len(data_pair):
        result[RESULT_CODE] = -131
        result[RESULT_MSG] = "部分边创建失败"
    else:
        result[RESULT_CODE] = 127
        result[RESULT_MSG] = "全部边创建成功"
    return result
Example #6
0
    # Script fragment: read the neo4j connection settings from a config file,
    # open a driver, parse a bib file and write the parsed entries as nodes.
    from neo4j import GraphDatabase
    import neo4j
    from configparser import ConfigParser

    neo4j_config_file = "/Volumes/Transcend/web/web_pages/webs/neo4j.conf"

    cf = ConfigParser()
    cf.read(neo4j_config_file, encoding="utf-8")

    # Connection settings come from the [neo4j] section of the config file.
    uri = cf.get("neo4j", "uri")
    neo4j_username = cf.get("neo4j", "username")
    neo4j_pwd = cf.get("neo4j", "pwd")

    driver = GraphDatabase.driver(uri,
                                  auth=neo4j.basic_auth(
                                      neo4j_username, neo4j_pwd))

    # Parse the publications from the file,
    # flag, msg, info = parse_bib("bibtex.bib")
    result0 = parse_bib_file("/Users/X/Downloads/reference.bib")
    if result0["code"] < 0:
        print(result0["msg"])
    # and create the nodes.
    # NOTE(review): the nodes are written even when the parse reported a negative code - confirm this is intended.

    create_or_match_nodes(result0["data"], driver)
    # # Parse the publication nodes from the graph, extract the journal information, and create venue nodes and [wenxian]->[publication]
    # build_network_of_venues(node_type="ARTICLE", publication_field="journal")
    # build_network_of_venues(node_type="inproceedings".upper(), publication_field="book_title")
    # # Parse the author field of the publications and create Person nodes and person->publication
    # build_network_of_persons()
Example #7
0
def add_publication(request):
    """
    Create or match a Publication node from a POST request.

    The publication data is read from the JSON body for AJAX requests
    (``X-Requested-With: XMLHttpRequest``) and from the form data otherwise.
    A list of authors is flattened into a single "last, first middle and ..."
    string, and the numeric ``node_type`` code ("0".."13") is replaced by its
    publication type name before the data is handed to create_or_match_nodes.

    :param request: request object
    :return: HTTP response carrying the result dict. RESULT_CODE is -7 for a
             non-POST request, -301 for an empty AJAX body, -4 when the body
             is not a JSON object, -3 when no form data is available, -5 for
             an unsupported publication type; otherwise the code, message and
             data returned by create_or_match_nodes (code -6 if it has none),
             plus "count" with the number of returned items
    """
    result = ini_result()
    result[RESULT_COUNT] = -1

    def respond():
        # Serialise the current state of `result` as a JSON response.
        return HttpResponse(json.dumps(result, ensure_ascii=False),
                            content_type='application/json',
                            charset='utf-8')

    if request.method != 'POST':
        result[RESULT_MSG] = "请求方式应为post"
        result[RESULT_CODE] = -7
        return respond()

    is_ajax = request.META.get('HTTP_X_REQUESTED_WITH') == 'XMLHttpRequest'
    if is_ajax:
        pub_info = request.body
        # The body is bytes, so a comparison with "" never matches; test emptiness instead.
        if not pub_info:
            result[RESULT_CODE] = -301
            result[RESULT_MSG] = "no data is given"
            return wrap_result(result)

        try:
            pub_info = json.loads(bytes.decode(pub_info))
        except (json.JSONDecodeError, TypeError, UnicodeDecodeError):
            result[RESULT_MSG] = "given data is not a json string"
            result[RESULT_CODE] = -4
            return respond()

        # Everything below uses dict access; reject JSON arrays/scalars here.
        if not isinstance(pub_info, dict):
            result[RESULT_MSG] = "given data is not a json object"
            result[RESULT_CODE] = -4
            return respond()
    else:
        pub_info = request.POST
        if pub_info is None:
            result[RESULT_MSG] = "no data is given"
            result[RESULT_CODE] = -3
            return respond()
        # The form data of a request is immutable; work on a mutable copy
        # because the author and node_type fields are rewritten below.
        pub_info = pub_info.copy()

    # Options
    to_create = pub_info.get("to_create", False)
    return_type = pub_info.get("return_type", 'dict')

    # Special field: authors
    authors = pub_info.get("author", None)
    if authors is None:
        pub_info["author"] = ""
    elif isinstance(authors, list):
        # A missing middle name is treated as empty.
        pub_info["author"] = " and ".join(
            author["lastName"] + ", " + author["firstName"] + " " + (author.get("middleName") or "")
            for author in authors)

    # Special field: publication type, sent as the string form of its index.
    type_names = ("ARTICLE", "BOOK", "BOOKLET", "CONFERENCE", "INBOOK",
                  "INCOLLECTION", "INPROCEEDINGS", "MANUAL", "MASTERSTHESIS",
                  "MISC", "PHDTHESIS", "PROCEEDINGS", "TECHREPORT",
                  "UNPUBLISHED")
    type_by_code = {str(code): name for code, name in enumerate(type_names)}
    raw_type = pub_info.get("node_type")
    if not isinstance(raw_type, str) or raw_type not in type_by_code:
        result[RESULT_MSG] = "unsupported paper type"
        result[RESULT_CODE] = -5
        return respond()
    pub_info["node_type"] = type_by_code[raw_type]

    # Write to the database.
    # NOTE(review): `driver` is not defined in this function; it must exist at
    # module level - confirm.
    query_result = create_or_match_nodes(driver,
                                         pub_info,
                                         return_type=return_type,
                                         to_create=to_create)

    result[RESULT_MSG] = query_result.get(RESULT_MSG, "查询数据接口无返回值")
    result[RESULT_CODE] = query_result.get(RESULT_CODE, -6)
    result[RESULT_DATA] = query_result.get(RESULT_DATA, [])
    try:
        result["count"] = len(result[RESULT_DATA])
    except (KeyError, TypeError):
        # TypeError: the data has no length (e.g. it is None).
        result["count"] = 0
    return respond()