Ejemplo n.º 1
0
def get_directory_information(collection):
    """Collect the listing and configuration data for a collection.

    Resolves ``collection`` via ``real_directory``, checks read access
    with ``assert_allowed_to_read``, then gathers the documents (files
    ending in ``.txt``), their annotation-file modification times and
    statistics, the subdirectories, and the per-collection configuration.

    Args:
        collection: collection identifier, passed to ``real_directory``.

    Returns:
        Whatever ``_inject_annotation_type_conf`` returns for the
        assembled ``json_dic`` (keys: items, header, parent, messages,
        description, search_config, disambiguator_config,
        normalization_config, annotation_logging, ner_taggers).

    Raises:
        CollectionNotAccessibleError: if ``get_statistics`` raises
            ``OSError``.
    """
    real_dir = real_directory(collection)

    assert_allowed_to_read(real_dir)

    # Get the document names. Match the full ".txt" suffix (not just
    # "txt") so that the stripped length always equals the suffix length
    # and names such as "footxt" are not mistaken for documents.
    txt_suffix = '.txt'
    base_names = [fn[:-len(txt_suffix)] for fn in _listdir(real_dir)
                  if fn.endswith(txt_suffix)]

    doclist_header = [("Document", "string")]

    # Then get the modification times (taken from the joined annotation
    # file of each document, not from the text file itself)
    doclist = []
    for file_name in base_names:
        file_path = path_join(DATA_DIR, real_dir,
                              file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist.append([file_name, _getmtime(file_path)])
    doclist_header.append(("Modified", "time"))

    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    # Append the statistics columns to each document row; indexing (not
    # zip) keeps a too-short doc_stats an error instead of silently
    # dropping documents.
    doclist = [row + doc_stats[i] for i, row in enumerate(doclist)]
    doclist_header += stats_types

    # One single-element row per subdirectory (just in case, and for
    # generality)
    dirlist = [[name] for name in _listdir(real_dir)
               if isdir(path_join(real_dir, name))]

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = [["c", None] + entry for entry in dirlist]
    combolist.extend(["d", None] + entry for entry in doclist)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    return _inject_annotation_type_conf(real_dir, json_dic={
        'items': combolist,
        'header': doclist_header,
        'parent': parent,
        'messages': [],
        'description': readme_text,
        'search_config': search_config,
        'disambiguator_config': disambiguator_config,
        'normalization_config': normalization_config,
        'annotation_logging': ann_logging,
        'ner_taggers': ner_taggers,
    })
Ejemplo n.º 2
0
def get_directory_information(collection):
    """Collect the listing and configuration data for a collection.

    Resolves ``collection`` via ``real_directory``, checks read access
    with ``assert_allowed_to_read``, then gathers the documents (files
    ending in ``.txt``), their annotation-file modification times and
    statistics, the subdirectories, and the per-collection configuration,
    including the value returned by ``options_get_annlogfile``.

    Args:
        collection: collection identifier, passed to ``real_directory``.

    Returns:
        Whatever ``_inject_annotation_type_conf`` returns for the
        assembled ``json_dic`` (keys: items, header, parent, messages,
        description, search_config, disambiguator_config,
        normalization_config, annotation_logging, ner_taggers, logging).

    Raises:
        CollectionNotAccessibleError: if ``get_statistics`` raises
            ``OSError``.
    """
    real_dir = real_directory(collection)

    assert_allowed_to_read(real_dir)

    # Get the document names. Match the full ".txt" suffix (not just
    # "txt") so that the stripped length always equals the suffix length
    # and names such as "footxt" are not mistaken for documents.
    txt_suffix = '.txt'
    base_names = [
        fn[:-len(txt_suffix)] for fn in _listdir(real_dir)
        if fn.endswith(txt_suffix)
    ]

    doclist_header = [("Document", "string")]

    # Then get the modification times (taken from the joined annotation
    # file of each document, not from the text file itself)
    doclist = []
    for file_name in base_names:
        file_path = path_join(DATA_DIR, real_dir,
                              file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist.append([file_name, _getmtime(file_path)])
    doclist_header.append(("Modified", "time"))

    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    # Append the statistics columns to each document row; indexing (not
    # zip) keeps a too-short doc_stats an error instead of silently
    # dropping documents.
    doclist = [row + doc_stats[i] for i, row in enumerate(doclist)]
    doclist_header += stats_types

    # One single-element row per subdirectory (just in case, and for
    # generality)
    dirlist = [
        [name] for name in _listdir(real_dir)
        if isdir(path_join(real_dir, name))
    ]

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = [["c", None] + entry for entry in dirlist]
    combolist.extend(["d", None] + entry for entry in doclist)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    # send logging directory (sent under the 'logging' key); the local is
    # not called `logging` so it cannot shadow the stdlib module name
    ann_log_file = options_get_annlogfile(real_dir)

    return _inject_annotation_type_conf(
        real_dir,
        json_dic={
            'items': combolist,
            'header': doclist_header,
            'parent': parent,
            'messages': [],
            'description': readme_text,
            'search_config': search_config,
            'disambiguator_config': disambiguator_config,
            'normalization_config': normalization_config,
            'annotation_logging': ann_logging,
            'ner_taggers': ner_taggers,
            'logging': ann_log_file,
        })
Ejemplo n.º 3
0
def get_directory_information(collection):
    """Collect the per-user listing and configuration data for a collection.

    Resolves ``collection`` via ``real_directory`` and checks read access
    with ``assert_allowed_to_read``. Which documents and subdirectories
    are listed depends on the session user:

    * no user or ``'guest'``: no documents and no subdirectories;
    * a user present in ``USER_PASSWORD``: every ``.txt`` document;
    * any other user: the names returned by ``DBlite.get_AnnNull_files``
      plus those returned by ``DBlite.get_AnnING_files`` for that user.

    Args:
        collection: collection identifier, passed to ``real_directory``
            and to the ``DBlite`` queries.

    Returns:
        Whatever ``_inject_annotation_type_conf`` returns for the
        assembled ``json_dic`` (keys: items, header, parent, messages,
        description, search_config, disambiguator_config,
        normalization_config, annotation_logging, ner_taggers).

    Raises:
        CollectionNotAccessibleError: if ``get_statistics`` raises
            ``OSError``.
    """
    real_dir = real_directory(collection)
    assert_allowed_to_read(real_dir)

    user = get_session().get('user')
    is_guest = user is None or user == 'guest'

    # Get the document names
    if is_guest:
        base_names = []
    # Users can be obtained from the configuration file.
    elif user in USER_PASSWORD:
        # Match the full ".txt" suffix (not just "txt") so that the
        # stripped length always equals the suffix length and names such
        # as "footxt" are not mistaken for documents.
        txt_suffix = '.txt'
        base_names = [fn[:-len(txt_suffix)] for fn in _listdir(real_dir)
                      if fn.endswith(txt_suffix)]
    else:
        db = DBlite()
        base_names = db.get_AnnNull_files(collection)
        names_ING = db.get_AnnING_files(collection, user)
        print("names_ING", names_ING, file=sys.stderr)
        base_names.extend(names_ING)

    doclist_header = [("文档", "string")]

    # Then get the modification times (taken from the joined annotation
    # file of each document, not from the text file itself)
    doclist = []
    for file_name in base_names:
        file_path = path_join(DATA_DIR, real_dir,
                              file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist.append([file_name, _getmtime(file_path)])
    doclist_header.append(("修改时间", "time"))

    # Example values observed for the data built below:
    #   stats_types: [('Entities', 'int'), ('Relations', 'int'),
    #                 ('Events', 'int')]
    #   doc_stats:   [[29, 0, 0], [97, 0, 0], [22, 0, 0], ...]
    #   doclist:     [['ned.train-doc-184', 1555259780.624325, 29, 0, 0],
    #                 ['ned.train-doc-181', 1555259780.623239, 97, 0, 0],
    #                 ...]
    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
        print("stats_types:", stats_types, file=sys.stderr)
        print("doc_stats:", doc_stats, file=sys.stderr)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    # Append the statistics columns to each document row; indexing (not
    # zip) keeps a too-short doc_stats an error instead of silently
    # dropping documents.
    doclist = [row + doc_stats[i] for i, row in enumerate(doclist)]
    print("doclist:", doclist, file=sys.stderr)
    doclist_header += stats_types
    print("doclist_header:", doclist_header, file=sys.stderr)

    # Guests see no subdirectories; every other user currently sees all
    # of them.
    # NOTE(review): the original had a separate, identical branch marked
    # "for user ACL" - per-user directory filtering looks unimplemented.
    if is_guest:
        dirlist = []
    else:
        # one single-element row per subdirectory (just in case, and for
        # generality)
        dirlist = [[name] for name in _listdir(real_dir)
                   if isdir(path_join(real_dir, name))]

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    # Document row layout: file name, modification time, entities,
    # relations, events.
    combolist = [["c", None] + entry for entry in dirlist]
    combolist.extend(["d", None] + entry for entry in doclist)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    return _inject_annotation_type_conf(real_dir, json_dic={
        'items': combolist,
        'header': doclist_header,
        'parent': parent,
        'messages': [],
        'description': readme_text,
        'search_config': search_config,
        'disambiguator_config': disambiguator_config,
        'normalization_config': normalization_config,
        'annotation_logging': ann_logging,
        'ner_taggers': ner_taggers,
    })