def construct_all_from_folder(
        dictionaries_folder,
        string_tokenizer=(lambda x: x.split()),
        case_sensitive=False,
        hdfs_url=None,
        hdfs_user=None,
        stop_words=None,
        accepted_extensions=[".dic", "dict", ".txt", ".tsv", ".csv"]):
    """Read every accepted dictionary file found under a folder.

    The folder is looked up on HDFS when ``maybe_get_hdfs_client`` yields a
    truthy client for ``hdfs_url``/``hdfs_user``; otherwise it is looked up
    on the local file system, including all nested sub-folders.

    Args:
        dictionaries_folder: root folder to search for dictionary files.
        string_tokenizer: callable forwarded to ``__read_dictionaries``;
            the default splits a string on whitespace.
        case_sensitive: forwarded unchanged to ``__read_dictionaries``.
        hdfs_url: forwarded to ``maybe_get_hdfs_client``.
        hdfs_user: forwarded to ``maybe_get_hdfs_client``.
        stop_words: forwarded unchanged to ``__read_dictionaries``.
        accepted_extensions: filename suffixes; a path is kept when it ends
            with any of them.

    Returns:
        Whatever ``DictionaryFeatureGenerator.__read_dictionaries`` returns
        for the selected paths.
    """
    # NOTE(review): the default entry "dict" has no leading dot, so it also
    # accepts any name merely ending in "dict" -- possibly a typo for
    # ".dict"; left untouched to keep the default behavior. TODO confirm.
    #
    # Snapshot the suffixes once: str.endswith takes a tuple directly, and
    # this also keeps a one-shot iterable from being exhausted after the
    # first filename is tested.
    suffixes = tuple(accepted_extensions)

    def accept_filename_fun(filename):
        return filename.endswith(suffixes)

    hdfs_client = maybe_get_hdfs_client(hdfs_url, hdfs_user)

    if hdfs_client:
        # hdfs
        dic_paths = walk_hdfs_directory(
            hdfs_client, dictionaries_folder, accept_filename_fun)
        read_function = DictionaryFeatureGenerator.__hdfs_read_function(
            hdfs_client)
    else:
        # local file system
        # glob's recursive=True only has an effect when the pattern contains
        # "**"; the previous "<folder>/*" pattern silently ignored files in
        # sub-folders despite requesting recursion.
        pattern = os.path.join(dictionaries_folder, "**", "*")
        dic_paths = (
            path
            for path in glob.glob(pattern, recursive=True)
            if accept_filename_fun(path)
        )
        read_function = DictionaryFeatureGenerator.__localfs_read_function

    return DictionaryFeatureGenerator.__read_dictionaries(
        dic_paths, read_function, string_tokenizer, case_sensitive,
        stop_words)
def construct_all_from_paths(
        dictionaries_paths,
        string_tokenizer=(lambda x: x.split()),
        case_sensitive=False,
        hdfs_url=None,
        hdfs_user=None,
        stop_words=None,
        accepted_extensions=[".dic", "dict", ".txt", ".tsv", ".csv"]):
    """Read the dictionaries located at explicitly given paths.

    Args:
        dictionaries_paths: either an iterable of paths, or a single string
            holding whitespace-separated paths (it is split on whitespace).
        string_tokenizer: callable forwarded to ``__read_dictionaries``;
            the default splits a string on whitespace.
        case_sensitive: forwarded unchanged to ``__read_dictionaries``.
        hdfs_url: forwarded to ``maybe_get_hdfs_client``.
        hdfs_user: forwarded to ``maybe_get_hdfs_client``.
        stop_words: forwarded unchanged to ``__read_dictionaries``.
        accepted_extensions: not used by this function; kept so existing
            callers that pass it keep working.

    Returns:
        Whatever ``DictionaryFeatureGenerator.__read_dictionaries`` returns
        for the given paths.
    """
    # isinstance (rather than an exact type check) also recognizes str
    # subclasses; otherwise such a value would be iterated character by
    # character as if each character were a path.
    if isinstance(dictionaries_paths, str):
        dictionaries_paths = dictionaries_paths.split()

    hdfs_client = maybe_get_hdfs_client(hdfs_url, hdfs_user)

    # Pick the reader matching where the files live: HDFS when a client is
    # available, the local file system otherwise.
    if hdfs_client:
        read_function = DictionaryFeatureGenerator.__hdfs_read_function(
            hdfs_client)
    else:
        read_function = DictionaryFeatureGenerator.__localfs_read_function

    return DictionaryFeatureGenerator.__read_dictionaries(
        dictionaries_paths, read_function, string_tokenizer, case_sensitive,
        stop_words)
def __init__(
        self,
        directory,
        read_only_class_id=None,
        delete_incomplete_docs=True,
        is_predicted=False,
        read_relations=False,
        whole_basename_as_docid=False,
        raise_exception_on_incosistencies=True,
        hdfs_url=None,
        hdfs_user=None):
    """Store the settings used to read *.ann.json annotation files.

    Args:
        directory: the directory containing *.ann.json files.
        read_only_class_id: a single class id or a list of class ids; only
            entities with these ids are read. ``None`` means read all
            entities. A non-list value is wrapped into a one-element list.
        delete_incomplete_docs: delete documents from the dataset that are
            not marked as 'anncomplete', provided the docs are not predicted.
        is_predicted: whether the annotation is predicted or real, which
            determines where it will be saved.
        read_relations: whether relations should be read as well.
        whole_basename_as_docid: stored as given (presumably selects the
            file's whole basename as document id -- confirm with usages).
        raise_exception_on_incosistencies: stored as given (presumably
            raise instead of tolerating inconsistencies -- confirm).
        hdfs_url: forwarded to ``maybe_get_hdfs_client``.
        hdfs_user: forwarded to ``maybe_get_hdfs_client``.
    """
    # Keep None and lists untouched; wrap any other value in a list so the
    # stored attribute is always either None or a list of class ids.
    if read_only_class_id is None or isinstance(read_only_class_id, list):
        class_ids = read_only_class_id
    else:
        class_ids = [read_only_class_id]

    # Location of the annotation files.
    self.directory = directory

    # Entity filter (None = no filtering).
    self.read_only_class_id = class_ids

    # Behavior switches, stored exactly as passed in.
    self.delete_incomplete_docs = delete_incomplete_docs
    self.is_predicted = is_predicted
    self.read_relations = read_relations
    self.whole_basename_as_docid = whole_basename_as_docid
    self.raise_exception_on_incosistencies = raise_exception_on_incosistencies

    # Result of maybe_get_hdfs_client for the given url/user.
    self.hdfs_client = maybe_get_hdfs_client(hdfs_url, hdfs_user)
def __init__(
        self,
        path,
        whole_basename_as_docid=False,
        hdfs_url=None,
        hdfs_user=None):
    """Store the settings used to read html input.

    Args:
        path: an html file or a directory containing .html files.
        whole_basename_as_docid: stored as given (presumably selects the
            file's whole basename as document id -- confirm with usages).
        hdfs_url: forwarded to ``maybe_get_hdfs_client``.
        hdfs_user: forwarded to ``maybe_get_hdfs_client``.
    """
    # An html file or a directory containing .html files.
    self.path = path

    # Kept verbatim for later use by the instance.
    self.whole_basename_as_docid = whole_basename_as_docid

    # Result of maybe_get_hdfs_client for the given url/user.
    self.hdfs_client = maybe_get_hdfs_client(hdfs_url, hdfs_user)