def __init__(self, np2vec_model_file, binary=False, word_ngrams=False, grouping=False):
    """
    Load the np2vec model for set expansion.

    Args:
        np2vec_model_file (str): path to the file containing the np2vec model to load
        binary (bool): whether the np2vec model to load is in binary format
        word_ngrams (int {1, 0}): if 1, the np2vec model to load uses word vectors with
            subword (ngrams) information.
        grouping (bool): whether to load the grouping maps (np2id, id2rep, id2group).
    """
    self.grouping = grouping
    if grouping:
        # load grouping info
        logger.info('loading grouping data')
        self.id2rep = load_json_file(path.join(cur_dir, 'id2rep'))
        self.np2id = load_json_file(path.join(cur_dir, 'np2id'))
        self.id2group = load_json_file(path.join(cur_dir, 'id2group'))
    logger.info('loading model...')
    self.np2vec_model = NP2vec.load(np2vec_model_file, binary=binary, word_ngrams=word_ngrams)
    # extract the first term of the model in order to get the marking character
    logger.info('compute L2 norm')
    first_term = next(iter(self.np2vec_model.vocab.keys()))
    self.mark_char = first_term[-1]
    # Precompute L2-normalized vectors.
    self.np2vec_model.init_sims()
    logger.info('done init')
def is_stop(token: str) -> bool:
    """Return True if token is a stop word (including disambiguation categories)."""
    if not StringUtils.stop_words:
        StringUtils.stop_words = load_json_file(STOP_WORDS_FILE)
        StringUtils.stop_words.extend(DISAMBIGUATION_CATEGORY)
    return token in StringUtils.stop_words
def is_pronoun(in_str: str) -> bool:
    """Return True if in_str is a single token that is a pronoun."""
    if not StringUtils.pronouns:
        StringUtils.pronouns = load_json_file(PRONOUN_FILE)
    tokens = in_str.split()
    return len(tokens) == 1 and tokens[0] in StringUtils.pronouns
def is_determiner(in_str: str) -> bool:
    """Return True if in_str is a single token that is a determiner."""
    if not StringUtils.determiners:
        StringUtils.determiners = load_json_file(DETERMINERS_FILE)
    tokens = in_str.split()
    return len(tokens) == 1 and tokens[0] in StringUtils.determiners
def load_mentions_from_file(self, mentions_file_path: str) -> List[Topic]:
    """Load mentions from a JSON file and arrange them by topic."""
    start_data_load = time.time()
    logger.info('Loading mentions from-%s', mentions_file_path)
    mentions = load_json_file(mentions_file_path)
    topics = self.order_mentions_by_topics(mentions)
    end_data_load = time.time()
    took_load = end_data_load - start_data_load
    logger.info('Mentions file-%s, took:%.4f sec to load', mentions_file_path, took_load)
    return topics
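# A minimal usage sketch for load_mentions_from_file. The owning class is not shown in
# this excerpt, so the class name ("TopicsLoader") and the mentions file path are
# hypothetical; the method only assumes a JSON mentions file and an
# order_mentions_by_topics() helper on the same class.
loader = TopicsLoader()
topics = loader.load_mentions_from_file('data/gold_mentions.json')
print('loaded %d topics' % len(topics))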
def is_preposition(in_str: str) -> bool:
    """Return True if in_str is a single token that is a preposition."""
    if not StringUtils.preposition:
        StringUtils.preposition = load_json_file(PREPOSITION_FILE)
    tokens = in_str.split()
    return len(tokens) == 1 and tokens[0] in StringUtils.preposition
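# A small usage sketch for the StringUtils predicates above (is_stop, is_pronoun,
# is_determiner, is_preposition), assuming they are exposed as static methods on
# StringUtils. The example tokens are illustrative; the actual results depend on the
# JSON word lists that are loaded lazily on first call.
for word in ['the', 'she', 'under', 'tennis']:
    print(word,
          StringUtils.is_stop(word),
          StringUtils.is_pronoun(word),
          StringUtils.is_determiner(word),
          StringUtils.is_preposition(word))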
def __init__(self, wd_file: str):
    """
    Extract Relation between two mentions according to Within document co-reference

    Args:
        wd_file (required): str
            Location of within doc co-reference mentions file
    """
    wd_mentions_json = load_json_file(wd_file)
    self.within_doc_coref_chain = self.arrange_resource(wd_mentions_json)
    super(WithinDocCoref, self).__init__()
def __init__(
        self,
        np2vec_model_file,
        binary=False,
        word_ngrams=False,
        grouping=False,
        light_grouping=False,
        grouping_map_dir=None,
):
    """
    Load the np2vec model for set expansion.

    Args:
        np2vec_model_file (str): path to the file containing the np2vec model to load
        binary (bool): whether the np2vec model to load is in binary format
        word_ngrams (int {1, 0}): if 1, the np2vec model to load uses word vectors with
            subword (ngrams) information.
        grouping (bool): whether to load the grouping maps.
        light_grouping (bool): if True, load only the np2id map and skip id2rep and id2group.
        grouping_map_dir (str): path to the directory containing maps for grouping.
    """
    self.grouping = grouping
    if grouping:
        # load grouping info
        logger.info("loading grouping data")
        if not grouping_map_dir:
            grouping_map_dir = path.dirname(np2vec_model_file)
        self.np2id = load_json_file(path.join(grouping_map_dir, "np2id"))
        if not light_grouping:
            self.id2rep = load_json_file(path.join(grouping_map_dir, "id2rep"))
            self.id2group = load_json_file(path.join(grouping_map_dir, "id2group"))
    logger.info("loading model...")
    self.np2vec_model = NP2vec.load(np2vec_model_file, binary=binary, word_ngrams=word_ngrams)
    # extract the first term of the model in order to get the marking character
    logger.info("compute L2 norm")
    first_term = next(iter(self.np2vec_model.vocab.keys()))
    self.mark_char = first_term[-1]
    # Precompute L2-normalized vectors.
    self.np2vec_model.init_sims()
    logger.info("done init")
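# A minimal sketch of constructing the set-expansion wrapper defined above. The class
# name ("SetExpand") and the model path are assumptions not taken from this code; only
# the constructor arguments shown above are grounded in it.
expander = SetExpand(
    np2vec_model_file='models/np2vec_model.txt',
    binary=False,
    word_ngrams=0,
    grouping=False,
)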
def __init__(self, method: OnlineOROfflineMethod, ref_dict: str = None):
    """
    Extract Relation between two mentions according to Referent Dictionary knowledge

    Args:
        method (required): OnlineOROfflineMethod.{ONLINE/OFFLINE}
            run against full referent dictionary or a sub-set of it
        ref_dict (required): str
            Location of referent dictionary file to work with
    """
    logger.info('Loading ReferentDict module')
    if method == OnlineOROfflineMethod.OFFLINE:
        self.ref_dict = load_json_file(ref_dict)
    elif method == OnlineOROfflineMethod.ONLINE:
        self.ref_dict = self.load_reference_dict(ref_dict)
    logger.info('ReferentDict module loaded successfully')
    super(ReferentDictRelationExtraction, self).__init__()
def load_dump(self, wn_dump):
    """Load all JSON files in the dump directory, merge them, and extract their values."""
    onlyfiles = []
    for _file in listdir(wn_dump):
        file_path = join(wn_dump, _file)
        if isfile(file_path):
            onlyfiles.append(file_path)
    json_dump_list = {}
    for _file in onlyfiles:
        json_dump_list.update(load_json_file(_file))
    dump_final = {}
    for key, value in json_dump_list.items():
        dump_final[key] = self.extract_json_values(value)
    return dump_final
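# A hedged usage sketch for load_dump. The owning class is not shown in this excerpt, so
# "resource" and the dump directory path are placeholders; the method only expects a
# directory of JSON files and an extract_json_values() helper on the same class.
dump = resource.load_dump('data/wn_dump')
print('loaded %d entries' % len(dump))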
def __init__(self, method: OnlineOROfflineMethod, vo_file: str):
    """
    Extract Relation between two mentions according to VerbOcean knowledge

    Args:
        method (required): OnlineOROfflineMethod.{ONLINE/OFFLINE}
            run against full VerbOcean or a sub-set of it
        vo_file (required): str
            Location of VerbOcean file to work with
    """
    logger.info('Loading Verb Ocean module')
    if method == OnlineOROfflineMethod.OFFLINE:
        self.vo = load_json_file(vo_file)
    elif method == OnlineOROfflineMethod.ONLINE:
        self.vo = self.load_verbocean_file(vo_file)
    logger.info('Verb Ocean module loaded successfully')
    super(VerboceanRelationExtraction, self).__init__()
def __init__(self, wd_file: str):
    """
    Extract Relation between two mentions according to Within document co-reference

    Args:
        wd_file (required): str
            Location of within doc co-reference mentions file
    """
    logger.info("Loading Within doc resource")
    if wd_file is not None and os.path.isfile(wd_file):
        wd_mentions_json = load_json_file(wd_file)
        self.within_doc_coref_chain = self.arrange_resource(wd_mentions_json)
    else:
        raise FileNotFoundError("Within-doc resource file not found or not in path")
    super(WithinDocCoref, self).__init__()
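# A minimal sketch of constructing WithinDocCoref from a within-doc co-reference
# mentions JSON file. The file path is a placeholder; as shown above, a missing or
# invalid path raises FileNotFoundError.
wd_extractor = WithinDocCoref('data/wd_coref_mentions.json')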
def from_config(cls, word_vocab_size: int, num_labels: int, config: str):
    """
    Load a model from a configuration file.
    A valid configuration file is a JSON file with fields as in class `__init__`.

    Args:
        word_vocab_size (int): word vocabulary size
        num_labels (int): number of labels (classifier)
        config (str): path to configuration file

    Returns:
        IDCNN: IDCNNEmbedder module pre-configured
    """
    if not os.path.exists(config):
        raise FileNotFoundError(config)
    cfg = load_json_file(config)
    return cls(word_vocab_size=word_vocab_size, num_labels=num_labels, **cfg)
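# A hedged usage sketch for from_config, assuming it is exposed as a classmethod on
# IDCNN. An empty JSON config is written here so the class falls back to its __init__
# defaults; real configs would contain fields matching the __init__ parameters, which
# are not shown in this excerpt.
import json

with open('idcnn_config.json', 'w') as fp:
    json.dump({}, fp)

model = IDCNN.from_config(word_vocab_size=10000, num_labels=9, config='idcnn_config.json')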
def read_mentions_json_to_mentions_data_list(mentions_json_file: str):
    """
    Args:
        mentions_json_file: the path of the mentions json file to read

    Returns:
        List[MentionData]
    """
    all_mentions_only = load_json_file(mentions_json_file)
    mentions = []
    for mention_line in all_mentions_only:
        mention_data = MentionData.read_json_mention_data_line(mention_line)
        mentions.append(mention_data)
    return mentions
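# A small usage sketch for read_mentions_json_to_mentions_data_list. The file path is a
# placeholder; the function only expects a JSON file whose entries can be parsed by
# MentionData.read_json_mention_data_line.
mentions = read_mentions_json_to_mentions_data_list('data/event_mentions.json')
print('read %d mentions' % len(mentions))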
def __init__(self, method: OnlineOROfflineMethod = OnlineOROfflineMethod.ONLINE,
             vo_file: str = None):
    """
    Extract Relation between two mentions according to VerbOcean knowledge

    Args:
        method (optional): OnlineOROfflineMethod.{ONLINE/OFFLINE}
            run against full VerbOcean or a sub-set of it (default = ONLINE)
        vo_file (required): str
            Location of VerbOcean file to work with
    """
    logger.info('Loading Verb Ocean module')
    if vo_file is not None and os.path.isfile(vo_file):
        if method == OnlineOROfflineMethod.OFFLINE:
            self.vo = load_json_file(vo_file)
        elif method == OnlineOROfflineMethod.ONLINE:
            self.vo = self.load_verbocean_file(vo_file)
        logger.info('Verb Ocean module loaded successfully')
    else:
        raise FileNotFoundError('VerbOcean file not found or not in path')
    super(VerboceanRelationExtraction, self).__init__()
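# A minimal sketch of constructing VerboceanRelationExtraction. The file path is a
# placeholder; as in the code above, OFFLINE loads a JSON dump via load_json_file()
# while ONLINE parses the VerbOcean file via load_verbocean_file().
vo_extractor = VerboceanRelationExtraction(
    method=OnlineOROfflineMethod.OFFLINE, vo_file='data/verbocean_dump.json')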
def __init__(self, method: OnlineOROfflineMethod = OnlineOROfflineMethod.ONLINE,
             ref_dict: str = None):
    """
    Extract Relation between two mentions according to Referent Dictionary knowledge

    Args:
        method (optional): OnlineOROfflineMethod.{ONLINE/OFFLINE}
            run against full referent dictionary or a sub-set of it (default = ONLINE)
        ref_dict (required): str
            Location of referent dictionary file to work with
    """
    logger.info('Loading ReferentDict module')
    if ref_dict is not None and os.path.isfile(ref_dict):
        if method == OnlineOROfflineMethod.OFFLINE:
            self.ref_dict = load_json_file(ref_dict)
        elif method == OnlineOROfflineMethod.ONLINE:
            self.ref_dict = self.load_reference_dict(ref_dict)
        logger.info('ReferentDict module loaded successfully')
    else:
        raise FileNotFoundError('Referent Dict file not found or not in path:' + ref_dict)
    super(ReferentDictRelationExtraction, self).__init__()
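# A minimal sketch of constructing ReferentDictRelationExtraction. The file path is a
# placeholder; as in the code above, OFFLINE loads a JSON dump via load_json_file()
# while ONLINE parses the raw dictionary via load_reference_dict().
ref_extractor = ReferentDictRelationExtraction(
    method=OnlineOROfflineMethod.OFFLINE, ref_dict='data/ref_dict_dump.json')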
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file
    and a flag indicating whether it needed downloading.
    """
    if cache_dir is None:
        cache_dir = MODEL_CACHE
    os.makedirs(cache_dir, exist_ok=True)

    response = requests.head(url, allow_redirects=True)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {} with status code {}"
                      .format(url, response.status_code))
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)
    need_downloading = True

    if os.path.exists(cache_path):
        # check if etag has changed comparing with the metadata
        if url.split('/')[-1].endswith('zip'):
            meta_path = cache_path + '.json'
        else:
            meta_path = cache_path + '_meta_' + '.json'
        meta = load_json_file(meta_path)
        if meta['etag'] == etag:
            print('file already present')
            need_downloading = False

    if need_downloading:
        print("File not present or etag changed")
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            if url.split('/')[-1].endswith('zip'):
                meta_path = cache_path + '.json'
            else:
                meta_path = cache_path + '_meta_' + '.json'
            with open(meta_path, 'w') as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path, need_downloading
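# A hedged usage sketch for get_from_cache. The URL is a placeholder; the function
# returns both the local cache path and whether the file actually had to be downloaded.
model_path, downloaded = get_from_cache('https://example.com/models/model.zip')
print(model_path, 'downloaded' if downloaded else 'served from cache')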