def load_word2vec(path, delimiter=' ', cache=True) -> Tuple[Dict[str, np.ndarray], int]:
    realpath = get_resource(path)
    binpath = replace_ext(realpath, '.pkl')
    if cache:
        try:
            flash('Loading word2vec from cache [blink][yellow]...[/yellow][/blink]')
            word2vec, dim = load_pickle(binpath)
            flash('')
            return word2vec, dim
        except IOError:
            pass

    dim = None
    word2vec = dict()
    f = TimingFileIterator(realpath)
    for idx, line in enumerate(f):
        f.log('Loading word2vec from text file [blink][yellow]...[/yellow][/blink]')
        line = line.rstrip().split(delimiter)
        if len(line) > 2:
            if dim is None:
                dim = len(line)
            else:
                if len(line) != dim:
                    logger.warning('{}#{} length mismatch with {}'.format(path, idx + 1, dim))
                    continue
            word, vec = line[0], line[1:]
            word2vec[word] = np.array(vec, dtype=np.float32)
    # The first column is the word itself, so the embedding dimension is one less than the row length.
    dim -= 1
    if cache:
        flash('Caching word2vec [blink][yellow]...[/yellow][/blink]')
        save_pickle((word2vec, dim), binpath)
        flash('')
    return word2vec, dim
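
# A minimal usage sketch for load_word2vec, assuming a whitespace-delimited embedding text file in
# the common "word v1 v2 ... vN" format; the function name and path below are hypothetical.
def _demo_load_word2vec():
    word2vec, dim = load_word2vec('data/embedding/glove.6B.100d.txt')
    print(f'{len(word2vec)} vectors of dimension {dim}')
    print(word2vec.get('hello'))  # plain dict lookup; returns None for out-of-vocabulary words
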
def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
    """An iterator over CONLL formatted files which yields documents, regardless
    of the number of document annotations in a particular file. This is useful
    for conll data which has been preprocessed, such as the preprocessing which
    takes place for the 2012 CONLL Coreference Resolution task.

    Args:
        file_path: Path to a CONLL formatted file.

    Yields:
        One document at a time, as a list of ``OntonotesSentence``.
    """
    open_file = TimingFileIterator(file_path)
    conll_rows = []
    document: List[OntonotesSentence] = []
    for line in open_file:
        open_file.log(f'Loading {os.path.basename(file_path)}')
        line = line.strip()
        if line != "" and not line.startswith("#"):
            # Non-empty line. Collect the annotation.
            conll_rows.append(line)
        else:
            if conll_rows:
                document.append(self._conll_rows_to_sentence(conll_rows))
                conll_rows = []
            if line.startswith("#end document"):
                yield document
                document = []
    open_file.erase()
    if document:
        # Collect any stragglers or files which might not
        # have the '#end document' format for the end of the file.
        yield document
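
# For reference, a hypothetical fragment of the CoNLL-2012 layout the iterator above consumes:
# annotation rows form a sentence, blank lines separate sentences, '#'-prefixed lines are never
# collected as annotation, and '#end document' closes a document (a file missing the marker still
# yields its last document at the end).
#
#   #begin document (bc/cctv/00/cctv_0000); part 000
#   bc/cctv/00/cctv_0000   0   0   Hello   UH   ...
#   bc/cctv/00/cctv_0000   0   1   world   NN   ...
#
#   #end document
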
def load_file(self, filepath):
    cache, valid = file_cache(filepath, not self.cache)
    if not valid or (self.vocab.mutable and not os.path.isfile(self.vocab_path)):
        with open(cache, 'wb') as out:
            tokens, lines = 0, 0
            f = TimingFileIterator(filepath)
            for line in f:
                if self.strip:
                    line = line.strip()
                if not line:
                    continue
                sample = {'text': line}
                sample = self.transform_sample(sample, inplace=True)
                # Token ids are stored as consecutive 4-byte little-endian integers.
                for token_id in sample['token_id']:
                    out.write(token_id.to_bytes(4, 'little'))
                tokens += len(sample['token_id'])
                lines += 1
                f.log(f'{tokens // 1000000}M tokens, {lines // 1000000}M lines\n'
                      f'{sample["token"][:10]}')
            f.erase()
        if self.vocab.mutable:
            self.vocab.lock()
            # Assumes the vocab exposes to_dict()/copy_from() for (de)serialization.
            hanlp_common.io.save_json(self.vocab.to_dict(), self.vocab_path)
        self.num_tokens = tokens
    else:
        # Reuse the binary cache: each token id occupies 4 bytes.
        self.num_tokens = int(os.path.getsize(cache) / 4)
        if self.vocab.mutable:
            self.vocab.copy_from(hanlp_common.io.load_json(self.vocab_path))
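
# A minimal sketch (not part of the original code) of how the binary cache written above can be
# read back: token ids are stored as consecutive 4-byte little-endian integers, which is also why
# num_tokens is recovered as file size divided by 4. The function name and path are hypothetical.
def read_token_id_cache(cache_path: str):
    import os
    token_ids = []
    with open(cache_path, 'rb') as src:
        while True:
            chunk = src.read(4)
            if not chunk:
                break
            token_ids.append(int.from_bytes(chunk, 'little'))
    assert len(token_ids) == os.path.getsize(cache_path) // 4
    return token_ids
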
def load_file(self, filepath: tuple):
    phrase_tree_path = get_resource(filepath[0])
    dep_tree_path = get_resource(filepath[1])
    pf = TimingFileIterator(phrase_tree_path)
    message_prefix = f'Loading {os.path.basename(phrase_tree_path)} and {os.path.basename(dep_tree_path)}'
    for i, (dep_sent, phrase_sent) in enumerate(zip(read_tsv_as_sents(dep_tree_path), pf)):
        # The phrase-tree file contains escaped slashes, e.g. '\/'; unescape them.
        phrase_sent = phrase_sent.replace('\\/', '/')

        token = [x[1] for x in dep_sent]
        pos = [x[3] for x in dep_sent]
        head = [int(x[6]) for x in dep_sent]
        rel = [x[7] for x in dep_sent]
        phrase_tree = load_trees_from_str(phrase_sent, [head], [rel], [token])
        assert len(phrase_tree) == 1, f'{phrase_tree_path} must have one tree per line.'
        phrase_tree = phrase_tree[0]

        yield {
            'FORM': token,
            'CPOS': pos,
            'HEAD': head,
            'DEPREL': rel,
            'tree': phrase_tree,
            'hpsg': phrase_tree.convert()
        }
        pf.log(f'{message_prefix} {i + 1} samples [blink][yellow]...[/yellow][/blink]')
    pf.erase()
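
# A hedged illustration (hypothetical mini example, not from the source): for a two-token
# dependency sentence whose CoNLL-X rows are
#
#   1   Economy   _   NN   NN   _   2   nsubj   _   _
#   2   grows     _   VV   VV   _   0   root    _   _
#
# the loop above extracts token == ['Economy', 'grows'], pos == ['NN', 'VV'], head == [2, 0] and
# rel == ['nsubj', 'root'], and pairs them with the bracketed constituency tree read from the
# corresponding line of the phrase-tree file.
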
def load_file(self, filepath):
    """Both ``.conllx`` and ``.conllu`` are supported. Their descriptions can be found in
    :class:`hanlp_common.conll.CoNLLWord` and :class:`hanlp_common.conll.CoNLLUWord` respectively.

    Args:
        filepath: ``.conllx`` or ``.conllu`` file path.
    """
    if filepath.endswith('.conllu'):
        # See https://universaldependencies.org/format.html
        field_names = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS',
                       'HEAD', 'DEPREL', 'DEPS', 'MISC']
    else:
        field_names = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS',
                       'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
    fp = TimingFileIterator(filepath)
    for idx, sent in enumerate(read_conll(fp)):
        sample = {}
        for i, field in enumerate(field_names):
            sample[field] = [cell[i] for cell in sent]
        if not self._prune or not self._prune(sample):
            yield sample
        fp.log(f'{idx + 1} samples [blink][yellow]...[/yellow][/blink]')
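
# A small illustration, not taken from the source: for a CoNLL-U sentence such as
#
#   1   Hello   hello   INTJ   UH   _   0   root    _   _
#   2   !       !       PUNCT  .    _   1   punct   _   _
#
# the loader yields one sample per sentence where each field name maps to that column's values,
# e.g. sample['FORM'] == ['Hello', '!'] and sample['DEPREL'] == ['root', 'punct'] (how the ID and
# HEAD cells are typed depends on read_conll).
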
def load_file(self, filepath: str):
    """Load tokenized corpus. The format is one sentence per line, where each line consists of
    tokens separated by a delimiter (usually space).

    .. highlight:: bash
    .. code-block:: bash

        $ head train.txt
        上海 浦东 开发 与 法制 建设 同步
        新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )

    Args:
        filepath: The path to the corpus.
    """
    f = TimingFileIterator(filepath)
    for line in f:
        line = line.rstrip('\n')
        tokens = line.split(self.delimiter)
        if not tokens:
            continue
        if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
            # Split over-long sentences into shorter pieces, each yielded as its own sample.
            for short_sents in split_long_sentence_into(tokens, self.max_seq_len, self.sent_delimiter,
                                                        char_level=self.char_level,
                                                        hard_constraint=self.hard_constraint):
                yield {'token': short_sents}
        else:
            yield {'token': tokens}
        f.log(line[:20])
    f.erase()
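
# An illustrative sample shape (using the sentence from the docstring above): a line
# '上海 浦东 开发 与 法制 建设 同步' split on the default space delimiter is yielded as
# {'token': ['上海', '浦东', '开发', '与', '法制', '建设', '同步']}; lines longer than max_seq_len
# are instead yielded as several such samples, one per piece returned by split_long_sentence_into.
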
def load_file(self, filepath: str):
    """Load ``.jsonlines`` CoNLL12-style corpus. Samples of this corpus can be found using the
    following scripts.

    .. highlight:: python
    .. code-block:: python

        import json
        from hanlp_common.document import Document
        from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
        from hanlp.utils.io_util import get_resource

        with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
            for line in src:
                doc = json.loads(line)
                print(Document(doc))
                break

    Args:
        filepath: ``.jsonlines`` CoNLL12 corpus.
    """
    filename = os.path.basename(filepath)
    reader = TimingFileIterator(filepath)
    num_docs, num_sentences = 0, 0
    for line in reader:
        doc = json.loads(line)
        num_docs += 1
        num_tokens_in_doc = 0
        for sid, (sentence, srl) in enumerate(zip(doc['sentences'], doc['srl'])):
            if self.doc_level_offset:
                srl = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc,
                        x[2] - num_tokens_in_doc, x[3]) for x in srl]
            else:
                srl = [(x[0], x[1], x[2], x[3]) for x in srl]
            for x in srl:
                if any([o < 0 for o in x[:3]]):
                    raise ValueError('Negative offset occurred, maybe doc_level_offset should be False')
                if any([o >= len(sentence) for o in x[:3]]):
                    raise ValueError('Offset exceeds sentence length, maybe doc_level_offset should be True')
            deduplicated_srl = set()
            pa_set = set()
            for p, b, e, l in srl:
                pa = (p, b, e)
                if pa in pa_set:
                    continue
                pa_set.add(pa)
                deduplicated_srl.add((p, b, e, l))
            yield self.build_sample(sentence, deduplicated_srl, doc, sid)
            num_sentences += 1
            num_tokens_in_doc += len(sentence)
        reader.log(f'{filename} {num_docs} documents, {num_sentences} sentences '
                   f'[blink][yellow]...[/yellow][/blink]')
    reader.erase()
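
# A hedged, standalone sketch of the doc_level_offset rebasing above; the helper name and the
# numbers in the example are illustrative only.
def rebase_srl(srl, num_tokens_before_sentence):
    """Rebase document-level (predicate, begin, end, label) tuples to sentence-local offsets."""
    return [(p - num_tokens_before_sentence, b - num_tokens_before_sentence,
             e - num_tokens_before_sentence, l) for p, b, e, l in srl]

# If the first sentence has 8 tokens, an annotation (9, 10, 12, 'ARG0') on the second sentence
# becomes sentence-local: rebase_srl([(9, 10, 12, 'ARG0')], 8) == [(1, 2, 4, 'ARG0')].
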
def load_file(self, filepath: str):
    """Load eos corpus.

    Args:
        filepath: Path to the corpus.

    .. highlight:: bash
    .. code-block:: bash

        $ head -n 2 ctb8.txt
        中国经济简讯
        新华社北京十月二十九日电中国经济简讯
    """
    f = TimingFileIterator(filepath)
    sents = []
    eos_offsets = []
    offset = 0
    for line in f:
        if not line.strip():
            continue
        line = line.rstrip('\n')
        eos_offsets.append(offset + len(line.rstrip()) - 1)
        offset += len(line)
        if self.append_after_sentence:
            line += self.append_after_sentence
            offset += len(self.append_after_sentence)
        f.log(line)
        sents.append(line)
    f.erase()
    corpus = list(itertools.chain.from_iterable(sents))

    if self.eos_chars:
        if not isinstance(self.eos_chars, set):
            self.eos_chars = set(self.eos_chars)
    else:
        eos_chars = Counter()
        for i in eos_offsets:
            eos_chars[corpus[i]] += 1
        self.eos_chars = set(k for (k, v) in eos_chars.most_common()
                             if v >= self.eos_char_min_freq
                             and (not self.eos_char_is_punct or ispunct(k)))
        cprint(f'eos_chars = [yellow]{self.eos_chars}[/yellow]')

    eos_index = 0
    eos_offsets = [i for i in eos_offsets if corpus[i] in self.eos_chars]
    window_size = self.window_size
    for i, c in enumerate(corpus):
        if c in self.eos_chars:
            window = corpus[i - window_size:i + window_size + 1]
            label_id = 1. if eos_offsets[eos_index] == i else 0.
            if label_id > 0:
                eos_index += 1
            yield {'char': window, 'label_id': label_id}
    assert eos_index == len(eos_offsets), f'{eos_index} != {len(eos_offsets)}'
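
# A simplified, self-contained sketch of the window extraction above (not the original
# implementation): every occurrence of an EOS character becomes a sample of the surrounding
# 2 * window_size + 1 characters, labelled 1.0 only when it is a genuine sentence end.
def make_eos_windows(corpus, eos_offsets, eos_chars, window_size):
    gold = set(eos_offsets)
    for i, c in enumerate(corpus):
        if c in eos_chars:
            window = corpus[i - window_size:i + window_size + 1]
            yield {'char': window, 'label_id': 1. if i in gold else 0.}

# e.g. make_eos_windows(list('Hi. Ok? No'), eos_offsets=[2, 6], eos_chars={'.', '?'}, window_size=2)
# yields two samples whose 'char' windows are the five characters around '.' and '?'.
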
def load_file(self, filepath: str):
    """Load ``.jsonlines`` NER corpus. Samples of this corpus can be found using the following
    scripts.

    .. highlight:: python
    .. code-block:: python

        import json
        from hanlp_common.document import Document
        from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
        from hanlp.utils.io_util import get_resource

        with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
            for line in src:
                doc = json.loads(line)
                print(Document(doc))
                break

    Args:
        filepath: ``.jsonlines`` NER corpus.
    """
    filename = os.path.basename(filepath)
    reader = TimingFileIterator(filepath)
    num_docs, num_sentences = 0, 0
    for line in reader:
        line = line.strip()
        if not line:
            continue
        doc = json.loads(line)
        num_docs += 1
        num_tokens_in_doc = 0
        for sentence, ner in zip(doc['sentences'], doc['ner']):
            if self.doc_level_offset:
                ner = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc, x[2]) for x in ner]
            else:
                ner = [(x[0], x[1], x[2]) for x in ner]
            if self.tagset:
                ner = [x for x in ner if x[2] in self.tagset]
                if isinstance(self.tagset, dict):
                    ner = [(x[0], x[1], self.tagset[x[2]]) for x in ner]
            deduplicated_ner = []
            be_set = set()
            for b, e, l in ner:
                be = (b, e)
                if be in be_set:
                    continue
                be_set.add(be)
                deduplicated_ner.append((b, e, l))
            yield {'token': sentence, 'ner': deduplicated_ner}
            num_sentences += 1
            num_tokens_in_doc += len(sentence)
        reader.log(f'{filename} {num_docs} documents, {num_sentences} sentences '
                   f'[blink][yellow]...[/yellow][/blink]')
    reader.erase()
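
# An illustrative (hypothetical) sample yielded above:
#   {'token': ['Barack', 'Obama', 'visited', 'Paris'],
#    'ner': [(0, 1, 'PERSON'), (3, 3, 'GPE')]}
# where each ner tuple is (begin, end, label) in token offsets after any doc-level rebasing,
# tagset filtering/renaming, and span deduplication.
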