def test_match_zero(matcher):
    matcher.add('Quote', '', {}, [[{'ORTH': '"'},
                                   {'OP': '!', 'IS_PUNCT': True},
                                   {'OP': '!', 'IS_PUNCT': True},
                                   {'ORTH': '"'}]])
    doc = Doc(matcher.vocab, words='He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
    doc = Doc(matcher.vocab, words='He said , " some three words " ...'.split())
    assert len(matcher(doc)) == 0
    matcher.add('Quote', '', {}, [[{'ORTH': '"'},
                                   {'IS_PUNCT': True},
                                   {'IS_PUNCT': True},
                                   {'IS_PUNCT': True},
                                   {'ORTH': '"'}]])
    assert len(matcher(doc)) == 0

def deserialize_dataset(file_path, max_items):
    vocab = spacy.load('en_default').vocab
    with open(file_path, 'rb') as input_file:
        for data_point in pickle.load(input_file)[:max_items]:
            yield {
                'question1': Doc(vocab).from_bytes(data_point['question1']),
                'question2': Doc(vocab).from_bytes(data_point['question2']),
                'id': data_point['id'],
                'is_duplicate': data_point.get('is_duplicate', None),
            }

def get_tokens(model: Language, doc_id: int):
    fn = os.path.join(settings.TOKEN_DIR, str(doc_id))
    if not os.path.exists(fn):
        raise ValueError(
            "Document {doc_id} has not been preprocessed ({fn} does not exist)"
            .format(**locals()))
    return Doc(model.vocab).from_disk(fn)

def decode(self, output: TaskOutput) -> TaskOutput:
    # The dims are: batch, top_k, tags
    output.tags: List[List[List[str]]] = [
        self._decode_tags(paths) for paths in output.viterbi_paths
    ]
    output.scores: List[List[float]] = [
        [score for tags, score in paths] for paths in output.viterbi_paths
    ]

    output.entities: List[List[List[Dict]]] = []
    output.tokens: List[List[Dict]] = []
    # iterate over batch
    for raw_text, k_tags in zip(output.raw_text, output.tags):
        pre_tokenized = not isinstance(raw_text, str)
        if pre_tokenized:
            # compose spacy doc from tokens
            doc = Doc(Vocab(), words=raw_text)
        else:
            doc = self.backbone.tokenizer.nlp(raw_text)
        output.entities.append(self._decode_entities(doc, k_tags, pre_tokenized))
        output.tokens.append(self._decode_tokens(doc) if not pre_tokenized else None)

    if not any(output.tokens):
        # drop tokens field if no data
        del output.tokens

    del output.logits
    del output.mask
    del output.probs
    del output.raw_text
    del output.viterbi_paths
    return output

def get_spacy(self, load_from_file=False, model_name='en_core_web_sm'):
    import spacy
    global nlp
    if not nlp:
        # print('>> loading spacy...')
        nlp = spacy.load(model_name)

    doc = None
    if self.parsed and load_from_file:
        # print(self.fnfn_spacy)
        from spacy.tokens.doc import Doc
        try:
            for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
                doc = Doc(nlp.vocab)
                doc.from_bytes(byte_string)
        except UnicodeDecodeError:
            print("!! UNICODE ERROR:", self.fnfn_spacy)

    if not doc:
        # print('>> making spacy document for text', self.id)
        txt = self.text
        txt = clean_text(txt)
        doc = nlp(txt)
    return doc

def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
    doc = Doc(en_parser.vocab, words=text)
    assert len(doc) == length
    with en_parser.step_through(doc) as _:  # noqa: F841
        pass
    assert doc[0].is_space
    for token in doc:
        assert token.head.i == length - 1

def test_get_entity_via_match(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity', attrs={u'Hello': u'World'})
    assert matcher.n_patterns == 0
    assert matcher(Doc(en_vocab, words=[u'Test', u'Entity'])) == []
    matcher.add_pattern(u'TestEntity', [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1
    matches = matcher(Doc(en_vocab, words=[u'Test', u'Entity']))
    assert len(matches) == 1
    assert len(matches[0]) == 4
    ent_id, label, start, end = matches[0]
    assert ent_id == matcher.vocab.strings[u'TestEntity']
    assert label == 0
    assert start == 0
    assert end == 2
    attrs = matcher.get_entity(ent_id)
    assert attrs == {u'Hello': u'World'}

def read_docs(filepath):
    """Deserialize a list of documents + associated metadata"""
    spacy_parser = get_spacy_parser()
    data = pickle.load(open(filepath, 'rb'))
    for row in data:
        doc = Doc(spacy_parser.vocab)
        # read doc object from serialized byte array
        row['content'] = doc.from_bytes(row.pop('binary_content'))
    return data

def test_match_zero_plus(matcher):
    matcher.add('Quote', '', {}, [[{'ORTH': '"'},
                                   {'OP': '*', 'IS_PUNCT': False},
                                   {'ORTH': '"'}]])
    doc = Doc(matcher.vocab, 'He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1

def forward(texts, drop=0.):
    if tokenized:
        docs = [Doc(nlp.vocab, words) for words in texts]
    else:
        docs = [nlp(text) for text in texts]
    features = [doc.to_array(attrs) for doc in docs]

    def backward(d_features, sgd=None):
        return d_features

    return features, backward

def process_non_content_bearing_samples(
    self, empty_samples: List[Tuple[int, Text]]
) -> List[Tuple[int, "Doc"]]:
    """Creates empty Doc objects from zero-length training sample strings."""
    from spacy.tokens import Doc

    n_docs = [
        (empty_sample[0], doc)
        for empty_sample, doc in zip(
            empty_samples, [Doc(self.nlp.vocab) for doc in empty_samples]
        )
    ]
    return n_docs

def test_efficient_binary_serialization(doc):
    from spacy.tokens.doc import Doc

    byte_string = doc.to_bytes()
    open('moby_dick.bin', 'wb').write(byte_string)

    nlp = spacy.en.English()
    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
        doc = Doc(nlp.vocab)
        doc.from_bytes(byte_string)

def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc

    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2

def __call__(self, text):
    if self._word_tokenizer:
        words = self._word_tokenizer.tokenize(text)
    else:
        words = text.split(' ')

    if self.return_doc:
        words = self._remove_empty_words(words)
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
    else:
        return words

def load_and_transform(batch_id, in_loc, out_dir):
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        with io.open(in_loc, 'rb') as in_file:
            for byte_string in Doc.read_bytes(in_file):
                doc = Doc(nlp.vocab).from_bytes(byte_string)
                doc.is_parsed = True
                out_file.write(transform_doc(doc))

def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    doc = Doc(spacy_model.vocab,
              words=json_dict['sentence'],
              spaces=[True for _ in range(len(json_dict['sentence']))])
    spacy_tokens = spacy_model.pipeline[0][1](doc)
    if self._dataset_reader.use_language_specific_pos:  # type: ignore
        # fine-grained part of speech
        pos_tags = [token.tag_ for token in spacy_tokens]
    else:
        # coarse-grained part of speech (Universal Dependencies format)
        pos_tags = [token.pos_ for token in spacy_tokens]
    return self._dataset_reader.text_to_instance(json_dict['sentence'], pos_tags)

def parse(sntnc, nlp, tokenized=False):
    # if it's tokenized, split the string by space and convert to a list for input
    doc = None
    if tokenized:
        doc = Doc(nlp.vocab, words=sntnc.split(" "))
        nlp.tagger(doc)
        nlp.parser(doc)
        nlp.entity(doc)
    else:
        doc = nlp(sntnc)
    return doc

def test_graph_edges_and_nodes():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(doc, name="hello")
    node1 = graph.add_node((0,))
    assert graph.get_node((0,)) == node1
    node2 = graph.add_node((1, 3))
    assert list(node2) == [1, 3]
    graph.add_edge(node1, node2, label="one", weight=-10.5)
    assert graph.has_edge(node1, node2, label="one")
    assert node1.heads() == []
    assert [tuple(h) for h in node2.heads()] == [(0,)]
    assert [tuple(t) for t in node1.tails()] == [(1, 3)]
    assert [tuple(t) for t in node2.tails()] == []

def load(cls, nlp: Language, path: Union[str, Path]) -> Corpus:
    index = []
    docs = []
    with zipfile.ZipFile(path, 'r', compression=zipfile.ZIP_DEFLATED) as archive:
        for fname in archive.namelist():
            i, _ = os.path.splitext(fname)
            index.append(i)
            with archive.open(fname) as f:
                doc = Doc(nlp.vocab).from_bytes(f.read())
                docs.append(doc)
    indexed_documents = zip(docs, index)
    return cls(nlp, indexed_documents=indexed_documents)

def __iter__(self, week=None):
    with open(self.path + ".info") as info:
        with open(self.path + ".title.bin", "rb") as title_bin:
            for byte_string in Doc.read_bytes(title_bin):
                info_line = info.readline()
                comment_info = self._parse_info(info_line)
                if week is not None and get_week(comment_info["timestamp"]) != week:
                    continue
                if self.clean_deleted and comment_info["author"] == "[deleted]":
                    continue
                if self.clean_bots and (is_bot(comment_info["author"])
                                        or comment_info["author"] in FILTERED_USERS):
                    continue
                comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string)
                yield comment_info

def make_docs(nlp, batch, heads=True):
    docs = []
    for record in batch:
        text = record["text"]
        if "tokens" in record:
            doc = Doc(nlp.vocab, words=record["tokens"])
        else:
            doc = nlp.make_doc(text)
        if "heads" in record:
            heads = record["heads"]
            heads = numpy.asarray(heads, dtype="uint64")
            heads = heads.reshape((len(doc), 1))
            doc = doc.from_array([HEAD], heads)
        if len(doc) >= 1 and len(doc) < 200:
            docs.append(doc)
    return docs

def __iter__(self, week=None):
    with open(self.path + ".bin", "rb") as bin:
        with open(self.path + ".info") as info:
            for byte_string in Doc.read_bytes(bin):
                comment_info = self._parse_info(next(info))
                if week is not None and get_week(comment_info["timestamp"]) != week:
                    continue
                if self.clean_deleted and comment_info["author"] == "[deleted]":
                    continue
                if self.clean_bots and (is_bot(comment_info["author"])
                                        or comment_info["author"] in FILTERED_USERS):
                    continue
                doc = Doc(self._vocab).from_bytes(byte_string)
                comment_info["doc"] = doc
                comment_info["text"] = self._text_from_doc(doc)
                yield comment_info

def test_default_sentiment(EN):
    '''Test new span.sentiment property's default averaging behaviour'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0
    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
    good_stuff = doc[:2]
    assert good_stuff.sentiment == 3.0 / 2
    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == -2. / 2
    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == (3. + -2) / 3.

def test_override_sentiment(EN):
    '''Test overriding span.sentiment via doc.user_span_hooks'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0
    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
    doc.user_span_hooks['sentiment'] = lambda span: 10.0
    good_stuff = doc[:2]
    assert good_stuff.sentiment == 10.0
    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == 10.0
    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == 10.0

def test_issue7056():
    """Test that the Unshift transition works properly, and doesn't cause
    sentence segmentation errors."""
    vocab = Vocab()
    ae = ArcEager(
        vocab.strings,
        ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]))
    doc = Doc(vocab, words="Severe pain , after trauma".split())
    state = ae.init_batch([doc])[0]
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "L-amod")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "R-pobj")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    assert not state.eol()

def test_graph_walk():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(
        doc,
        name="hello",
        nodes=[(0,), (1,), (2,), (3,)],
        edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
        labels=None,
        weights=None,
    )
    node0, node1, node2, node3 = list(graph.nodes)
    assert [tuple(h) for h in node0.heads()] == [(3,)]
    assert [tuple(h) for h in node1.heads()] == [(0,)]
    assert [tuple(h) for h in node0.walk_heads()] == [(3,), (0,)]
    assert [tuple(h) for h in node1.walk_heads()] == [(0,), (3,), (0,)]
    assert [tuple(h) for h in node2.walk_heads()] == [(0,), (3,), (0,)]
    assert [tuple(h) for h in node3.walk_heads()] == [(0,), (3,)]
    assert [tuple(t) for t in node0.walk_tails()] == [(1,), (2,), (3,), (0,)]
    assert [tuple(t) for t in node1.walk_tails()] == []
    assert [tuple(t) for t in node2.walk_tails()] == []
    assert [tuple(t) for t in node3.walk_tails()] == [(0,), (1,), (2,), (3,)]

def _make_task_prediction(
    self,
    single_forward_output: Dict,
    instance: Instance,
) -> TokenClassificationPrediction:
    # The dims are: top_k, tags
    tags: List[List[str]] = self._make_tags(single_forward_output["viterbi_paths"])
    # construct a spacy Doc
    pre_tokenized = not isinstance(single_forward_output["raw_text"], str)
    if pre_tokenized:
        # compose doc from tokens
        doc = Doc(Vocab(), words=single_forward_output["raw_text"])
    else:
        doc = self.backbone.tokenizer.nlp(single_forward_output["raw_text"])
    return TokenClassificationPrediction(
        tags=tags,
        scores=[score for tags, score in single_forward_output["viterbi_paths"]],
        entities=self._make_entities(doc, tags, pre_tokenized),
    )

def __call__(self, tokens):
    spaces = [True] * len(tokens)
    return Doc(self.vocab, words=tokens, spaces=spaces)

def main():
    _arg = parse_args()
    # list of articles from which line list features are to be extracted
    # for each infected case
    ll_articles = [json.loads(l) for l in io.open(_arg.MERSbulletins, "r")]
    # word embeddings specific to WHO corpus extracted by word2vec models
    # (SGNS or SGHS)
    w2v_model = Word2Vec.load(_arg.whovec)
    # Number of predictor keywords (excluding the seed keyword)
    K = np.int(_arg.numind)
    # Seed keywords for each line list feature guiding the extraction process
    seed_keywords = {
        "Onset Date": "onset",
        "Hospital Date": "hospitalized",
        "Outcome Date": "died",
        "Specified Proximity to Animals or Animal Products": "animals",
        "Specified Contact with Other Cases": "case",
        "Specified HCW": "healthcare",
        "Specified Comorbidities": "comorbidities"
    }
    auto_ll = []
    ll_extract = LineList()
    # Extracting the number of infected cases and the line list features
    # corresponding to each case from each article
    for ll_artl in ll_articles:
        ll_text = ""
        dt_offsets = []
        # dt_dict: mapping date phrases to proper datetime strings
        dt_dict = defaultdict()
        for dtphrase_elm in ll_artl['eventSemantics']['datetimes']:
            dt_dict["-".join(dtphrase_elm['phrase'].split())] = dtphrase_elm["date"]
            dt_offsets.append({
                'start': ll_artl['BasisEnrichment']['tokens'][
                    int(dtphrase_elm['offset'].split(":")[0])]['start'],
                'end': ll_artl['BasisEnrichment']['tokens'][
                    int(dtphrase_elm['offset'].split(":")[1]) - 1]['end']
            })
        for i in xrange(len(ll_artl["content"])):
            is_offset = 0
            for offset_elm in dt_offsets:
                if int(offset_elm['start']) <= i < int(offset_elm['end']):
                    is_offset = 1
                    if ll_artl["content"][i] == " ":
                        ll_text += "-"
                    else:
                        ll_text += ll_artl["content"][i]
            if not is_offset:
                if ll_artl["content"][i] == ".":
                    ll_text += " "
                ll_text += ll_artl["content"][i]
        num_cases = []
        en_nlp = spacy.load('en')
        ll_doc = Doc(en_nlp.vocab)
        ll_doc = en_nlp(ll_text)
        ll_sents = []
        for sent in ll_doc.sents:
            ll_sents.append(sent)
        # Extracting the number of cases mentioned in the article
        # using age and gender information
        for sent_ind in xrange(len(ll_sents)):
            ag_out = ll_extract.get_age_gender(ll_sents[sent_ind].text)
            if ag_out['age'] is not None and ag_out['gender'] is not None:
                case_feature = defaultdict()
                case_feature['age'] = ag_out['age']
                case_feature['gender'] = ag_out['gender']
                case_feature['start'] = sent_ind
                case_feature['link'] = ll_artl["link"]
                num_cases.append(case_feature)
        # Identifying the start sentence and end sentence for each case
        for case_ind in xrange(len(num_cases)):
            sent_start = num_cases[case_ind]['start']
            sent_end = sent_start + 2
            for ind_case in xrange(len(num_cases)):
                if ind_case == case_ind:
                    continue
                if num_cases[ind_case]['start'] > sent_start:
                    sent_end = num_cases[ind_case]['start'] - 1
                    break
            try:
                for ll_feat in seed_keywords:
                    num_cases[case_ind][ll_feat] = defaultdict()
                # Extracting the disease onset features for each case
                for dt_feat in ["Onset Date", "Hospital Date", "Outcome Date"]:
                    kwargs = {
                        'K': K,
                        'w2v': w2v_model,
                        'start': sent_start,
                        'end': sent_end,
                        'll_sents': ll_sents,
                        'seed': seed_keywords[dt_feat],
                        'dt_dict': dt_dict
                    }
                    num_cases[case_ind][dt_feat] = ll_extract.infer_date(**kwargs)['final']
                # Extracting the clinical features corresponding to each case
                for clin_feat in ["Specified Proximity to Animals or Animal Products",
                                  "Specified Contact with Other Cases",
                                  "Specified HCW",
                                  "Specified Comorbidities"]:
                    kwargs = {
                        'K': K,
                        'w2v': w2v_model,
                        'start': sent_start,
                        'end': sent_end,
                        'll_sents': ll_sents,
                        'seed': seed_keywords[clin_feat]
                    }
                    num_cases[case_ind][clin_feat] = ll_extract.infer_clinical(**kwargs)['final']
            except Exception as e:
                print e
        if len(num_cases) != 0:
            auto_ll.extend(num_cases)
    # Writing the automatically extracted line lists to a file
    if len(auto_ll) != 0:
        with open(_arg.outputll, "w") as f_ll:
            for cs in auto_ll:
                print >> f_ll, json.dumps(cs, encoding='utf-8')

def apple_orange(en_vocab):
    return Doc(en_vocab, words=[u'apple', u'orange'])