# spaCy 1.x-style parser; Sentence is assumed to come from the surrounding codebase.
from spacy.en import English


class SpacyParser(object):
    '''https://spacy.io/#example-use'''

    def __init__(self, num_threads=4):
        self.num_threads = num_threads
        self.nlp = English(tokenizer=True, parser=True, tagger=True,
                           entity=None, matcher=None)

    def parse(self, doc, doc_id=None):
        """Parse a raw document string and yield one Sentence per sentence."""
        if len(doc.strip()) == 0:
            return
        doc = doc.decode("utf-8")
        for doc in self.nlp.pipe([doc], batch_size=50,
                                 n_threads=self.num_threads):
            assert doc.is_parsed
            for sent_id, sent in enumerate(doc.sents):
                tokens = [t for t in sent]
                token_idxs = [t.idx for t in sent]
                words = [t.text for t in sent]
                lemmas = [self.nlp.vocab.strings[t.lemma] for t in tokens]
                poses = [self.nlp.vocab.strings[t.tag] for t in tokens]
                dep_labels = [self.nlp.vocab.strings[t.dep] for t in tokens]
                # index tokens to determine sentence offset for dependency tree
                token_idx = {t: i for i, t in enumerate(tokens)}
                dep_parents = [token_idx[t.head] for t in tokens]
                yield Sentence(words=words, lemmas=lemmas, poses=poses,
                               dep_parents=dep_parents, dep_labels=dep_labels,
                               sent_id=sent_id, doc_id=doc_id, text=sent.text,
                               token_idxs=token_idxs, doc_name=doc_id)
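# A minimal usage sketch for SpacyParser, assuming Sentence exposes its
# constructor fields as attributes; the sample text and doc_id are illustrative.
parser = SpacyParser(num_threads=2)
for s in parser.parse("Apple is looking at buying a U.K. startup.", doc_id="doc-0"):
    print(s.words)
    print(s.dep_parents)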
import io
import logging
from os import path

from preshed.counter import PreshCounter
from spacy.strings import StringStore


def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None

    for i, doc in enumerate(
            NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)):
        # count dependency subtrees whose length falls in the n-gram window
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws)
                              for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        # count noun chunks in the same length window
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)
        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
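# Hedged example call for the batch counter above; the inputs and output
# directory are illustrative, and rep_text is assumed to be a text-cleaning
# helper defined in the surrounding module.
docs = [u"The quick brown fox jumps over the lazy dog."] * 10
process(batch_id=0, inputs=docs, output_dir='/tmp', lang='en',
        n_threads=2, batch_size=5, min_ngram=2, max_ngram=4)
# writes /tmp/batch0.st.freq and /tmp/batch0.np.freq (counts below 5 are dropped)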
import numpy as np
from spacy.attrs import LOWER, LIKE_EMAIL, LIKE_URL
from spacy.en import English


def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """Tokenize texts into a (len(texts), max_length) array of token indices,
    padding short documents with `skip`. See the fully documented variant below.
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(nlp.pipe(texts, **kwargs)):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Iterate over named entities
            for ent in doc.ents:
                if len(ent) > 1:
                    # Merge them into single tokens
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens with the skip marker
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
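# Quick sketch of the tokenizer above; the resulting indices depend on the
# loaded vocabulary, so the printed values are illustrative.
arr, vocab = tokenize([u"hello world", u"spaCy makes tokenizing fast"],
                      max_length=8)
print(arr.shape)  # (2, 8); unused slots hold the skip value, -2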
from spacy.en import English

# STOP_WORDS is assumed to be imported elsewhere in the module (e.g. from spacy.en)


class Tokenizer:
    """class for tokenizing documents"""

    def __init__(self):
        self.nlp = English(tag=True, parse=False, entity=False)

    def tokenize(self, documents, batch_size=1000):
        """tokenize a set of documents, using the lemma of each token

        :param documents: documents to tokenize
        :type documents: list of str
        :param batch_size: batch size for processing documents
        :type batch_size: int
        :returns: tokenized documents
        :rtype: list of list of str
        """
        return [[token.lemma_ for token in doc if self._include(token)]
                for doc in self.nlp.pipe(documents, entity=False,
                                         batch_size=batch_size, n_threads=4)]

    @staticmethod
    def _include(token):
        """whether to include a token

        :param token: token to check
        :type token: spacy.tokens.token.Token
        :returns: whether to include
        :rtype: boolean
        """
        return (not token.is_punct
                and token.lemma_ not in STOP_WORDS
                and token.lemma_.strip() != ''
                and not token.like_num
                and not token.like_url)
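# Usage sketch for Tokenizer; the exact lemmas returned depend on the model,
# so the expected output in the comment is illustrative.
tok = Tokenizer()
lemmas = tok.tokenize([u"The cats are sitting on the mat."])
# e.g. [[u'cat', u'sit', u'mat']] once stop words, punctuation and numbers drop out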
NLP = None  # lazily initialized spaCy English pipeline


def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
        p_iter (iter): iter over strings to normalize and tokenize.
        p_batch_size (int): number of strings per batch.
        p_thread_count (int): number of worker threads.

    Returns:
        iter: iter over normalized and tokenized strings.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)

    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)

    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
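# Sketch of a normalize_batch call; NlpEnglish is assumed to be an alias for
# spacy.en.English imported elsewhere in the module.
for normalized in normalize_batch([u"Hello, World!"], p_batch_size=10,
                                  p_thread_count=1):
    print(normalized)  # e.g. "hello , world !" (punctuation becomes its own token)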
from spacy.en import English


def preprocess(texts):
    nlp = English()
    docs = nlp.pipe(texts)
    for doc in docs:
        for np in doc.noun_chunks:
            # Only keep adjectives and nouns, e.g. "good ideas"
            while len(np) > 1 and np[0].dep_ not in ('amod', 'compound'):
                np = np[1:]
            if len(np) > 1:
                # Merge the tokens, e.g. good_ideas
                np.merge(np.root.tag_, np.text, np.root.ent_type_)
        # Iterate over named entities
        for ent in doc.ents:
            if len(ent) > 1:
                # Merge them into single tokens
                ent.merge(ent.root.tag_, ent.text, ent.label_)
        sentences = []
        for sent in doc.sents:
            sentences.append([token.text for token in sent])
        yield sentences
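# Sketch: after merging, multi-word entities and noun phrases come back as
# single tokens (output illustrative, depends on the model).
for sents in preprocess([u"New York is a big city. Good ideas are rare."]):
    print(sents)  # e.g. [[u'New York', u'is', u'a', u'big city', u'.'], ...]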
import numpy as np
from spacy.attrs import LOWER, LIKE_EMAIL, LIKE_URL
from spacy.en import English


def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """
    Uses spaCy to quickly tokenize text and return an array of indices.

    This method stores a global NLP object in memory, and takes up to a
    minute to run for the first time. Later calls will have the tokenizer
    in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. Choice must be in spacy.attrs, and
        common choices are (LOWER, LEMMA).
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : object, optional
        A spaCy NLP object. Useful for not reinstantiating the object
        multiple times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents the word
        index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index
        gets mapped to '<SKIP>'.

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.iteritems()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(nlp.pipe(texts, **kwargs)):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Iterate over named entities
            for ent in doc.ents:
                if len(ent) > 1:
                    # Merge them into single tokens
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens with the skip marker
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
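# Reusing a single English() instance across calls, which is what the `nlp`
# parameter is for (texts are illustrative):
nlp = English()
arr1, vocab1 = tokenize([u"first batch of text"], 10, nlp=nlp)
arr2, vocab2 = tokenize([u"second batch of text"], 10, nlp=nlp)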
from tqdm import tqdm


def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """
    Uses spaCy to quickly tokenize text and return an array of indices.

    This method stores a global NLP object in memory, and takes up to a
    minute to run for the first time. Later calls will have the tokenizer
    in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. Choice must be in spacy.attrs, and
        common choices are (LOWER, LEMMA).
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : object, optional
        A spaCy NLP object. Useful for not reinstantiating the object
        multiple times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents the word
        index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index
        gets mapped to '<SKIP>'.

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.iteritems()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    # list() materializes the pipe so tqdm can report progress with a total
    for row, doc in enumerate(tqdm(list(nlp.pipe(texts, **kwargs)),
                                   desc="tokenizing")):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Iterate over named entities
            for ent in doc.ents:
                if len(ent) > 1:
                    # Merge them into single tokens
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens with the skip marker
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
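# Speed-oriented call following the docstring's own suggestion; whether
# tag/parse/entity pass through pipe() depends on the spaCy version, so treat
# this as an assumption. merge stays False because merging needs the parse.
texts = [u"an example document"]
arr, vocab = tokenize(texts, 100, merge=False,
                      tag=False, parse=False, entity=False, n_threads=8)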
parser.add_argument('-i', dest='indices', default='')
parser.add_argument('-o', dest='output', default='parses')
args = parser.parse_args()

with Store(args.dataset) as load:
    if args.indices and args.indices in indices_dispatch:
        data = load.select(indices_dispatch[args.indices](load))
    else:
        print('No valid indices selected. Continuing with all data.')
        data = load.data

    nlp = English()

    unicode_posts = data['text'].str.decode('utf8')
    print('Processing {0} posts'.format(len(unicode_posts)))
    docs = nlp.pipe(unicode_posts, batch_size=16, n_threads=3)

    unicode_titles = data['title'].str.decode('utf8')
    titles = nlp.pipe(unicode_titles, batch_size=16, n_threads=3)

    posts = zip(docs, titles)

    print('Saving documents and titles.')
    cnt, cur = 0, 0
    mode = 'w'
    for post_id, post in zip(data['post_id'], posts):
        # unpack the post
        doc, title = post
        write_conll(os.path.join(args.output, str(cur) + '.parse'),
                    post_id, title, doc,
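# Store, indices_dispatch and write_conll are helpers from the surrounding
# project; this standalone sketch shows only the core pattern of piping a
# byte-string pandas column through spaCy (values illustrative):
import pandas as pd
frame = pd.DataFrame({'text': [b'first post', b'second post']})
for parsed in nlp.pipe(frame['text'].str.decode('utf8'),
                       batch_size=16, n_threads=3):
    print(parsed.text)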
import codecs
import pickle
import sys
import time
from multiprocessing import Pool

from spacy.en import English

# Tail of generateDT (called below); the start of the function is not part of
# this excerpt.
        temp.append([token.head.orth_, t[token.head.idx]])
        dep_triple.append(temp)
    return dep_triple


if __name__ == '__main__':
    # print 'data'
    if len(sys.argv) != 3:
        print 'usage: python pyfile dir_path input_name outputname'
        exit(1)
    dir_path = sys.argv[1]
    f_input = dir_path + sys.argv[2]

    nlp = English()
    texts = []
    stime = time.time()
    with codecs.open(f_input, 'r', 'utf-8') as file:
        for line in file:
            line = line.strip()
            lineNo, sentence, tags, tags_er = line.split('\t')
            texts.append(lineNo + sentence)
    etime = time.time()
    print 'load tests time:', etime - stime

    pool = Pool(30)  # unused: nlp.pipe does its own threading via n_threads
    DT_result = []
    try:
        DT_result = [generateDT(doc)
                     for doc in nlp.pipe(texts, n_threads=30, batch_size=100)]
    except:
        print 'read file exception'
    pickle.dump(DT_result, open(dir_path + 'DT_result.p', 'wb'))
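# Reading the dumped dependency triples back; the path mirrors the script's
# hardcoded output above.
DT_result = pickle.load(open(dir_path + 'DT_result.p', 'rb'))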