Code example #1
class SpacyParser(object):
    '''https://spacy.io/#example-use'''
    def __init__(self, num_threads=4):
        self.num_threads = num_threads
        self.nlp = English(tokenizer=True, parser=True, tagger=True,
                           entity=None, matcher=None)
    
    def parse(self, doc, doc_id=None):
        """Parse a raw document (a string) and yield one Sentence per sentence."""
        if len(doc.strip()) == 0:
            return
        doc = doc.decode("utf-8")
        for doc in self.nlp.pipe([doc], batch_size=50,
                                 n_threads=self.num_threads):
            assert doc.is_parsed
                    
        for sent_id, sent in enumerate(doc.sents):
            tokens = [t for t in sent]
            token_idxs = [t.idx for t in sent]
            words = [t.text for t in sent]
            lemmas = [self.nlp.vocab.strings[t.lemma] for t in tokens]
            poses = [self.nlp.vocab.strings[t.tag] for t in tokens]
            dep_labels = [self.nlp.vocab.strings[t.dep] for t in tokens]
            # index tokens to determine sentence offset for dependency tree
            token_idx = {t: i for i, t in enumerate(tokens)}
            dep_parents = [token_idx[t.head] for t in tokens]
            
            s = Sentence(words=words, lemmas=lemmas, poses=poses,
                         dep_parents=dep_parents, dep_labels=dep_labels,
                         sent_id=sent_id, doc_id=doc_id, text=sent.text,
                         token_idxs=token_idxs, doc_name=doc_id)

            yield s
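For orientation, here is a minimal usage sketch for SpacyParser. It assumes the legacy spaCy 1.x API the class is written against (from spacy.en import English) and a project-specific Sentence container whose constructor fields are readable as attributes; the document text and identifiers are purely illustrative.

# Hypothetical driver, not part of the original example. Assumes:
#   from spacy.en import English        # legacy spaCy 1.x API
#   from <project> import Sentence      # project-specific container
parser = SpacyParser(num_threads=2)
raw = "Barack Obama was born in Hawaii. He moved to Chicago later."
for sentence in parser.parse(raw, doc_id="doc-001"):
    # Assuming Sentence exposes its constructor fields as attributes.
    print(sentence.words, sentence.dep_labels)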
Code example #2
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None
    else:
        raise ValueError('Unsupported language: {}'.format(lang))

    for i, doc in enumerate(
            NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)):
        phrases = set()
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws)
                              for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)

        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(
                batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
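The batch function above leans on several imports that sit outside the snippet, plus a rep_text helper that is never shown. The sketch below lists the imports it appears to assume (spaCy 1.x era) and guesses at rep_text from the later '!LONGWORD!' filter; treat the helper as hypothetical.

# Imports the process() snippet appears to assume (spaCy 1.x era).
import io
import logging
from os import path

from preshed.counter import PreshCounter   # integer-keyed frequency counter
from spacy.strings import StringStore      # interns strings as integer IDs


def rep_text(text, max_word_len=25):
    # Hypothetical stand-in for the missing helper: the '!LONGWORD!' check in
    # the output loop suggests overly long tokens are replaced with a marker.
    return '!LONGWORD! ' if len(text.strip()) > max_word_len else text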
Code example #3
File: preprocess.py (project: vijeth8/lda2vec-tf)
def tokenize(texts,
             max_length,
             skip=-2,
             attr=LOWER,
             merge=False,
             nlp=None,
             **kwargs):
    """Tokenize texts with spaCy and return a (len(texts), max_length) array of
    token-attribute IDs together with an ID-to-string vocabulary."""
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(nlp.pipe(texts, **kwargs)):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Iterate over named entities
            for ent in doc.ents:
                if len(ent) > 1:
                    # Merge them into single tokens
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            dat = dat.astype('int32')
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
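A hedged sketch of the imports this variant assumes and one illustrative call; English here is the legacy spacy.en entry point and the attribute constants come from spacy.attrs.

# Assumed imports for the tokenize() snippet above (legacy spaCy 1.x API).
import numpy as np
from spacy.attrs import LOWER, LIKE_EMAIL, LIKE_URL
from spacy.en import English

# Illustrative call: pad/trim each document to 50 tokens, merging noun
# phrases and named entities into single tokens first.
texts = [u"New York is known for good ideas.",
         u"Write to someone@example.com for details."]
arr, vocab = tokenize(texts, max_length=50, merge=True, nlp=English())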
Code example #4
class Tokenizer:
    """class for tokenizing documents"""
    def __init__(self):
        self.nlp = English(tag=True, parse=False, entity=False)

    def tokenize(self, documents, batch_size=1000):
        """tokenize a set of documents

        uses the lemma of each token

        :param documents: documents to tokenize
        :type documents: list of str

        :param batch_size: batch size for processing documents
        :type batch_size: int

        :returns: tokenized documents
        :rtype: list of list of str
        """

        return [[
            token.lemma_ for token in doc if self._include(token)
        ] for doc in self.nlp.pipe(
            documents, entity=False, batch_size=batch_size, n_threads=4)]

    @staticmethod
    def _include(token):
        """whether to include a token

        :param token: token to check
        :type token: spacy.tokens.token.Token

        :returns: whether to include
        :rtype: boolean
        """

        return (not token.is_punct and token.lemma_ not in STOP_WORDS
                and token.lemma_.strip() != '' and not token.like_num
                and not token.like_url)
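A short usage sketch for the Tokenizer class. It assumes the legacy spacy.en English constructor used above and a STOP_WORDS set of lemmas defined in the same module, which the original snippet does not show.

# Hypothetical usage; STOP_WORDS must exist in the module that defines Tokenizer.
tokenizer = Tokenizer()
docs = [u"The cats are sitting on the mat.",
        u"Visit http://example.com for 42 reasons!"]
for tokens in tokenizer.tokenize(docs, batch_size=2):
    # Lemmas with punctuation, numbers, URLs and stop words filtered out.
    print(tokens)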
Code example #5
File: ms_marco_eval.py (project: fuxihao66/RCProj)
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
    p_iter (iter): iter over strings to normalize and tokenize.
    p_batch_size (int): number of documents per batch passed to NLP.pipe.
    p_thread_count (int): number of worker threads for NLP.pipe.

    Returns:
    iter: iter over normalized and tokenized string.
    """

    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)

    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)

    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
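A sketch of how the generator might be driven. The module-level pieces it relies on are assumed to be NLP = None and from spacy.en import English as NlpEnglish; neither is shown in the snippet itself.

# Assumed module-level setup (not shown in the original snippet):
#   from spacy.en import English as NlpEnglish
#   NLP = None
queries = [u"What is the   Capital of France?", u"DEFINE: gradient descent"]
for normalized in normalize_batch(iter(queries), p_batch_size=2, p_thread_count=1):
    print(normalized)   # lower-cased, whitespace-stripped, space-joined tokens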
Code example #6
def preprocess(texts):
    nlp = English()
    docs = nlp.pipe(texts)

    for doc in docs:
        for np in doc.noun_chunks:
            # Only keep adjectives and nouns, e.g. "good ideas"
            while len(np) > 1 and np[0].dep_ not in ('amod', 'compound'):
                np = np[1:]
            if len(np) > 1:
                # Merge the tokens, e.g. good_ideas
                np.merge(np.root.tag_, np.text, np.root.ent_type_)
        # Iterate over named entities
        for ent in doc.ents:
            if len(ent) > 1:
                # Merge them into single tokens
                ent.merge(ent.root.tag_, ent.text, ent.label_)

        sentences = []

        for sent in doc.sents:
            sentences.append([token.text for token in sent])

        yield sentences
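An illustrative call for preprocess(), assuming the legacy spacy.en English class. Each yielded item is one document as a list of sentences, where every sentence is a list of token strings with multi-word noun phrases and entities merged.

# Hypothetical driver; assumes `from spacy.en import English` is available.
texts = [u"New York has some good ideas. Barack Obama visited the city."]
for sentences in preprocess(texts):
    for sent in sentences:
        print(sent)   # e.g. [u'New York', u'has', ...]  (output illustrative)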
Code example #7
def tokenize(texts,
             max_length,
             skip=-2,
             attr=LOWER,
             merge=False,
             nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array
    of indices.

    This method stores a global NLP object in memory, and takes
    up to a minute to run for the first time. Later calls will have the
    tokenizer in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number, it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. The choice must be in spacy.attrs;
        common choices are LOWER and LEMMA.
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : spaCy Language object, optional
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents
        the word index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index gets
        mapped to '<SKIP>'.

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.iteritems()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(nlp.pipe(texts, **kwargs)):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Iterate over named entities
            for ent in doc.ents:
                if len(ent) > 1:
                    # Merge them into single tokens
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            dat = dat.astype('int32')
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
Code example #8
File: preprocess.py (project: scoutexchange/lda2vec)
def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array
    of indices.

    This method stores a global NLP object in memory, and takes
    up to a minute to run for the first time. Later calls will have the
    tokenizer in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number, it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. The choice must be in spacy.attrs;
        common choices are LOWER and LEMMA.
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : spaCy Language object, optional
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents
        the word index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index gets
        mapped to '<SKIP>'.

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.iteritems()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(tqdm(list(nlp.pipe(texts, **kwargs)),
                                   desc="tokenizing")):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Iterate over named entities
            for ent in doc.ents:
                if len(ent) > 1:
                    # Merge them into single tokens
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            dat = dat.astype('int32')
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
Code example #9
    parser.add_argument('-i', dest='indices', default='')
    parser.add_argument('-o', dest='output', default='parses')
    args = parser.parse_args()

    with Store(args.dataset) as load:
        if args.indices and args.indices in indices_dispatch:
            data = load.select(indices_dispatch[args.indices](load))
        else:
            print('No valid indices selected. Continuing with all data.')
            data = load.data

        nlp = English()

        unicode_posts = data['text'].str.decode('utf8')
        print('Processing {0} posts'.format(len(unicode_posts)))
        docs = nlp.pipe(unicode_posts, batch_size=16, n_threads=3)
        unicode_titles = data['title'].str.decode('utf8')
        titles = nlp.pipe(unicode_titles, batch_size=16, n_threads=3)
        posts = zip(docs, titles)

        print('Saving documents and titles.')
        cnt, cur = 0, 0
        mode = 'w'
        for post_id, post in zip(data['post_id'], posts):
            # unpack the post
            doc, title = post
            write_conll(os.path.join(args.output,
                                     str(cur) + '.parse'),
                        post_id,
                        title,
                        doc,
Code example #10

if __name__ == '__main__':
    #print 'data'

    if len(sys.argv) != 3:
        print 'usage: python pyfile dir_path input_name'
        exit(1)
    dir_path = sys.argv[1]
    f_input = dir_path + sys.argv[2]

    nlp = English()
    texts = []
    stime = time.time()
    with codecs.open(f_input, 'r', 'utf-8') as file:
        for line in file:
            line = line.strip()
            lineNo, sentence, tags, tags_er = line.split('\t')
            texts.append(lineNo + sentence)
    etime = time.time()
    print 'load tests time:', etime - stime

    pool = Pool(30)  # NOTE: unused; nlp.pipe does its own threading via n_threads
    try:
        DT_result = [
            generateDT(doc)
            for doc in nlp.pipe(texts, n_threads=30, batch_size=100)
        ]
    except Exception:
        print 'parse exception'
        DT_result = []
    pickle.dump(DT_result, open(dir_path + 'DT_result.p', 'wb'))
Code example #11
File: generateDT.py (project: wujsAct/knowledgeGraph)
        temp.append([token.head.orth_, t[token.head.idx]])
        dep_triple.append(temp)
    return dep_triple

    
if __name__ == '__main__':
    # print 'data'

    if len(sys.argv) != 3:
        print 'usage: python pyfile dir_path input_name'
        exit(1)
    dir_path = sys.argv[1]
    f_input = dir_path + sys.argv[2]

    nlp = English()
    texts = []
    stime = time.time()
    with codecs.open(f_input, 'r', 'utf-8') as file:
        for line in file:
            line = line.strip()
            lineNo, sentence, tags, tags_er = line.split('\t')
            texts.append(lineNo + sentence)
    etime = time.time()
    print 'load tests time:', etime - stime

    pool = Pool(30)  # NOTE: unused; nlp.pipe does its own threading via n_threads
    try:
        DT_result = [generateDT(doc)
                     for doc in nlp.pipe(texts, n_threads=30, batch_size=100)]
    except Exception:
        print 'parse exception'
        DT_result = []
    pickle.dump(DT_result, open(dir_path + 'DT_result.p', 'wb'))