Example #1
0
            # Build inverted-index postings (token -> [(sha1, positions)]) for
            # one group of documents.  NOTE(review): fragment — enclosing def
            # is outside this view.
            postings = defaultdict(lambda: [])
            bdata = index_pb.BuilderData()
            docs = []

            for doc in docgroup:
                # A falsy entry marks the end of the group.
                if not doc: break

                (title, ns, sha1, text) = doc

                # Index only main-namespace ('0') articles that have
                # non-empty, non-redirect wikitext.
                if ns != '0': continue
                if not text: continue  # empty text shouldn't occur — skip defensively
                if text[:9].lower() == ('#redirect'): continue

                # Strip wiki markup, then tokenise body and title into
                # (position, span) index lists.
                text = unwiki(text)
                itokens = list(itokenise(text))
                itokens_title = list(itokenise(title))

                tokens = normalise(utils.tokens(text, itokens))
                # Title tokens get negated positions — presumably to keep
                # them distinct from body positions; TODO confirm against
                # negate_tokens.
                tokens_title = negate_tokens(
                    normalise(utils.tokens(title, itokens_title)))
                tokens_all = tokens_title + tokens
                if not tokens_all: continue

                # Per-article token frequency counts.
                article_tokens = Counter()

                # token -> list of positions within this one document
                thisdoc_postings = defaultdict(lambda: [])
                for i, w in tokens_all:
                    article_tokens[w] += 1
                    thisdoc_postings[w].append(i)
                # NOTE(review): iteritems() is Python 2 only; loop body is
                # cut off at this view boundary.
                for w, l in thisdoc_postings.iteritems():
Example #2
0
def tokens(string, ilist=None):
    """Return the substrings of *string* covered by the (from, to) spans in *ilist*.

    When *ilist* is omitted, the spans are computed with itokenise(string).
    """
    spans = itokenise(string) if ilist is None else ilist
    return [string[start:stop] for start, stop in spans]
Example #3
0
def tokens(string, ilist=None):
    """Slice *string* into token substrings using half-open (from, to) pairs.

    Falls back to itokenise(string) when no span list is supplied.
    """
    if ilist is None:
        ilist = itokenise(string)
    pieces = []
    for lo, hi in ilist:
        pieces.append(string[lo:hi])
    return pieces
Example #4
0
            # Build inverted-index postings (token -> [(sha1, positions)]) for
            # one group of documents.  NOTE(review): fragment — enclosing def
            # is outside this view.
            postings = defaultdict(lambda: [])
            bdata = index_pb.BuilderData()
            docs = []

            for doc in docgroup:
                # A falsy entry marks the end of the group.
                if not doc: break

                (title, ns, sha1, text) = doc

                # Index only main-namespace ('0') articles that have
                # non-empty, non-redirect wikitext.
                if ns != '0': continue
                if not text: continue # empty text shouldn't occur — skip defensively
                if text[:9].lower() == ('#redirect'): continue


                # Strip wiki markup, then tokenise body and title into
                # (position, span) index lists.
                text = unwiki(text)
                itokens = list(itokenise(text))
                itokens_title = list(itokenise(title))

                tokens = normalise(utils.tokens(text, itokens))
                # Title tokens get negated positions — presumably to keep
                # them distinct from body positions; TODO confirm against
                # negate_tokens.
                tokens_title = negate_tokens(normalise(utils.tokens(title, itokens_title)))
                tokens_all = tokens_title + tokens
                if not tokens_all: continue

                # Per-article token frequency counts.
                article_tokens = Counter()

                # token -> list of positions within this one document
                thisdoc_postings = defaultdict(lambda: [])
                for i, w in tokens_all:
                    article_tokens[w] += 1
                    thisdoc_postings[w].append(i)
                # NOTE(review): iteritems() is Python 2 only.
                # Fold this document's per-token position lists into the
                # group-wide postings, keyed by the document's sha1.
                for w, l in thisdoc_postings.iteritems():
                    postings[w].append((sha1, l))