def ranked_retrieval(queries, collection_table, doc_nums, inverted_index, stop_words):
    ranked_scores = {}
    for query_index, query in enumerate(queries):
        query_tokens = normalise(remove_stop_words(tokenise(query), stop_words))

        # Convert the query into an OR boolean search and evaluate it
        boolean_vectors = []
        for token in query_tokens:
            boolean_vector = collection_table[token]
            boolean_vectors.append('np.array([{}])'.format(
                array_to_string(boolean_vector)))
        query_eval_string = ' | '.join(boolean_vectors)

        # Map the boolean result to a list of matching document ids
        query_documents = boolean_search(query_eval_string, doc_nums)

        # Score each matching document against the query
        query_scores = []
        for doc in query_documents:
            score = TFIDF(doc, query_tokens, len(doc_nums), inverted_index)
            query_scores.append((doc, score))

        # Sort the scores for each query in descending order
        query_scores = sorted(query_scores, key=lambda x: x[1], reverse=True)
        ranked_scores[query_index + 1] = query_scores
    return ranked_scores
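# A minimal sketch of the TFIDF helper assumed above. It presumes that
# inverted_index maps each term to {doc_id: [positions]}; both that layout
# and the (1 + log tf) * log(N / df) weighting are assumptions, not
# necessarily the original implementation.
import math

def TFIDF(doc, query_tokens, num_docs, inverted_index):
    score = 0.0
    for term in query_tokens:
        postings = inverted_index.get(term, {})
        if doc not in postings:
            continue
        tf = len(postings[doc])   # term frequency in this document
        df = len(postings)        # number of documents containing the term
        score += (1 + math.log10(tf)) * math.log10(num_docs / df)
    return score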
def computerelations(self, graph):
    for tweet in self:
        tokens = preprocess.tokenise(tweet.message)
        for token in tokens:
            # Does this token look like Twitter's @recipient syntax?
            if token and token[0] == '@':
                user = token[1:]
                self.addrelation(user)
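# A minimal sketch of the addrelation method called above, assuming the
# collection object keeps a {user: mention_count} dict; the attribute name
# `relations` and the counting behaviour are assumptions, not the original code.
def addrelation(self, user):
    if not hasattr(self, 'relations'):
        self.relations = {}
    self.relations[user] = self.relations.get(user, 0) + 1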
def get_word_indices_in_text(words, text):
    text_token = tokenise(text)
    word_indices = []
    for word in words:
        # index() raises ValueError if the word does not occur in the text
        word_index = text_token.index(word.lower())
        word_indices.append(word_index + 1)  # positions are 1-based
    return word_indices
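# Hypothetical usage, assuming tokenise lower-cases and splits on whitespace:
# get_word_indices_in_text(['quick', 'fox'], 'The quick brown fox')
# returns [2, 4]; a word missing from the text raises ValueError.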
def load_docs_from_trec(trec_file=TREC_SAMPLE_FILE):
    # Load the provided TREC sample XML and preprocess each document
    root = load_xml(trec_file, './DOC')
    doc_list = []
    token_doc_list = []
    tokenised_docs = {}
    doc_nums = []
    test_list = []
    for doc in root:
        doc_no = doc.find('DOCNO').text
        headline = doc.find('HEADLINE').text
        text = doc.find('TEXT').text
        headline_with_text = headline + ' ' + text
        doc_nums.append(doc_no)
        doc_list.append(headline_with_text)
        test_list.append(tokenise(headline_with_text))
        tokens = preprocess(headline_with_text)
        token_doc_list.append(tokens)
        tokenised_docs[doc_no] = tokens
    return doc_nums, token_doc_list, tokenised_docs
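# A minimal sketch of the load_xml helper assumed above, built on
# xml.etree.ElementTree; the (path, selector) signature is an assumption.
import xml.etree.ElementTree as ET

def load_xml(path, selector):
    # Parse the file and return all elements matching the selector,
    # e.g. './DOC' for the TREC sample collection.
    tree = ET.parse(path)
    return tree.getroot().findall(selector)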
def extract_features(filename):
    """Open and tokenise the contents of a file."""
    return tokenise(read_corpus_file(filename))
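# A minimal sketch of the read_corpus_file helper, assuming the corpus is a
# plain UTF-8 text file read in one go; encoding and signature are assumptions.
def read_corpus_file(filename):
    with open(filename, encoding='utf-8') as corpus_file:
        return corpus_file.read()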
def prepare_text(filename):
    return splitsentences(tokenise(readcorpusfile(filename)))
def preprocess(doc):
    # Tokenise, remove stop words, then stem
    return stemming(remove_stop_words(tokenise(doc), stop_words))
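# Minimal sketches of the helpers composed by preprocess. The regex tokeniser
# and NLTK's Porter stemmer are assumptions; the original code may use
# different implementations.
import re
from nltk.stem import PorterStemmer

def tokenise(text):
    return re.findall(r"[a-z0-9]+", text.lower())

def remove_stop_words(tokens, stop_words):
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]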