Example #1
def main(args):
    argp = _argparser().parse_args(args[1:])

    # TODO: Update the default to the best one we got after experiments
    # TODO: Adding a default has unforeseen consequences
    assert argp.features

    for line in (l.rstrip('\n') for l in stdin):
        _, lbl, pre, _, post = line.split('\t')
        
        # Tokenise the context
        # XXX: Discards meaningful spaces
        pre_toks = tokenize(pre.strip()).split()
        post_toks = tokenize(post.strip()).split()

        toks = pre_toks[-3:] + [FOCUS_DUMMY] + post_toks[:3]

        graph, nodes = prev_next_graph(toks)
        for node in nodes:
            if node.value == FOCUS_DUMMY:
                focus = node
                break
        else:
            assert False

        f_vec = {}
        for f_set in argp.features:
            for f_name, f_val in F_FUNC_BY_F_SET[f_set](nodes, graph, focus):
                f_vec[f_name] = f_val

        if not f_vec:
            stderr.write('WARNING: No features generated!\n')
            continue

        stdout.write(lbl)
        stdout.write('\t')
        stdout.write(' '.join('{0}:{1}'.format(f_name, f_vec[f_name])
            for f_name in sorted(f_vec)))
        stdout.write('\n')

    return 0
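For orientation, a minimal sketch of the I/O contract that the main() above assumes, inferred from its line parsing and write calls; the sample field values are illustrative and the feature functions behind F_FUNC_BY_F_SET are project-specific and not reproduced here.

# Input (stdin): five tab-separated fields per line; only fields 2 (label),
# 3 (pre-context) and 5 (post-context) are read:
example_input = '42\tDISEASE\tthe patient developed\t-\tafter surgery'

# Output (stdout): the label, a tab, then space-joined "name:value" pairs
# sorted by feature name (the actual names depend on F_FUNC_BY_F_SET):
example_output = 'DISEASE\t<f_name>:<f_val> <f_name>:<f_val>'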
Example #3
def gen_word_list(term_dict):
    # Aggregate per-word statistics over the tokenised term labels.
    word_list = {}
    max_ip_count = 0
    for k in term_dict:
        term_ptrs = gtbtokenize.tokenize(term_dict[k]["label"]).split()
        for word in term_ptrs:
            if word not in word_list:
                word_list[word] = {"terms": set(), "mags": [], "magnitude": 0,
                                   "color": 0, "unique_ips": 0,
                                   "opacity": min_alpha, "label": word}
                if word.lower() in idfs:
                    word_list[word]["idf"] = idfs[word.lower()]["IDF"]
                else:
                    word_list[word]["idf"] = UNMAPPED_IDF_CONST
            word_list[word]["terms"].add(term_dict[k]["term_id"])
            word_list[word]["mags"].append(term_dict[k]["magnitude"])
            word_list[word]["unique_ips"] += term_dict[k]["unique_ips"]
            if word_list[word]["unique_ips"] > max_ip_count:
                max_ip_count = word_list[word]["unique_ips"]
    # Derive the display attributes for each word.
    for k in word_list:
        word_list[k]["magnitude"] = np.log2(np.median(word_list[k]["mags"]) * word_list[k]["idf"])
        word_list[k]["color"] = len(word_list[k]["terms"])
        word_list[k]["opacity"] = min_alpha + (1 - min_alpha) * np.log2(word_list[k]["unique_ips"]) / np.log2(max_ip_count)
        word_list[k]["terms"] = list(word_list[k]["terms"])
    # Keep only the WC_THRES highest-magnitude words.
    word_df = pd.DataFrame.from_dict(word_list, orient="index")
    word_df = word_df.sort_values("magnitude", ascending=False)
    sel_word_list = word_df[0:WC_THRES]
    return sel_word_list
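A hedged sketch of the term_dict structure gen_word_list() expects, inferred from the fields it reads; the keys and values below are illustrative. The function also relies on module-level globals: UNMAPPED_IDF_CONST appears in the constants shown further down this page, while idfs, min_alpha and WC_THRES are defined elsewhere in the same module and are not shown.

# Each term entry must provide the four fields gen_word_list() reads;
# the concrete values here are made up for illustration.
term_dict = {
    "T1": {"label": "IL-2 receptor", "term_id": "T1", "magnitude": 3.2, "unique_ips": 12},
    "T2": {"label": "receptor binding", "term_id": "T2", "magnitude": 1.7, "unique_ips": 5},
}
# word_df = gen_word_list(term_dict)  # top-WC_THRES words by magnitude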
Example #4
def gtb_token_boundary_gen(text):
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
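The helper _token_boundaries_by_alignment() is not shown on this page; below is a hypothetical sketch of such an offset-alignment helper, assuming the tokens produced by gtbtokenize.tokenize() appear verbatim in the original text. The real implementation in the source project may differ.

from gtbtokenize import tokenize

def _token_boundaries_by_alignment(tokens, text):
    # Hypothetical sketch: locate each token in the original text, left to
    # right, and yield its (start, end) character offsets.
    offset = 0
    for tok in tokens:
        start = text.index(tok, offset)
        end = start + len(tok)
        yield (start, end)
        offset = end

if __name__ == '__main__':
    text = "IL-2 activates NF-kappaB."
    print(list(_token_boundaries_by_alignment(tokenize(text).split(), text)))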
Example #5
def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))
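A minimal usage sketch for tokenize_multiline(), assuming tokenize refers to gtbtokenize.tokenize as in the other examples on this page; the sample text is illustrative.

from gtbtokenize import tokenize

def tokenize_multiline(text):
    # Tokenize line by line so that the newline structure is preserved.
    return '\n'.join(tokenize(s) for s in text.split('\n'))

print(tokenize_multiline("IL-2 gene expression.\nNF-kappa B activation."))
# Each line comes back with tokens separated by single spaces; the exact
# token boundaries depend on gtbtokenize.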
import json, re, sys, os, collections, csv, math, time
import numpy as np
import gtbtokenize
import networkx as nx
import sklearn.metrics as metrics
from operator import itemgetter
import pandas as pd
from utils import MatrixIO, FileUtils

first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
f = lambda x: re.sub(r'[^a-z0-9]', "", x)
tokenize = lambda x: gtbtokenize.tokenize(x).lower()
EMBEDDING_SIZE = 100

N = 24358723
UNMAPPED_IDF_CONST = 4
MIN_ED = 4
vector_file = "../lod_query/biomed_vectors_p.txt"
vocab_file = "../lod_query/biomed_vocab_p.txt"
idf_file = "../lod_query/idf_file.tsv"
vocab_dict = {}
enc_vocab_dict = {}
stopWords = set([
    "a", "also", "although", "am", "an", "and", "are", ".", "NNNN", "VVVV",
    "as", "at", "back", "be", "became", "because", "become", "becomes",
    "becoming", "been", "being", "bill", "both", "bottom", "but", "by", "call",
    "can", "con", "could", "de", "do", "done", "eg", "etc", "even", "ever",
    "find", "for", "found", "from", "get", "give", "go", "had", "has", "have",
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "however", "if", "in", "inc", "into", "is", "it", "its", "itself", "keep",