Example #1
def main(args):
    argp = _argparser().parse_args(args[1:])

    # TODO: Update the default to the best one we got after experiments
    # TODO: Adding a default has unforeseen consequences
    assert argp.features

    for line in (l.rstrip('\n') for l in stdin):
        _, lbl, pre, _, post = line.split('\t')
        
        # Tokenise the context
        # XXX: Discards meaningful spaces
        pre_toks = tokenize(pre.strip()).split()
        post_toks = tokenize(post.strip()).split()

        toks = pre_toks[-3:] + [FOCUS_DUMMY] + post_toks[:3]

        graph, nodes = prev_next_graph(toks)
        for node in nodes:
            if node.value == FOCUS_DUMMY:
                focus = node
                break
        else:
            assert False

        f_vec = {}
        for f_set in argp.features:
            for f_name, f_val in F_FUNC_BY_F_SET[f_set](nodes, graph, focus):
                f_vec[f_name] = f_val

        if not f_vec:
            stderr.write('WARNING: No features generated!\n')
            continue

        stdout.write(lbl)
        stdout.write('\t')
        stdout.write(' '.join('{0}:{1}'.format(f_name, f_vec[f_name])
            for f_name in sorted(f_vec)))
        stdout.write('\n')

    return 0
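For orientation, a minimal sketch of the I/O contract that the main() above assumes, inferred from its line parsing and write calls; the sample field values are illustrative and the feature functions behind F_FUNC_BY_F_SET are project-specific and not reproduced here.

# Input (stdin): five tab-separated fields per line; only fields 2 (label),
# 3 (pre-context) and 5 (post-context) are read:
example_input = '42\tDISEASE\tthe patient developed\t-\tafter surgery'

# Output (stdout): the label, a tab, then space-joined "name:value" pairs
# sorted by feature name (the actual names depend on F_FUNC_BY_F_SET):
example_output = 'DISEASE\t<f_name>:<f_val> <f_name>:<f_val>'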
Example #3
def gen_word_list(term_dict):
    # Aggregate per-word statistics over the tokenised term labels.
    word_list = {}
    max_ip_count = 0
    for k in term_dict:
        term_ptrs = gtbtokenize.tokenize(term_dict[k]["label"]).split()
        for word in term_ptrs:
            if word not in word_list:
                word_list[word] = {"terms": set(), "mags": [], "magnitude": 0,
                                   "color": 0, "unique_ips": 0,
                                   "opacity": min_alpha, "label": word}
                if word.lower() in idfs:
                    word_list[word]["idf"] = idfs[word.lower()]["IDF"]
                else:
                    word_list[word]["idf"] = UNMAPPED_IDF_CONST
            word_list[word]["terms"].add(term_dict[k]["term_id"])
            word_list[word]["mags"].append(term_dict[k]["magnitude"])
            word_list[word]["unique_ips"] += term_dict[k]["unique_ips"]
            if word_list[word]["unique_ips"] > max_ip_count:
                max_ip_count = word_list[word]["unique_ips"]
    # Derive the display attributes for each word.
    for k in word_list:
        word_list[k]["magnitude"] = np.log2(np.median(word_list[k]["mags"]) * word_list[k]["idf"])
        word_list[k]["color"] = len(word_list[k]["terms"])
        word_list[k]["opacity"] = min_alpha + (1 - min_alpha) * np.log2(word_list[k]["unique_ips"]) / np.log2(max_ip_count)
        word_list[k]["terms"] = list(word_list[k]["terms"])
    # Keep only the WC_THRES highest-magnitude words.
    word_df = pd.DataFrame.from_dict(word_list, orient="index")
    word_df = word_df.sort_values("magnitude", ascending=False)
    sel_word_list = word_df[0:WC_THRES]
    return sel_word_list
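A hedged sketch of the term_dict structure gen_word_list() expects, inferred from the fields it reads; the keys and values below are illustrative. The function also relies on module-level globals: UNMAPPED_IDF_CONST appears in the constants shown further down this page, while idfs, min_alpha and WC_THRES are defined elsewhere in the same module and are not shown.

# Each term entry must provide the four fields gen_word_list() reads;
# the concrete values here are made up for illustration.
term_dict = {
    "T1": {"label": "IL-2 receptor", "term_id": "T1", "magnitude": 3.2, "unique_ips": 12},
    "T2": {"label": "receptor binding", "term_id": "T2", "magnitude": 1.7, "unique_ips": 5},
}
# word_df = gen_word_list(term_dict)  # top-WC_THRES words by magnitude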
Example #4
def gtb_token_boundary_gen(text):
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
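The helper _token_boundaries_by_alignment() is not shown on this page; below is a hypothetical sketch of such an offset-alignment helper, assuming the tokens produced by gtbtokenize.tokenize() appear verbatim in the original text. The real implementation in the source project may differ.

from gtbtokenize import tokenize

def _token_boundaries_by_alignment(tokens, text):
    # Hypothetical sketch: locate each token in the original text, left to
    # right, and yield its (start, end) character offsets.
    offset = 0
    for tok in tokens:
        start = text.index(tok, offset)
        end = start + len(tok)
        yield (start, end)
        offset = end

if __name__ == '__main__':
    text = "IL-2 activates NF-kappaB."
    print(list(_token_boundaries_by_alignment(tokenize(text).split(), text)))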
Example #5
def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))
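A minimal usage sketch for tokenize_multiline(), assuming tokenize refers to gtbtokenize.tokenize as in the other examples on this page; the sample text is illustrative.

from gtbtokenize import tokenize

def tokenize_multiline(text):
    # Tokenize line by line so that the newline structure is preserved.
    return '\n'.join(tokenize(s) for s in text.split('\n'))

print(tokenize_multiline("IL-2 gene expression.\nNF-kappa B activation."))
# Each line comes back with tokens separated by single spaces; the exact
# token boundaries depend on gtbtokenize.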
import json, re, sys, os, collections, csv, math, time
import numpy as np
import gtbtokenize
import networkx as nx
import sklearn.metrics as metrics
from operator import itemgetter
import pandas as pd
from utils import MatrixIO, FileUtils

first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
f = lambda x: re.sub(r'[^a-z0-9]', "", x)
tokenize = lambda x: gtbtokenize.tokenize(x).lower()
EMBEDDING_SIZE = 100

N = 24358723
UNMAPPED_IDF_CONST = 4
MIN_ED = 4
vector_file = "../lod_query/biomed_vectors_p.txt"
vocab_file = "../lod_query/biomed_vocab_p.txt"
idf_file = "../lod_query/idf_file.tsv"
vocab_dict = {}
enc_vocab_dict = {}
stopWords = set([
    "a", "also", "although", "am", "an", "and", "are", ".", "NNNN", "VVVV",
    "as", "at", "back", "be", "became", "because", "become", "becomes",
    "becoming", "been", "being", "bill", "both", "bottom", "but", "by", "call",
    "can", "con", "could", "de", "do", "done", "eg", "etc", "even", "ever",
    "find", "for", "found", "from", "get", "give", "go", "had", "has", "have",
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "however", "if", "in", "inc", "into", "is", "it", "its", "itself", "keep",