Code example #1
File: featurise.py  Project: ogh/contra
def main(args):
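    # NOTE: Python 2 code. Relies on module-level stdin/stdout/stderr (from sys)
    # and on the project's own tokenize, FOCUS_DUMMY, prev_next_graph and
    # F_FUNC_BY_F_SET, none of which are shown in this excerpt. Each input line
    # is expected to carry five tab-separated fields; the second is the label
    # and the third and fifth are the pre- and post-context around the focus.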
    argp = _argparser().parse_args(args[1:])

    # TODO: Update the default to the best one we got after experiments
    # TODO: Adding a default has unforeseen consequences
    assert argp.features

    for line in (l.rstrip('\n') for l in stdin):
        _, lbl, pre, _, post = line.split('\t')
        
        # Tokenise the context
        # XXX: Discards meaningful spaces
        pre_toks = tokenize(pre.strip()).split()
        post_toks = tokenize(post.strip()).split()

        toks = pre_toks[-3:] + [FOCUS_DUMMY] + post_toks[:3]

        graph, nodes = prev_next_graph(toks)
        for node in nodes:
            if node.value == FOCUS_DUMMY:
                focus = node
                break
        else:
            assert False

        f_vec = {}
        for f_set in argp.features:
            for f_name, f_val in F_FUNC_BY_F_SET[f_set](nodes, graph, focus):
                f_vec[f_name] = f_val

        if not f_vec:
            print >> stderr, 'WARNING: No features generated!'
            continue

        stdout.write(lbl)
        stdout.write('\t')
        stdout.write(' '.join('{0}:{1}'.format(f_name, f_vec[f_name])
            for f_name in sorted(f_vec)))
        stdout.write('\n')

    return 0
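This excerpt ends at the return; the full script presumably finishes with the usual entry-point guard, along these lines (a sketch only, not part of the excerpt):

if __name__ == '__main__':
    # Hypothetical entry point; the actual file may differ.
    from sys import argv, exit
    exit(main(argv))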
Code example #2
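# NOTE: relies on module-level names defined elsewhere in the same script
# (several of them appear in the imports/constants example further below):
# np, pd, gtbtokenize, idfs, min_alpha, UNMAPPED_IDF_CONST and WC_THRES.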
def gen_word_list(term_dict):
    word_list = {}
    max_ip_count = 0
    for k in term_dict:
        term_ptrs = gtbtokenize.tokenize(term_dict[k]["label"]).split()
        for word in term_ptrs:
            if word not in word_list:
                word_list[word] = {"terms": set(), "mags": [], "magnitude": 0,
                                   "color": 0, "unique_ips": 0,
                                   "opacity": min_alpha, "label": word}
                if word.lower() in idfs:
                    word_list[word]["idf"] = idfs[word.lower()]["IDF"]
                else:
                    word_list[word]["idf"] = UNMAPPED_IDF_CONST
            word_list[word]["terms"].add(term_dict[k]["term_id"])
            word_list[word]["mags"].append(term_dict[k]["magnitude"])
            word_list[word]["unique_ips"] += term_dict[k]["unique_ips"]
            if word_list[word]["unique_ips"] > max_ip_count:
                max_ip_count = word_list[word]["unique_ips"]
    for k in word_list:
        word_list[k]["magnitude"] = np.log2(np.median(word_list[k]["mags"]) * word_list[k]["idf"])
        word_list[k]["color"] = len(word_list[k]["terms"])
        word_list[k]["opacity"] = min_alpha + (1 - min_alpha) * np.log2(word_list[k]["unique_ips"]) / np.log2(max_ip_count)
        word_list[k]["terms"] = list(word_list[k]["terms"])
    word_df = pd.DataFrame.from_dict(word_list, orient="index")
    word_df = word_df.sort_values("magnitude", ascending=False)
    # Keep only the WC_THRES highest-magnitude words and return that selection.
    sel_word_list = word_df[0:WC_THRES]
    return sel_word_list
Code example #3
def gtb_token_boundary_gen(text):
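    # Tokenize with the GTB (GENIA treebank) tokenizer, then align the
    # whitespace-split tokens back against the original text to recover their
    # boundaries; _token_boundaries_by_alignment is a module-private helper
    # that is not shown in this excerpt.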
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
Code example #4
File: extractTIABs.py  Project: spyysalo/pubmed
def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))
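A small usage illustration (a sketch only, assuming gtbtokenize is installed and that tokenize here refers to gtbtokenize.tokenize): each input line is tokenized on its own, so the line structure of an abstract is preserved.

from gtbtokenize import tokenize

def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))

# Hypothetical two-line input; tokens come back space-separated, one output
# line per input line.
abstract = "IL-2 activates T-cells.\nThe effect was dose-dependent."
for line in tokenize_multiline(abstract).split('\n'):
    print(line.split())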
Code example #5
import json, re, sys, os, collections, csv, math, time
import numpy as np
import gtbtokenize
import networkx as nx
import sklearn.metrics as metrics
from operator import itemgetter
import pandas as pd
from utils import MatrixIO, FileUtils

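# The two regexes below split camelCase identifiers into separate words; `f`
# strips everything except lowercase letters and digits, and `tokenize` wraps
# the GTB tokenizer and lower-cases its output.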
first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
f = lambda x: re.sub(r'[^a-z0-9]', "", x)
tokenize = lambda x: gtbtokenize.tokenize(x).lower()
EMBEDDING_SIZE = 100

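# Corpus constants and input paths. N is presumably the document count used
# for IDF weighting; UNMAPPED_IDF_CONST is the fallback IDF assigned to words
# missing from the IDF table (see gen_word_list above).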
N = 24358723
UNMAPPED_IDF_CONST = 4
MIN_ED = 4
vector_file = "../lod_query/biomed_vectors_p.txt"
vocab_file = "../lod_query/biomed_vocab_p.txt"
idf_file = "../lod_query/idf_file.tsv"
vocab_dict = {}
enc_vocab_dict = {}
stopWords = set([
    "a", "also", "although", "am", "an", "and", "are", ".", "NNNN", "VVVV",
    "as", "at", "back", "be", "became", "because", "become", "becomes",
    "becoming", "been", "being", "bill", "both", "bottom", "but", "by", "call",
    "can", "con", "could", "de", "do", "done", "eg", "etc", "even", "ever",
    "find", "for", "found", "from", "get", "give", "go", "had", "has", "have",
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "however", "if", "in", "inc", "into", "is", "it", "its", "itself", "keep",