Example #1
    def genia_tokenizer(self):
        '''Tokenize each example's sentence text with the GENIA tagger.'''
        tagger = GeniaTagger('./tools/geniatagger-3.0.2/geniatagger')
        with open('./chemprot_test_gs/new_testing_examples.json', 'r') as f:
            training_examples = json.load(f)
            # print(len(training_examples))
            for i in training_examples:
                # Each parsed item is a (word, base form, POS tag, chunk tag, NE tag) tuple.
                tokenized_tuple = tagger.parse(i['sentence'])
                token_list = []
                for output in tokenized_tuple:
                    # Skip bare punctuation tokens.
                    if output[0] in string.punctuation:
                        continue
                    pos = output[2]
                    # Skip tokens whose POS tag is the token itself (punctuation-style tags).
                    if output[0] == pos:
                        continue
                    if output[0].endswith('..'):
                        token = output[0][:-2]
                    elif pos == 'CD':
                        # Collapse cardinal numbers into a NUM placeholder.
                        token = 'NUM'
                    else:
                        token = output[0]
                    token_list.append(token)
                i['sentence'] = ' '.join(token_list)
        # with open('./chemprot_training/train_tokenized.json', 'w+') as j:
        with open('./chemprot_test_gs/testing_tokenized.json', 'w+') as j:
            json.dump(training_examples, j, indent=4)
from pathlib import Path

from geniatagger import GeniaTagger


def load_tagger(fp: Path):
    """
    Load the GENIA Tagger.

    :param fp: file path of the GENIA Tagger executable
    :return: a ready-to-use GeniaTagger instance
    """
    tagger = GeniaTagger(fp)
    return tagger
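# Usage sketch (added for illustration; the install path below is only an
# example). parse() returns one (word, base form, POS tag, chunk tag, NE tag)
# tuple per token, which is how the other snippets on this page unpack it.
if __name__ == '__main__':
    demo_tagger = load_tagger(Path('/usr/local/geniatagger/geniatagger'))
    for word, base, pos, chunk, ne in demo_tagger.parse('p53 inhibits tumour growth.'):
        print(word, pos, ne)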
def loadParsingTools():
    global tagger, sentencesplitter

    # Get the locations of geniatagger and lingpipe relative to this script
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    geniaPath = os.path.join(scriptPath, "../../Geniatagger/geniatagger-3.0.1/geniatagger")
    lingpipePath = os.path.join(scriptPath, "../../Lingpipe/LingpipeSentenceSplitter/run.sh")

    # Check they are there
    if not os.path.isfile(geniaPath):
        raise RuntimeError("Cannot access GeniaTagger. Tried: " + geniaPath)
    elif not os.path.isfile(lingpipePath):
        raise RuntimeError("Cannot access LingPipe. Tried: " + lingpipePath)

    tagger = GeniaTagger(geniaPath)
    sentencesplitter = LingPipe(lingpipePath)
    def __init__(self, model, topn, alpha, tagger, complex_freq, simple_freq,
                 freq_t, char_ngram):

        logger.info("Instatiating Simple Science Simplifier...")

        self.model = unpickle(model)
        logger.info("Loaded embeddings models from : `{}`".format(model))
        self.topn = topn
        self.alpha = alpha
        self.tagger = GeniaTagger(tagger)
        logger.info("Loaded Genia PoS tagger from : `{}`".format(tagger))
        self.complex_freq = unpickle(complex_freq)
        logger.info(
            "Loaded Complex Word Frequencies from : `{}`".format(complex_freq))
        self.simple_freq = unpickle(simple_freq)
        logger.info(
            "Loaded Simple Word Frequencies from : `{}`".format(simple_freq))
        self.freq_t = freq_t
        self.char_ngram = char_ngram
Example #5
from geniatagger import GeniaTagger
import codecs
import sys
tagger = GeniaTagger('/Users/ruichen/DL/geniatagger-3.0.2/geniatagger')


read_dir = '/Users/ruichen/DL/bio_de_mt/pubmed_en_fr_separate/'

en_fn = read_dir + 'pubmed_en.txt'
fr_fn = read_dir + 'pubmed_fr.txt'

# UTF8Writer = codecs.getwriter('utf8')
# sys.stdout = UTF8Writer(sys.stdout)


def decode_list(l):
    return [tuple(word.decode('utf-8') for word in tp) for tp in l]

#sys.stdout = codecs.getwriter('utf8')(sys.stdout)

def tag():

    out_fn = "test.txt"
    #out_fn = "genia_raw_tag_pubmed_en_fr.txt" 
    out_file = codecs.open(out_fn, encoding='utf-8', mode='w+')
    
    with codecs.open(en_fn, encoding='utf-8') as ef:
        with codecs.open(fr_fn, encoding = 'utf-8') as ff: 
            for en, fr in zip(ef, ff):
    #             parse_result = tagger.parse(line.encode('utf-8'))
    #             print parse_result
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from pywsd.lesk import simple_lesk
from geniatagger import GeniaTagger
import pandas as pd
import nltk
from general_functions import *

geni = GeniaTagger('/home/aoguntuga/myVirtualEnvs/sent2/geniatagger/geniatagger')

xl_file = pd.ExcelFile("/home/aoguntuga/myVirtualEnvs/sent2/Test/Test Cases.xlsx")

sent_df = xl_file.parse('Sheet1')

sentences = list(sent_df['Sentence'].values)

polarity_list_1 = []
polarity_list_2 = []

for sentence in sentences:
	syn1 = return_synset_list_1(sentence)
	syn2 = return_synset_list_2(sentence)
	p1 = polarity_score(syn1)
	p2 = polarity_score(syn2)
	polarity_list_1.append(p1)
	polarity_list_2.append(p2)

sent_df['lesk_wsd_polarity'] = polarity_list_1
sent_df['simplelesk_wsd_polarity'] = polarity_list_2

sent_df.to_excel("output_path.xlsx")
Example #8
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import numpy as np
import sklearn as sk
import csv
import re
from geniatagger import GeniaTagger
tagger = GeniaTagger(
    '/home/sunilnew/python_packages/geniatagger-3.0.2/geniatagger')
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
import pickle
from cnn_train import *


def preProcess(sent):
    sent = sent.lower()
    sent = tokenizer.tokenize(sent)
    sent = ' '.join(sent)
    sent = re.sub(r'\d', 'dg', sent)
    sent_list, _, _, _, _ = zip(*tagger.parse(sent))
    sent = ' '.join(sent_list)
    return sent
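# Usage sketch (added; exact output depends on the GENIA tagger's tokenisation):
# preProcess lower-cases the text, maps each digit to "dg", and returns the
# GENIA-retokenised sentence as one space-separated string.
if __name__ == '__main__':
    print(preProcess('Aspirin 75 mg inhibits COX-1 activity.'))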


def find_sub_list(sl, l):
    sll = len(sl)
    for ind in (i for i, e in enumerate(l) if e == sl[0]):
        if l[ind:ind + sll] == sl:
Example #9
from sklearn.feature_extraction import DictVectorizer
from sklearn import datasets
from sklearn import svm
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from nltk.tokenize import WordPunctTokenizer
from sklearn.metrics import classification_report
import nltk
import numpy as np
import re
import pickle
from geniatagger import GeniaTagger
tagger = GeniaTagger('/home/desh/geniatagger-3.0.2/geniatagger')

tokenizer = WordPunctTokenizer()


def preProcess(sent):
    sent = re.sub(r"[-+]?\d*\.\d+|\d+", "num", sent.lower())
    sent = tokenizer.tokenize(sent)
    sent = ' '.join(sent)
    sent_list, _, _, _, _ = zip(*tagger.parse(sent))
    sent = ' '.join(sent_list)
    return sent


def find_sub_list(sl, l):
    sll = len(sl)
Example #10
import glob
import itertools
import pickle

from geniatagger import GeniaTagger

tagger = GeniaTagger('/home/asada.13003/ddi_cnn/geniatagger-3.0.2/geniatagger')

train_path = 'Divide/train/*.ann'
dev_path = 'Divide/dev/*.ann'

p_tr_path = 'Pickle/train'
p_dev_path = 'Pickle/dev'


def sent_label_pe(path):
    sent = []
    label = []
    y = []
    y_minus = []
    id = []
    for i in glob.glob(path):
        f = open(i, 'r')
        f_txt = open(i.replace('Divide', 'Brat').replace('.ann', '.txt'), 'r')

        sentID = i.split('/')[-1].replace('.ann', '')

        line = f.readlines()
        text = f_txt.read()

        entity = []
def loadParsingTools():
	global tagger, lingpipe
	# Currently point to scripts. TODO: use $PATH instead to find them	
	tagger = GeniaTagger('/home/jlever/apps/geniatagger-3.0.1/geniatagger')
	lingpipe = LingPipe('/projects/jlever/megaTextProject/nounphrasePipeline/lingpipeSentenceSplitter/run.sh')
import os
import nltk
from nltk import *
from nltk.corpus import *
import string
from nltk.corpus import brown
from geniatagger import GeniaTagger

#######################################################################

corpusDir = ["/GI_TAGGING/"]
corpusRootList = [os.getcwd() + directory for directory in corpusDir]

codec = "utf8"

tagger = GeniaTagger("/usr/local/geniatagger/geniatagger")

train_sents = brown.tagged_sents(categories="learned", tagset="treebank")

t0 = nltk.DefaultTagger("UNK")
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

# GENIA + Backoff
for corpusRoot in corpusRootList:

    corpusReader = PlaintextCorpusReader(corpusRoot, ".*.txt", encoding=codec)

    outFile = open(corpusRoot + "genia_and_backoff.txt", "w")

    for journal in corpusReader.fileids():
Example #13
from nltk.stem.lancaster import LancasterStemmer
from collections import Counter,defaultdict
import geniatagger
from geniatagger import GeniaTagger
from nltk import word_tokenize, pos_tag
from nltk.tokenize import sent_tokenize
import requests
import json
from linggle_api import Linggle
from classify_error_type import *
# Missing from the snippet: Flask objects used below for the web routes.
from flask import Flask, render_template

GEC_API = 'https://whisky.nlplab.cc/translate/?text={}'


app = Flask(__name__)
tagger = GeniaTagger('/home/nlplab/yeema/geniataggerPython/geniatagger-3.0.2/geniatagger',['-nt'])
ling = Linggle()

dictWord = defaultdict(lambda: defaultdict(list))
phraseV = defaultdict(lambda: defaultdict(list))
dictPhrase = defaultdict(lambda: defaultdict(list))
dictDef = defaultdict(lambda: defaultdict(list))
miniparCol = defaultdict(lambda: defaultdict(lambda: Counter()))
pw = defaultdict(lambda: defaultdict(lambda:Counter()))
pw_ratio = defaultdict(lambda: defaultdict(lambda:Counter()))
LCE = eval(open('/home/nlplab/yeema/ErrorExplaination/LCE.json').read())
dictSimilar = defaultdict()

@app.route('/')	
def index():
    return render_template('template.html')
from nltk.tokenize import PunktSentenceTokenizer
from geniatagger import GeniaTagger

tagger = GeniaTagger('~/qwerty/shashank/geniatagger-3.0.2/geniatagger')
print(tagger.parse('This is a pen.'))
#print(tagger.parse('tis is  pen'))

#print(data)
med_tokenizer = PunktSentenceTokenizer(train_data)

Example #15
def annotate_text(tagger=''):
    genia = GeniaTagger('../genia-tagger/geniatagger-3.0.2/geniatagger')
    medpost = spacy.load(os.path.abspath('trained_tagger'))
    stanford = StanfordCoreNLP('http://localhost:9000')
    main_dir = 'corrected_outcomes'
    data_dir = os.path.abspath(os.path.join(main_dir, 'aggregated'))
    create_storage_dirs([data_dir])

    sub_dir = os.path.abspath(os.path.join(data_dir, 'test'))
    if not os.path.exists(os.path.dirname(sub_dir)):
        os.makedirs(os.path.dirname(sub_dir))

    turker, ebm_extract = e.read_anns('hierarchical_labels', 'outcomes', \
                                      ann_type='aggregated', model_phase='train')

    seq_dir = os.path.abspath(os.path.join(os.path.curdir, 'corrected_outcomes', 'test'))
    create_storage_dirs([seq_dir])
    ebm_csv = []

    start = time.time()

    with open(os.path.join(seq_dir, 'test_medpost.bmes'), 'w') as f:
        for pmid, doc in ebm_extract.items():
            abstract = ' '.join(i for i in doc.tokens)
            #pprint(abstract)
            u = doc.anns['AGGREGATED']
            v = doc.tokens
            o = []
            corr_outcomes = []
            temp, temp_2 = [], []
            t = 0
            m = 0
            o_come = e.print_labeled_spans_2(doc)[0] #extract outcomes from the abstract being examined, [(Outcome-type, Outcome), (Outcome-type, Outcome2)]

            #store the annotations and the index of the annotations for each abstract
            for x in range(len(u)):
                if x == t:
                    if u[x] != 0:
                        for ff in o_come:
                            for j in range(len(u)):
                                if j < len(ff[1].split()):
                                    o.append((t, u[x]))
                                    t += 1
                            break
                        o_come.pop(0)

                        txt_toks = [v[i[0]] for i in o]
                        text_wrds = ' '.join(i for i in txt_toks)

                        corr = correcting_spans.correct_text()
                        text_wrds = corr.statTerm_keyWord_punct_remove(text_wrds)

                        if tagger.lower() == 'genia':
                            tagged = genia.parse(text_wrds)
                            pos = [i[2] for i in tagged]
                        elif tagger.lower() == 'medpost':
                            tagged = medpost(text_wrds)
                            pos = [i.tag_ for i in tagged]
                        elif tagger.lower() == 'stanford':
                            pos = []
                            for elem in word_tokenize(text_wrds):
                                stan = stanford.annotate(elem, properties={'annotators':'pos', 'outputFormat':'json'})
                                pos.append(stan['sentences'][0]['tokens'][0]['pos'])

                        text_pos = ' '.join(i for i in pos)

                        label = core_outcome[u[x]]

                        corrected_spans = corr.pos_co_occurrence_cleaning(text_wrds, text_pos, label)

                        if len(corrected_spans) == 0:
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = [0 for i in range(len(txt_toks))]
                        elif len(corrected_spans) < 2:
                            span = corrected_spans[0]
                            s = [i for i in span[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll
                        else:
                            s = [i for j in corrected_spans for i in j[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll

                        p = [i for i in corrected_spans]
                        if len(p) > 0:
                            for i in p:
                                corr_outcomes.append(i)
                        o.clear()

                    else:
                        t += 1
            if corr_outcomes:
                temp_2 = build_sequence_model(v, u, core_outcome, corr_outcomes)
                qq = 1
                for i in temp_2:
                    print(qq, i)
                    f.write('{}\n'.format(i))
                    qq += 1
                f.write('\n')
                for k in corr_outcomes:
                    ebm_csv.append(k)
        ebm_csv_df = pd.DataFrame(ebm_csv, columns=['Label','Outcome'])
        ebm_csv_df.to_csv(os.path.join(os.path.abspath(os.path.curdir), 'corrected_outcomes/test/labels_outcomes_medpost.csv'))
        f.close()
    print("Duration {}".format(time.time() - start))