Esempio n. 1
0
def setupDataSet(dataPath, eventsFile, converters, includeAll):
    """
    Build the vectorized data set used for learning.

    The event annotations are read from eventsFile and the documents
    from dataPath; both are handed to createInstances (together with
    includeAll) to obtain the raw instances and their labels, and the
    raw instances are then vectorized with converters.

    Returns a (data, labels, events) tuple.
    """
    # annotations come first: the document reader and the instance
    # builder both take them as an argument
    events = readEvents(eventsFile)
    docs = readDocs(dataPath, events)

    # raw (un-vectorized) instances plus their labels
    instances, labels = createInstances(docs, events, includeAll)

    # hand back the vectorized instances alongside labels and events
    return v.vectorize(instances, converters), labels, events
Esempio n. 2
0
def setupDataSet(dataPath, eventsFile, windowConv, contextConvs):
    """
    Build the two-part data set used for learning.

    Every raw instance is converted twice: once with windowConv (the
    results are collected into an array) and once, in bulk, through
    vectorize with contextConvs.

    Returns ((left, right), labels, instanceEvents), where left is the
    windowConv array, right is the vectorize output and instanceEvents
    holds the .event attribute of each raw instance.
    """
    # event annotations, then the documents, then the raw instances
    events = readEvents(eventsFile)
    docs = readDocs(dataPath, events)
    instances, labels = createInstances(docs, events)

    # per-instance conversion with the window converter
    windows = n.array([windowConv.convert(inst) for inst in instances])

    # bulk vectorization with the context converters
    context = vectorize(instances, contextConvs)

    # the event each instance carries, in instance order
    instanceEvents = [inst.event for inst in instances]

    return (windows, context), labels, instanceEvents
Esempio n. 3
0
def makeSequences(dataPath, eventsFile, entityFile, converters, wordConv, eventMap):
    """
    Build a data set made of sequences.

    Events are read from eventsFile, entities from entityFile and the
    documents from dataPath; createSequenceInstances turns them into raw
    sequences and labels (BIO tagging of entities is always requested).

    Returns a 4-tuple:
      - (words, vec): the sequences vectorized with [wordConv] and with
        converters respectively
      - the labels passed through pad_sequences with maxlen=c.maxLen and
        value=eventMap.nilIndex()
      - the events that were read
      - toTag() of every raw sequence
    """
    # NOTE: entities are BIO tagged
    useBio = True

    # annotations and entities are loaded before the documents
    events = readEvents(eventsFile)
    entities = readEntities(entityFile)
    docs = readDocs(dataPath, events)

    sequences, labels = createSequenceInstances(docs, events, entities, eventMap, useBio)

    # two vectorized views of the same sequences
    words = vectorizeWordSequences(sequences, [wordConv])
    vec = vectorizeSequences(sequences, converters)

    # pad the label sequences, filling with the map's nil index
    paddedLabels = pad_sequences(
        labels,
        maxlen=c.maxLen,
        value=eventMap.nilIndex(),
    )
    tags = [seq.toTag() for seq in sequences]

    return (words, vec), paddedLabels, events, tags
Esempio n. 4
0
def setupDataSet(dataPath, eventsFile, windowConv):
    """
    Build the data set of potential realis instances for learning.

    Instances are created from the documents and events, matched
    against the documents with matchInstances, and each matched
    instance is converted with windowConv.

    Returns (left, labels, instanceEvents): the array of converted
    instances, the labels produced by matchInstances, and the .event
    attribute of every matched instance.
    """
    # event annotations and the documents read with them
    events = readEvents(eventsFile)
    docs = readDocs(dataPath, events)

    # only the instances are needed here; the labels from
    # createInstances are discarded
    instances, _unusedLabels = createInstances(docs, events)

    # candidate realis instances and their labels
    matched, labels = matchInstances(docs, instances)

    # convert every matched instance with the window converter
    converted = n.array([windowConv.convert(inst) for inst in matched])
    instanceEvents = [inst.event for inst in matched]

    return converted, labels, instanceEvents
#!/usr/bin/env python

from annotation import readEvents, readDocs

dataPath = "/home/walker/Data/ace/full_annotated2/"
eventsFile = "data/training.csv"

# read the event annotations
events = readEvents(eventsFile)

# read the documents (the reader is given the events as well)
docs = readDocs(dataPath, events)

# collect the distinct lemmas over every token of every document
vocab = set()

for doc in docs:
    for token in doc.tokens():
        vocab.add(token.lemma)

# print the vocabulary, one lemma per line
for lemma in vocab:
    try:
        print(lemma)
    except UnicodeError:
        # best effort: skip a lemma the output stream cannot encode rather
        # than abort the dump. Only encoding failures are ignored, so
        # Ctrl-C and genuine errors are no longer silently swallowed.
        continue
import numpy as n
from pickle import dump

from ml.dependency import loadDocuments
from annotation import readEvents, TokenIndex
import config as c
import vectorize as v

current = 2
index = TokenIndex()

w2vFile = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"

# load the event annotations of the training set
events = readEvents(c.trainingFile)

# distinct ids of the documents the training events belong to
trainingDocs = {event.docId for event in events}

# load the w2v weights from the file named above
w2v = v.loadW2V(w2vFile)

#load the docs
for doc in loadDocuments(c.dataPath):

    #for all the training docs collect all the vocab
    if doc.id in trainingDocs:

        print("Doc id {}".format(doc.id))