def setupDataSet(dataPath, eventsFile, converters, includeAll):
    """
    Preps the data for learning
    """
    #read the event annotations
    events = readEvents(eventsFile)

    #read the data
    rawData, labels = createInstances(readDocs(dataPath, events), events, includeAll)

    #vectorize it
    data = v.vectorize(rawData, converters)

    return data, labels, events
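#A minimal usage sketch (hypothetical converter objects and paths, not part
#of the module): the returned matrix and labels feed any downstream classifier.
#
#   converters = [v.WindowConverter(), v.LemmaConverter()]  #hypothetical
#   data, labels, events = setupDataSet("data/docs/", "data/training.csv",
#                                       converters, includeAll=True)
#   model.fit(data, labels)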
def setupDataSet(dataPath, eventsFile, windowConv, contextConvs):
    """
    Preps the data for learning
    """
    #read the event annotations
    events = readEvents(eventsFile)

    #read the data
    rawData, labels = createInstances(readDocs(dataPath, events), events)

    #vectorize the window around each instance
    left = n.array([windowConv.convert(i) for i in rawData])

    #vectorize the context
    right = vectorize(rawData, contextConvs)

    return (left, right), labels, [i.event for i in rawData]
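#Usage sketch (hypothetical converter objects): this variant returns two
#arrays, one for the window branch and one for the context branch of a
#two-input model.
#
#   (left, right), labels, events = setupDataSet(dataPath, eventsFile,
#                                                windowConv, contextConvs)
#   model.fit([left, right], labels)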
def makeSequences(dataPath, eventsFile, entityFile, converters, wordConv, eventMap):
    """
    Makes a data set comprised of sequences
    """
    #NOTE: use BIO tagging of entities
    bio = True

    #read the event annotations
    events = readEvents(eventsFile)

    #read the entities
    entities = readEntities(entityFile)

    #read the data
    rawData, labels = createSequenceInstances(readDocs(dataPath, events), events, entities, eventMap, bio)

    #vectorize the word sequences and the remaining features
    words = vectorizeWordSequences(rawData, [wordConv])
    vec = vectorizeSequences(rawData, converters)

    return (words, vec), pad_sequences(labels, maxlen=c.maxLen, value=eventMap.nilIndex()), events, [s.toTag() for s in rawData]
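#Usage sketch (hypothetical arguments): labels come back padded to c.maxLen
#with the nil event index, ready for a sequence tagger.
#
#   (words, vec), labels, events, tags = makeSequences(dataPath, eventsFile,
#       entityFile, converters, wordConv, eventMap)
#   model.fit([words, vec], labels)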
def setupDataSet(dataPath, eventsFile, windowConv):
    """
    Preps the data for learning
    """
    #read the event annotations
    events = readEvents(eventsFile)

    #read the data
    docs = readDocs(dataPath, events)

    #make instances
    rawData, _ = createInstances(docs, events)

    #look for potential realis instances
    realisInsts, labels = matchInstances(docs, rawData)

    #vectorize it
    left = n.array([windowConv.convert(i) for i in realisInsts])

    return left, labels, [i.event for i in realisInsts]
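#Usage sketch (hypothetical window converter): this variant vectorizes only
#the window around each matched realis instance.
#
#   left, labels, events = setupDataSet(dataPath, eventsFile, windowConv)
#   model.fit(left, labels)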
#!/usr/bin/env python

from annotation import readEvents, readDocs

dataPath = "/home/walker/Data/ace/full_annotated2/"
eventsFile = "data/training.csv"

#read the event annotations
events = readEvents(eventsFile)
docs = readDocs(dataPath, events)

#collect the unique lemmas over all the documents
vocab = set()
for doc in docs:
    for token in doc.tokens():
        vocab.add(token.lemma)

#print the vocabulary, skipping lemmas that cannot be encoded
for lemma in vocab:
    try:
        print(lemma)
    except UnicodeEncodeError:
        pass
import numpy as n
from pickle import dump

from ml.dependency import loadDocuments
from annotation import readEvents, TokenIndex
import config as c
import vectorize as v

current = 2
index = TokenIndex()
w2vFile = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"

#read the training events
events = readEvents(c.trainingFile)

#get the ids of the training docs
trainingDocs = set([e.docId for e in events])

#load the w2v weights
w2v = v.loadW2V(w2vFile)

#load the docs
for doc in loadDocuments(c.dataPath):
    #for all the training docs collect all the vocab
    if doc.id in trainingDocs:
        print("Doc id {}".format(doc.id))