Example #1
0
def main():
    """Tag every sentence of the input file and save "word tag" lines.

    Loads the weight map from the model file, splits the input into
    sentences with u.GetNextSentence, runs ViterbiGLM on each sentence,
    and writes one "word tag" line per token to the output file, followed
    by two empty trailing entries.

    A sentence whose tag sequence length differs from its token count is
    left out of the output entirely.
    """
    print("%s %s" % (u.now(), "Start"))

    modelFile = "tag.model"

    # Active configuration. The alternative pair used with the dev set is
    # ("gene.dev", "dev.p1.out").
    inputFile = "gene.test"
    outputFile = "gene_test.p1.out"

    weightVector = readPreTrainedModel(modelFile)

    lines = u.readFromFile(inputFile)

    tagSequences = []

    while len(lines) > 0:
        sentence, lines = u.GetNextSentence(lines)

        # A fresh FeatureData is built per sentence, as in the original;
        # whether it carries per-sentence state is not visible from here.
        featureData = FeatureData(weightVector)
        tagSequence = ViterbiGLM(featureData, sentence)

        tagSequences.append((tagSequence, sentence))

    # Drop the leading element of the first sentence and of its tag
    # sequence. Guarded so that an input with no sentences no longer
    # raises IndexError.
    if tagSequences:
        firstTags, firstWords = tagSequences[0]
        del firstTags[0]
        del firstWords[0]

    outputText = []

    for tagSequence, sentence in tagSequences:
        # The length comparison does not depend on the token index, so it
        # is checked once per sentence rather than once per token.
        if len(tagSequence) != len(sentence):
            continue

        for word, tag in zip(sentence, tagSequence):
            outputText.append("%s %s" % (word, tag))

    # Two empty entries so the joined text ends with a blank line.
    outputText.append("")
    outputText.append("")

    u.saveToFile(outputFile, '\n'.join(outputText))

    print("%s %s" % (u.now(), "Done"))
Example #2
0
def main():
    """Run perceptron() on the training set read from "gene.train".

    Starts from an empty weight dict wrapped in a FeatureData and passes it,
    together with the parsed training set, to perceptron().

    NOTE(review): nothing is saved by this function itself; whether
    perceptron() persists the resulting weights is not visible from here.
    """
    print("%s %s" % (u.now(), "Start"))

    trainingFile = "gene.train"

    # Empty weight vector means a vector of 0s
    weightVector = dict()

    featureData = FeatureData(weightVector)

    trainingSet = readTrainingSetFile(trainingFile)

    # The third argument is presumably the number of training passes --
    # confirm against perceptron()'s definition.
    perceptron(trainingSet, featureData, 5)

    print("%s %s" % (u.now(), "Done"))
Example #3
0
def readPreTrainedModel(modelFilename):
    """Read a model file into a dict mapping the first token to a float.

    Each line is expected to hold exactly two whitespace-separated tokens:
    a key and a value parseable by float().

    Any line that does not have exactly two tokens is reported on stdout.
    Lines with fewer than two tokens (for example blank lines) are then
    skipped, because there is no value to parse; previously they raised
    IndexError. Lines with more than two tokens are still loaded from their
    first two tokens, as before.

    Parameters:
        modelFilename: path handed to u.readFromFile.

    Returns:
        dict of {first token: float(second token)}.

    Raises:
        ValueError: if the second token of a line is not a valid float.
    """
    vMap = dict()

    for i, line in enumerate(u.readFromFile(modelFilename)):
        tokens = line.split()

        if len(tokens) != 2:
            print("Invalid line %s: %s" % (i, line))

            if len(tokens) < 2:
                # No key/value pair to read from this line.
                continue

        vMap[tokens[0]] = float(tokens[1])

    return vMap
Example #4
0
def readTrainingSetFile(filename):
    """Parse a training file into a list of (x, y) pairs, one per sentence.

    Sentences are pulled off the file's lines with u.GetNextSentence. The
    first element of each sentence is ignored; every remaining entry is
    split on whitespace, its first token going into x and its second
    into y (presumably the word and its tag -- confirm against the data).

    Parameters:
        filename: path handed to u.readFromFile.

    Returns:
        list of (x, y) tuples, where x and y are parallel lists of strings.
    """
    remaining = u.readFromFile(filename)
    examples = []

    while len(remaining) > 0:
        rawSentence, remaining = u.GetNextSentence(remaining)

        # Skip the leading element, then tokenise each remaining entry.
        fields = [entry.split() for entry in rawSentence[1:]]

        firstTokens = [parts[0] for parts in fields]
        secondTokens = [parts[1] for parts in fields]

        examples.append((firstTokens, secondTokens))

    return examples