Ejemplo n.º 1
0
    def __init__(self, s=None):
        """ @param[in] s optional SpellSuggestion object to use
                         for the spell checking; a fresh
                         SpellSuggestion() is created if omitted
        """
        WordCleanupModule.__init__(self)

        # "is None" (identity test) per PEP 8 -- "== None" could be
        # hijacked by a custom __eq__ on the passed object
        if s is None:
            self.s = SpellSuggestion()
        else:
            self.s = s
Ejemplo n.º 2
0
Archivo: text.py Proyecto: k3njiy/ewrt
    def __init__(self, s=None):
        """ @param[in] s optional SpellSuggestion object to use
                         for the spell checking; a fresh
                         SpellSuggestion() is created if omitted
        """
        WordCleanupModule.__init__(self)

        # use an identity test against None (PEP 8); "s==None" is both
        # non-idiomatic and unreliable if __eq__ is overridden
        self.s = SpellSuggestion() if s is None else s
Ejemplo n.º 3
0
class FixSpelling(WordCleanupModule):
    """ @class FixSpelling
        fixes spelling mistakes using a SpellSuggestion backend """

    def __init__(self, s=None):
        """ @param[in] s optional SpellSuggestion object to use
                         for the spell checking; a fresh
                         SpellSuggestion() is created if omitted
        """
        WordCleanupModule.__init__(self)

        # identity comparison per PEP 8 ("is None", not "== None")
        if s is None:
            self.s = SpellSuggestion()
        else:
            self.s = s

    def __call__(self, l):
        """ @param[in] l a list of words
            @returns the list with each word replaced by the suggested
                     correction (element [1] of SpellSuggestion.correct)
        """
        return [self.s.correct(w)[1] for w in l]

    def numMistakesFixed(self, l):
        """ @param[in] l a list of words
            @returns the number of mistakes fixed by the
                     spelling module """
        # count directly instead of materializing a throw-away list
        return sum(1 for w in l if self.s.correct(w)[1] != w)
Ejemplo n.º 4
0
Archivo: text.py Proyecto: k3njiy/ewrt
class FixSpelling(WordCleanupModule):
    """ @class FixSpelling
        fixes spelling mistakes using a SpellSuggestion backend """

    def __init__(self, s=None):
        """ @param[in] s optional SpellSuggestion object to use
                         for the spell checking; a fresh
                         SpellSuggestion() is created if omitted
        """
        WordCleanupModule.__init__(self)

        # "is None" identity test per PEP 8 (was "s==None")
        self.s = SpellSuggestion() if s is None else s

    def __call__(self, l):
        """ @param[in] l a list of words
            @returns the list with each word replaced by the suggested
                     correction (element [1] of SpellSuggestion.correct)
        """
        return [self.s.correct(w)[1] for w in l]

    def numMistakesFixed(self, l):
        """ @param[in] l a list of words
            @returns the number of mistakes fixed by the
                     spelling module """
        # count matches lazily instead of building an intermediate list
        return sum(1 for w in l if self.s.correct(w)[1] != w)
Ejemplo n.º 5
0
from rdflib import Namespace
from collections import defaultdict
from itertools import izip_longest
from operator import itemgetter
from csv import writer

# a directory containing all cxl ontology files
ONTOLOGY_DIR = "/home/albert/data/ac/research/inwork/pakdd2011-ontology-evaluation/data/ontologies/risk/week2"

# required namespaces
NS_RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
NS_WL   = Namespace("http://www.weblyzard.com/2005/03/31/wl#")

# cleanup pipeline
# bzip2-compressed text corpus used to train the domain-specific spell
# checker (NOTE(review): path is relative to the current working dir)
CUSTOM_RISK_CORPUS = "risk-corpus.text.bz2"
s = SpellSuggestion()
s.verbose=True
# train the spell checker on the word list extracted from the corpus
s.train( SpellSuggestion.words( BZ2File( CUSTOM_RISK_CORPUS ).read() ) )

# compile cleanup queue

# string-level cleanup: replace the multiplication sign (u'\xd7') with a
# space, then lower-case (Python 2: relies on the `unicode` builtin),
# strip possessives and fix dash spacing
strCleanupPipe = (lambda s:s.replace(u'\xd7', " "), unicode.lower, RemovePossessive(), FixDashSpace() )
# phrase-level cleanup: split enumerations, multi-terms and bracketed
# explanations into separate phrases
phrCleanupPipe = (SplitEnumerations(), SplitMultiTerms(), SplitBracketExplanations() )
# word-level cleanup: spelling fixes (using the corpus-trained checker
# above) followed by punctuation/bracket removal
fs = FixSpelling(s)
wrdCleanupPipe = (fs, RemovePunctationAndBrackets(),)
phraseCleanup = PhraseCleanup(strCleanupPipe, phrCleanupPipe, wrdCleanupPipe )


def extractSPO(rdfOntology):
    """ extracts a set of all relations present in the given ontology
        @param[in] rdfOntology    the rdflib.Graph object representing the ontology
Ejemplo n.º 6
0
from collections import defaultdict
from itertools import izip_longest
from operator import itemgetter
from csv import writer

# a directory containing all cxl ontology files
ONTOLOGY_DIR            = "/home/albert/data/ac/research/inwork/pakdd2011-ontology-evaluation/data/ontologies/risk/week2"
# text file listing the concepts considered most important
# (presumably one term per line -- verify against the consumer)
IMPORTANT_CONCEPTS_LIST = "top-terms.text"

# required namespaces
NS_RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
NS_WL   = Namespace("http://www.weblyzard.com/2005/03/31/wl#")

# cleanup pipeline
# bzip2-compressed text corpus used to train the domain-specific spell
# checker (NOTE(review): path is relative to the current working dir)
CUSTOM_RISK_CORPUS = "risk-corpus.text.bz2"
s = SpellSuggestion()
s.verbose=True
# train the spell checker on the word list extracted from the corpus
s.train( SpellSuggestion.words( BZ2File( CUSTOM_RISK_CORPUS ).read() ) )

# compile cleanup queue

# string-level cleanup: replace the multiplication sign (u'\xd7') with a
# space, then lower-case (Python 2: relies on the `unicode` builtin),
# strip possessives and fix dash spacing
strCleanupPipe = (lambda s:s.replace(u'\xd7', " "), unicode.lower, RemovePossessive(), FixDashSpace() )
# phrase-level cleanup: split enumerations, multi-terms and bracketed
# explanations into separate phrases
phrCleanupPipe = (SplitEnumerations(), SplitMultiTerms(), SplitBracketExplanations() )
# word-level cleanup: spelling fixes (using the corpus-trained checker
# above) followed by punctuation/bracket removal
fs = FixSpelling(s)
wrdCleanupPipe = (fs, RemovePunctationAndBrackets(),)
phraseCleanup = PhraseCleanup(strCleanupPipe, phrCleanupPipe, wrdCleanupPipe )


def extractSPO(rdfOntology):
    """ extracts a set of all relations present in the given ontology
        @param[in] rdfOntology    the rdflib.Graph object representing the ontology
Ejemplo n.º 7
0
from csv import writer

from eWRT.input.conv.cxl import XCL2RDF
from eWRT.input.clean.text import *
from eWRT.stat.string.spelling import SpellSuggestion

# a directory containing all cxl ontology files
# ONTOLOGY_DIR = "/home/albert/data/ac/research/inwork/pakdd2011-ontology-evaluation/data/ontologies/risk/week2"

# required namespaces
NS_RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
NS_WL = Namespace("http://www.weblyzard.com/2005/03/31/wl#")

# cleanup pipeline
# bzip2-compressed text corpus used to train the domain-specific spell
# checker (NOTE(review): path is relative to the current working dir)
CUSTOM_RISK_CORPUS = "risk-corpus.text.bz2"
s = SpellSuggestion()
s.verbose = True
# train the spell checker on the word list extracted from the corpus
s.train(SpellSuggestion.words(BZ2File(CUSTOM_RISK_CORPUS).read()))

# compile cleanup queue

# string-level cleanup: replace the multiplication sign (u'\xd7') with a
# space, lower-case (Python 3 flavour of this snippet: uses str.lower),
# strip possessives and fix dash spacing
strCleanupPipe = (lambda s: s.replace(u'\xd7', " "), str.lower,
                  RemovePossessive(), FixDashSpace())
# phrase-level cleanup: split enumerations, multi-terms and bracketed
# explanations into separate phrases
phrCleanupPipe = (SplitEnumerations(), SplitMultiTerms(),
                  SplitBracketExplanations())
# word-level cleanup: spelling fixes (using the corpus-trained checker
# above) followed by punctuation/bracket removal
fs = FixSpelling(s)
wrdCleanupPipe = (
    fs,
    RemovePunctationAndBrackets(),
)
phraseCleanup = PhraseCleanup(strCleanupPipe, phrCleanupPipe, wrdCleanupPipe)
Ejemplo n.º 8
0
import logging
# log everything (DEBUG and up) for the term-test run to a fixed file
logging.basicConfig(filename="/tmp/termtest.log",level=logging.DEBUG)
log = logging.getLogger("geoTEF.examples.ontology.termTest")

# required namespaces
NS_RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
NS_WL   = Namespace("http://www.weblyzard.com/2005/03/31/wl#")

SOURCE_DIR         = "./source"
RESULT_DIR         = "./result"
# bzip2-compressed text corpus used to train the spell checker
CUSTOM_RISK_CORPUS = "risk-corpus.text.bz2"

# basic concept cleanup
# strip possessive "'s", collapse whitespace runs to single spaces and
# lower-case the concept label
cleanup = lambda c: " ".join(c.replace("'s", "").split() ).lower()

# compile customized spelling suggestions
s = SpellSuggestion()
s.verbose=True
# train the spell checker on the word list extracted from the corpus
s.train( SpellSuggestion.words( BZ2File( CUSTOM_RISK_CORPUS ).read() ) )

# compile cleanup queue

# string-level cleanup (Python 2: relies on the `unicode` builtin)
strCleanupPipe = (unicode.lower, RemovePossessive(), FixDashSpace() )
# phrase-level cleanup: split enumerations, multi-terms and bracketed
# explanations into separate phrases
phrCleanupPipe = (SplitEnumerations(), SplitMultiTerms(), SplitBracketExplanations() )
# word-level cleanup; NOTE(review): uses an untrained FixSpelling()
# instead of the corpus-trained `s` above -- possibly unintended
wrdCleanupPipe = (FixSpelling(), RemovePunctationAndBrackets(),)
phraseCleanup = PhraseCleanup(strCleanupPipe, phrCleanupPipe, wrdCleanupPipe )

def extractConceptSet(rdfOntology):
    """ extracts a set of all concepts present in the given ontology
        @param[in] rdfOntology    the rdflib.Graph object representing the ontology
        @returns a set of all concepts present in the given ontology 
    """
# logging
import logging
# overwrite (filemode="w") the evaluator log on every run, DEBUG and up
logging.basicConfig(filename="/tmp/evaluator.log", filemode="w", level=logging.DEBUG)
log = logging.getLogger("geoTEF.examples.ontology.evaluator")

# required namespaces
NS_RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
NS_WL   = Namespace("http://www.weblyzard.com/2005/03/31/wl#")

SOURCE_DIR = "./source"
RESULT_DIR = "./result"
# bzip2-compressed text corpus used to train the spell checker
CUSTOM_RISK_CORPUS = "risk-corpus.text.bz2"
PMI_CUTOFF_LEVEL   = 0.5

# compile customized spelling suggestions
s = SpellSuggestion()
s.verbose=True
# train the spell checker on the word list extracted from the corpus
s.train( SpellSuggestion.words( BZ2File( CUSTOM_RISK_CORPUS ).read() ) )

# compile cleanup queue

# cleans up unicode characters used in the concept names
# FIX(review): the original source contained a raw character broken
# across two lines inside the string literal (a syntax error); given
# the helper's name "cleanXA" it is restored here as the non-breaking
# space u'\xa0' -- TODO confirm against the upstream file
cleanXA = lambda x: x.replace(u"\xa0", " ")

# string-level cleanup (Python 2: relies on the `unicode` builtin)
strCleanupPipe = (unicode.lower, cleanXA, RemovePossessive(), FixDashSpace() )
# phrase-level cleanup: split enumerations, multi-terms and bracketed
# explanations into separate phrases
phrCleanupPipe = (SplitEnumerations(), SplitMultiTerms(), SplitBracketExplanations() )
# word-level cleanup; NOTE(review): uses an untrained FixSpelling()
# instead of the corpus-trained `s` above -- possibly unintended
wrdCleanupPipe = (FixSpelling(), RemovePunctationAndBrackets(),)
phraseCleanup = PhraseCleanup(strCleanupPipe, phrCleanupPipe, wrdCleanupPipe )


def extractSPO(rdfOntology):
    """ extracts a set of all relations present in the given ontology