Example 1
import difflib


def getDifflibOrPyLev(seq2=None, junk=None, forceDifflib=False):
    '''
    returns either a difflib.SequenceMatcher or pyLevenshtein StringMatcher.StringMatcher
    object depending on what is installed.
    
    If forceDifflib is True, use difflib even if pyLevenshtein is installed.
    '''

    if forceDifflib is True:
        smObject = difflib.SequenceMatcher(junk, '', seq2)
    else:
        try:
            import StringMatcher as pyLevenshtein
            smObject = pyLevenshtein.StringMatcher(junk, '', seq2)
        except ImportError:
            smObject = difflib.SequenceMatcher(junk, '', seq2)

    return smObject
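
A minimal usage sketch (hypothetical strings; forceDifflib=True keeps it on the stdlib difflib path). The returned object supports the usual SequenceMatcher interface, so the first sequence can be supplied later with set_seq1():

sm = getDifflibOrPyLev('target', forceDifflib=True)   # second sequence is fixed at construction
sm.set_seq1('targit')                                  # supply the first sequence afterwards
print(sm.ratio())                                      # similarity ratio in [0.0, 1.0]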
Example 2
    def runRule(self, buf, index, r):
        curindex = index
        sm = StringMatcher()
        dm = DateMatcher()
        tm = TimeMatcher()
        ri = RuleInstance(r)
        if r.getType() == 'stringmatch':
            p = sm.seek_until_keys(buf[curindex:], r.getKeys(),
                                   r.getExpectedConfidence())
            if p[0] != -1:  # Found something
                ri.addMatchedKeys(r.getType(), [
                    p[1], p[2], curindex + p[0], curindex + (p[0] + len(p[1]))
                ])
                curindex += (p[0] + len(p[1]))
        elif r.getType() == 'datematch':
            p = dm.locate_date(buf[curindex:], r.getExpectedConfidence())
            if p[0] != -1:  # Found a date
                condate = ""
                for v in p[1]:
                    condate += v[0] + " "

                ri.addMatchedKeys(r.getType(), [
                    p[1], p[2], curindex + p[0], curindex + p[0] + len(condate)
                ])
                curindex += p[0]
        elif r.getType() == 'timematch':
            p = tm.locate_time(buf[curindex:], r.getExpectedConfidence())
            if p[0] != -1:  # Found a time
                contime = ""
                for v in p[1]:
                    contime += v[0] + " "
                print "Time offset: ", curindex, " and ", p[
                    0], " = ", curindex + p[0] + len(contime)
                ri.addMatchedKeys(r.getType(), [
                    p[1], p[2], curindex + p[0], curindex + p[0] + len(contime)
                ])
                curindex += p[0]

        return (curindex, ri)
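
A hypothetical driver loop (the runRules name is not part of the excerpt) showing how the returned (curindex, ri) pair threads the scan position from one rule to the next:

    def runRules(self, buf, rules):
        # Hypothetical helper: apply each rule in order, resuming where the previous match ended.
        index = 0
        instances = []
        for r in rules:
            index, ri = self.runRule(buf, index, r)
            instances.append(ri)
        return instances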
Example 3
from RuleSet import *
from Rule import *
from RuleInstance import *
from SpellChecker import *
from DateParser import *
from Record import *
from XLSProcessor import *
from HOCRParser import *
from kitchen.text.converters import getwriter
from time import strftime
import sys
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

xls = XLSProcessor('out.xls')
xls.save()
s = StringMatcher()

#  Could be something like this
#
#                             programtype
#                           >0.9 /    \  <0.9
#                            date    fail
#                      >0.7 /   \  <0.7
#                        time   fail
#                      / | |  \
#                             fail
#
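
# A self-contained illustration of the confidence-gated chain sketched above,
# using plain difflib ratios as stand-ins for this project's matchers (only the
# 0.9 and 0.7 thresholds come from the comment; the names below are hypothetical).
import difflib

def confidence(text, target):
    # Similarity in [0, 1]; a stand-in for the StringMatcher/DateMatcher scores.
    return difflib.SequenceMatcher(None, text, target).ratio()

def classify(program_token, date_token):
    # Each step proceeds only if its confidence clears the threshold from the sketch.
    if confidence(program_token, 'programtype') < 0.9:
        return 'fail'
    if confidence(date_token, 'date') < 0.7:
        return 'fail'
    return 'time'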



Example 4
    def __init__(self, lang, filePath, hotFilePath, muti_name_file, att_file,
                 batch_size, cache_size, maxSampCount, shuffle, word_vocab,
                 kb_vocab, kbp_type_vocab, kb_type_vocab):
        self.lang = lang
        assert self.lang in ('ENG', 'CMN', 'SPA')
        self.doc_avg_dis = {}
        self.doc_max_dis = {}
        # parameter set 1: per-language distance thresholds
        self.doc_avg_dis['ENG'] = 0.0445
        self.doc_avg_dis['CMN'] = 0
        self.doc_avg_dis['SPA'] = 0.1397
        self.doc_max_dis['ENG'] = 0.2
        self.doc_max_dis['CMN'] = 0.8
        self.doc_max_dis['SPA'] = 0.3

        # parameter set 2: alternative thresholds (kept commented out)
        #self.doc_avg_dis['ENG'] = 0.0
        #self.doc_avg_dis['CMN'] = 0.0
        #self.doc_avg_dis['SPA'] = 0.1397
        #self.doc_max_dis['ENG'] = 1.0
        #self.doc_max_dis['CMN'] = 1.0
        #self.doc_max_dis['SPA'] = 0.3

        self.filePath = filePath
        self.hotFilePath = hotFilePath
        self.mutiNamePath = muti_name_file
        self.att_file = att_file
        self.shuffle = shuffle
        self.stopWord = {}
        self.wikiContext = {}
        self.docContext = {}
        self.wikiContextIDs = {}
        self.docContextIDs = {}
        self.wordsINDoc = {}
        self.wordsINWiki = {}
        self.hotValue = {}
        self.samples = []
        self.candAttWord = {}
        self.mutiName = {}

        self.word_vocab = word_vocab
        self.kb_vocab = kb_vocab
        self.kbp_type_vocab = kbp_type_vocab
        self.kb_type_vocab = kb_type_vocab

        self.batch_size = batch_size
        self.cache_size = cache_size
        self.FileEnd = 0
        self.file = None  # placeholder for the data file handle
        self.samples_count = 0
        self.stringMatcher = StringMatcher.StringMatcher()
        self.docMatcher = DocMatcher.DocMatcher()

        self.maxSampCount = maxSampCount
        self.group = None

        if (self.lang == 'ENG'):
            self.docMatcher._loadIDF('../file/idf.txt', self.word_vocab)
            self.__loadStopWord('../file/StopWord.txt')
            self.__readDocWikiContext(self.filePath)
            self.__loadHotFile(self.hotFilePath)
            self.__loadMutiName(self.mutiNamePath)
        elif (self.lang == 'CMN'):
            self.docMatcher._loadIDF('../data/IDF.txt', self.word_vocab)
            self.__readDocWikiContext(self.filePath)
            self.__loadHotFile(self.hotFilePath)
            self.__loadMutiName(self.mutiNamePath)
        elif (self.lang == 'SPA'):
            self.docMatcher._loadIDF('../file/idf.txt', self.word_vocab)
            self.__readDocWikiContext(self.filePath)
            self.__loadHotFile(self.hotFilePath)
            self.__loadMutiName(self.mutiNamePath)

        self.reset()
Example 5
    def statistic_similarity(self, paper, min_similarity):
        """Function that splits the paper text in n-grams (unigrams,bigrams,trigrams)
        and with a Levenshtein it check the similarity for each of them with the topics in the ontology.

        Args:
            paper (string): The paper to analyse. At this stage it is a string.
            cso (dictionary): the ontology previously loaded from the file.
            min_similarity (integer): minimum Levenshtein similarity between the n-gram and the topics within the CSO. 

        Returns:
            found_topics (dictionary): containing the found topics with their similarity and the n-gram analysed.
        """

        # analysing grams
        found_topics = {}

        idx = 0
        trigrams = ngrams(word_tokenize(paper), 3)
        matched_trigrams = []
        for grams in trigrams:
            idx += 1
            gram = " ".join(grams)
            topics = [
                key for key, _ in self.cso['topics'].items()
                if key.startswith(gram[:4])
            ]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic,
                                                   self.cso['primary_labels'])
                    if topic in found_topics:
                        found_topics[topic].append({
                            'matched': gram,
                            'similarity': m
                        })
                    else:
                        found_topics[topic] = [{
                            'matched': gram,
                            'similarity': m
                        }]
                    matched_trigrams.append(idx)

        idx = 0
        bigrams = ngrams(word_tokenize(paper), 2)
        matched_bigrams = []
        for grams in bigrams:
            idx += 1
            if (idx not in matched_trigrams) and ((idx - 1)
                                                  not in matched_trigrams):
                gram = " ".join(grams)
                topics = [
                    key for key, _ in self.cso['topics'].items()
                    if key.startswith(gram[:4])
                ]
                for topic in topics:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(
                            topic, self.cso['primary_labels'])
                        if topic in found_topics:
                            found_topics[topic].append({
                                'matched': gram,
                                'similarity': m
                            })
                        else:
                            found_topics[topic] = [{
                                'matched': gram,
                                'similarity': m
                            }]
                        matched_bigrams.append(idx)

        idx = 0
        unigrams = ngrams(word_tokenize(paper), 1)
        for grams in unigrams:
            idx += 1
            if (idx not in matched_trigrams
                    and (idx - 1) not in matched_trigrams
                    and idx not in matched_bigrams
                    and (idx - 1) not in matched_bigrams):
                gram = " ".join(grams)
                topics = [
                    key for key, _ in self.cso['topics'].items()
                    if key.startswith(gram[:4])
                ]
                for topic in topics:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(
                            topic, self.cso['primary_labels'])
                        if topic in found_topics:
                            found_topics[topic].append({
                                'matched': gram,
                                'similarity': m
                            })
                        else:
                            found_topics[topic] = [{
                                'matched': gram,
                                'similarity': m
                            }]

        return found_topics
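
A hypothetical call, assuming an already-constructed classifier instance whose self.cso dictionary has been loaded as the method expects:

abstract = ("We study ontology matching and semantic web techniques "
            "for integrating heterogeneous knowledge bases.")
found = classifier.statistic_similarity(abstract, min_similarity=0.94)
for topic, matches in found.items():
    for m in matches:
        print(topic, '<-', m['matched'], round(m['similarity'], 3))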