def runRule(self, buf, index, r):
    curindex = index
    sm = StringMatcher()
    dm = DateMatcher()
    tm = TimeMatcher()
    ri = RuleInstance(r)
    if r.getType() == 'stringmatch':
        p = sm.seek_until_keys(buf[curindex:], r.getKeys(), r.getExpectedConfidence())
        if p[0] != -1:  # Found something
            ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+(p[0]+len(p[1]))])
            curindex += (p[0]+len(p[1]))
    elif r.getType() == 'datematch':
        p = dm.locate_date(buf[curindex:], r.getExpectedConfidence())
        if p[0] != -1:
            condate = ""
            for v in p[1]:
                condate += v[0]+" "
            ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+p[0]+len(condate)])
            curindex += p[0]
    elif r.getType() == 'timematch':
        p = tm.locate_time(buf[curindex:], r.getExpectedConfidence())
        if p[0] != -1:
            contime = ""
            for v in p[1]:
                contime += v[0]+" "
            print "Time offset: ", curindex, " and ", p[0], " = ", curindex+p[0]+len(contime)
            ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+p[0]+len(contime)])
            curindex += p[0]
    return (curindex, ri)
def LevDistCorrect(token, suggestions):
    # Pick the suggestion with the smallest Levenshtein distance to the token.
    best_dist = StringMatcher.distance(token, suggestions[0])
    corrected_token = suggestions[0]
    for word in suggestions[1:]:
        dist = StringMatcher.distance(token, word)
        if dist < best_dist:
            best_dist = dist
            corrected_token = word
    return corrected_token
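# Usage sketch (not part of the original source), assuming StringMatcher is the
# python-Levenshtein StringMatcher module, which re-exports Levenshtein.distance:
import StringMatcher

suggestions = ["programme", "program", "pogrom"]
# "program" is a single insertion away from the misspelled token, so it wins.
print(LevDistCorrect("progrm", suggestions))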
import difflib


def getDifflibOrPyLev(seq2=None, junk=None, forceDifflib=False):
    '''
    Returns either a difflib.SequenceMatcher or a pyLevenshtein
    StringMatcher.StringMatcher object depending on what is installed.

    If forceDifflib is True, use difflib even if pyLevenshtein is installed.
    '''
    if forceDifflib is True:
        smObject = difflib.SequenceMatcher(junk, '', seq2)
    else:
        try:
            import StringMatcher as pyLevenshtein
            smObject = pyLevenshtein.StringMatcher(junk, '', seq2)
        except ImportError:
            smObject = difflib.SequenceMatcher(junk, '', seq2)
    return smObject
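# Usage sketch (not part of the original source): both possible return types
# expose the same SequenceMatcher-style API, so callers can use them uniformly.
sm = getDifflibOrPyLev('end-game')
sm.set_seq1('endgame')
print(sm.ratio())  # similarity in [0.0, 1.0]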
nodeLists = [dom.xpath('//exercises//problem-set/entry | //exercises//multi-part/entry') for dom in doms]
assert len(nodeLists[0]) == len(nodeLists[1])
for tagName in ['solution', 'correct']:
    for nodeIndex in range(len(nodeLists[0])):
        entries = [nodeList[nodeIndex] for nodeList in nodeLists]
        solutions = [entry.find(tagName) for entry in entries]
        solutionStrings = []
        for solution in solutions:
            if solution is None:
                solutionStrings.append('')
            else:
                solutionStrings.append(strip_namespaces(etree.tostring(solution, with_tail=False)))
        if solutionStrings[0] != solutionStrings[1]:
            blocks = StringMatcher.matching_blocks(
                StringMatcher.editops(solutionStrings[0], solutionStrings[1]),
                solutionStrings[0], solutionStrings[1])
            # Use float division so the 10% similarity threshold works as intended under Python 2.
            if sum([block[2] for block in blocks]) / float(max(len(solutionStrings[0]), len(solutionStrings[1]))) < 0.1:
                blocks = []
            for i, col in [(0, 'old'), (1, 'new')]:
                pos = 0
                output = ''
                for block in blocks:
                    if block[i] > pos:
                        output += termColors[col] + solutionStrings[i][pos:block[i]] + termColors['reset']
                    output += solutionStrings[i][block[i]:block[i]+block[2]]
                    pos = block[i] + block[2]
                if pos < len(solutionStrings[i]):
                    output += termColors[col] + solutionStrings[i][pos:] + termColors['reset']
                print '===', col.upper(), '===================================================='
                print output
                print '============================================================'
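# API sketch (not from the original source), assuming StringMatcher is
# python-Levenshtein's StringMatcher module, which re-exports Levenshtein.editops
# and Levenshtein.matching_blocks. Each block is an (i, j, n) triple: n identical
# characters starting at offset i in the old string and offset j in the new one,
# which is what the highlighting loop above walks through.
import StringMatcher

old, new = '<solution>x+1</solution>', '<solution>x+2</solution>'
for i, j, n in StringMatcher.matching_blocks(StringMatcher.editops(old, new), old, new):
    print(old[i:i + n])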
import sys

from RuleSet import *
from Rule import *
from RuleInstance import *
from SpellChecker import *
from DateParser import *
from Record import *
from XLSProcessor import *
from HOCRParser import *
from kitchen.text.converters import getwriter
from time import strftime

UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

xls = XLSProcessor('out.xls')
xls.save()

s = StringMatcher()

# Could be something like this:
#
#               programtype
#        >0.9 /             \ <0.9
#           date            fail
#     >0.7 /    \ <0.7
#        time   fail
#       / | | \
#          fail
#print("test ",i) #print(substr) naiveMatch(stri, substr) end3 = time.time() print("###################################") print("Time elapsed ", end3 - start3) print("###################################") start1 = time.time() for i in range(1, int(len(string) / 100000)): j = 100000 * (i - 1) stri = string[j:j + 100000] k = 10 * (i - 1) substr = substring[k:k + 10] #print("test ",i) #print(substr) kmp = RK.KarpRabin(substr, stri) end1 = time.time() print("###################################") print("Time elapsed ", end1 - start1) print("###################################") start2 = time.time() for i in range(1, int(len(string) / 100000)): j = 100000 * (i - 1) stri = string[j:j + 100000] k = 10 * (i - 1) substr = substring[k:k + 10] #print("test ",i) #print(substr) kmp = KMP.KMP().search(stri, substr) end2 = time.time() print("###################################")
def __init__(self, lang, filePath, hotFilePath, muti_name_file, att_file,
             batch_size, cache_size, maxSampCount, shuffle,
             word_vocab, kb_vocab, kbp_type_vocab, kb_type_vocab):
    self.lang = lang
    assert (self.lang == 'ENG' or self.lang == 'CMN' or self.lang == 'SPA')
    self.doc_avg_dis = {}
    self.doc_max_dis = {}
    # para1
    self.doc_avg_dis['ENG'] = 0.0445
    self.doc_avg_dis['CMN'] = 0
    self.doc_avg_dis['SPA'] = 0.1397
    self.doc_max_dis['ENG'] = 0.2
    self.doc_max_dis['CMN'] = 0.8
    self.doc_max_dis['SPA'] = 0.3
    # para2
    #self.doc_avg_dis['ENG'] = 0.0
    #self.doc_avg_dis['CMN'] = 0.0
    #self.doc_avg_dis['SPA'] = 0.1397
    #self.doc_max_dis['ENG'] = 1.0
    #self.doc_max_dis['CMN'] = 1.0
    #self.doc_max_dis['SPA'] = 0.3
    self.filePath = filePath
    self.hotFilePath = hotFilePath
    self.mutiNamePath = muti_name_file
    self.att_file = att_file
    self.shuffle = shuffle
    self.stopWord = {}
    self.wikiContext = {}
    self.docContext = {}
    self.wikiContextIDs = {}
    self.docContextIDs = {}
    self.wordsINDoc = {}
    self.wordsINWiki = {}
    self.hotValue = {}
    self.samples = []
    self.candAttWord = {}
    self.mutiName = {}
    self.word_vocab = word_vocab
    self.kb_vocab = kb_vocab
    self.kbp_type_vocab = kbp_type_vocab
    self.kb_type_vocab = kb_type_vocab
    self.batch_size = batch_size
    self.cache_size = cache_size
    self.FileEnd = 0
    self.file = file
    self.samples_count = 0
    self.stringMatcher = StringMatcher.StringMatcher()
    self.docMatcher = DocMatcher.DocMatcher()
    self.maxSampCount = maxSampCount
    self.group = None
    if (self.lang == 'ENG'):
        self.docMatcher._loadIDF('../file/idf.txt', self.word_vocab)
        self.__loadStopWord('../file/StopWord.txt')
        self.__readDocWikiContext(self.filePath)
        self.__loadHotFile(self.hotFilePath)
        self.__loadMutiName(self.mutiNamePath)
    elif (self.lang == 'CMN'):
        self.docMatcher._loadIDF('../data/IDF.txt', self.word_vocab)
        self.__readDocWikiContext(self.filePath)
        self.__loadHotFile(self.hotFilePath)
        self.__loadMutiName(self.mutiNamePath)
    elif (self.lang == 'SPA'):
        self.docMatcher._loadIDF('../file/idf.txt', self.word_vocab)
        self.__readDocWikiContext(self.filePath)
        self.__loadHotFile(self.hotFilePath)
        self.__loadMutiName(self.mutiNamePath)
    self.reset()
def statistic_similarity(self, paper, min_similarity):
    """Splits the paper text into n-grams (unigrams, bigrams, trigrams) and
    checks each of them against the topics in the ontology (self.cso) using
    Levenshtein similarity.

    Args:
        paper (string): The paper to analyse. At this stage it is a string.
        min_similarity (float): minimum Levenshtein similarity between the
            n-gram and the topics within the CSO.

    Returns:
        found_topics (dictionary): containing the found topics with their
            similarity and the n-gram analysed.
    """
    # analysing grams
    found_topics = {}

    idx = 0
    trigrams = ngrams(word_tokenize(paper), 3)
    matched_trigrams = []
    for grams in trigrams:
        idx += 1
        gram = " ".join(grams)
        topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        for topic in topics:
            m = ls.StringMatcher(None, topic, gram).ratio()
            if m >= min_similarity:
                topic = self.get_primary_label(topic, self.cso['primary_labels'])
                if topic in found_topics:
                    found_topics[topic].append({'matched': gram, 'similarity': m})
                else:
                    found_topics[topic] = [{'matched': gram, 'similarity': m}]
                matched_trigrams.append(idx)

    idx = 0
    bigrams = ngrams(word_tokenize(paper), 2)
    matched_bigrams = []
    for grams in bigrams:
        idx += 1
        # Skip bigrams that overlap an already matched trigram.
        if (idx not in matched_trigrams) and ((idx - 1) not in matched_trigrams):
            gram = " ".join(grams)
            topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic, self.cso['primary_labels'])
                    if topic in found_topics:
                        found_topics[topic].append({'matched': gram, 'similarity': m})
                    else:
                        found_topics[topic] = [{'matched': gram, 'similarity': m}]
                    matched_bigrams.append(idx)

    idx = 0
    unigrams = ngrams(word_tokenize(paper), 1)
    for grams in unigrams:
        idx += 1
        # Skip unigrams that overlap an already matched trigram or bigram.
        if (idx not in matched_trigrams) and ((idx - 1) not in matched_trigrams) and \
                (idx not in matched_bigrams) and ((idx - 1) not in matched_bigrams):
            gram = " ".join(grams)
            topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic, self.cso['primary_labels'])
                    if topic in found_topics:
                        found_topics[topic].append({'matched': gram, 'similarity': m})
                    else:
                        found_topics[topic] = [{'matched': gram, 'similarity': m}]

    return found_topics
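# Minimal sketch (not from the original source) of the similarity test used above.
# Assumption: `ls` is python-Levenshtein's StringMatcher module (imported e.g. as
# `import Levenshtein.StringMatcher as ls`); its StringMatcher class mirrors
# difflib.SequenceMatcher, so ratio() returns a similarity in [0.0, 1.0].
import Levenshtein.StringMatcher as ls

m = ls.StringMatcher(None, 'machine learning', 'machine learnin').ratio()
print(m)  # accepted as a topic match only if m >= min_similarity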
def get_seq(i):
    global lines
    data = ""
    header = ""
    for line in lines:
        if line.startswith(">"):
            if i == 0:
                return (header, data)
            i = i - 1
            header = line[:-1]
        else:
            data = line[10:-11]  # remove MIDs and \n
    # No further header follows the requested sequence: return the last one read.
    return (header, data)


seq_count = get_seq_count()
sm = StringMatcher()


def index(a, x):
    # Binary search: return True if x is present in the sorted list a.
    i = bisect.bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return True
    return False


os.system("mkdir " + sys.argv[1] + "_result")

for i in range(seq_count):
    if index(matched, i + 1):
        continue
    header, data = get_seq(i + 1)