def search_sentences_c(self, max_words=None):
    if max_words:
        self._MAX_WORDS = int(max_words)
    else:
        self._MAX_WORDS = None
    start_node, end_node = self.get_start_and_end_node()
    import gsflc
    self.links.sort(cmp=Link.cmp_id)
    links_c = []
    for link in self.links:
        links_c.append([self.nodes.index(link.s), self.nodes.index(link.e)])
    sentences = gsflc.search(self.nodes.index(start_node),
                             self.nodes.index(end_node),
                             len(self.nodes),
                             links_c,
                             self._MAX_WORDS)
    self.sentences = []
    for sentence in sentences:
        new_sentence = Sentence()
        for link_index in sentence:
            new_sentence.add(self.links[link_index])
        self.sentences.insert(0, new_sentence)
    self.sentences.sort(cmp=Sentence.cmp_score, reverse=True)
    self.sentences_ready = self.sentences
    return self.sentences
def on_message(self, message):
    if 'request' in eval(message)['type']:
        SocketHandler.send_to_all({
            'type': 'request',
            'message': eval(message)
        })
        return
    SocketHandler.send_to_other(self, {
        'type': 'user',
        'id': id(self),
        'message': eval(message),
    })
    SocketHandler.send_to_self(self, {
        'type': 'self',
        'id': id(self),
        'message': eval(message),
    })
    sentence = Sentence(message)
    termlist = sentence.getTerms()
    if len(termlist) != 0:
        newmessage = {
            "termlist": termlist,
            "parent": eval(message)['parent']
        }
        SocketHandler.send_to_all({
            'type': 'term',
            'id': id(self),
            'message': newmessage,
        })
def fromJSON(self, file):
    with open(file, 'r') as data:
        _json = json.load(data)
    self.sentences = list()
    for sentence in _json:
        _sentence = Sentence(None, None)
        _sentence = _sentence.fromJSON(_json[sentence])
        self.sentences.append(_sentence)
    return self
def generateTestCasesAllErr(self):
    """Returns a list of sentences with all errors kept."""
    testCases = []  # list of Sentences
    for sentence in self.corpus:
        cleanSentence = sentence.cleanSentence()
        testSentence = Sentence(cleanSentence)
        for i in range(0, len(sentence)):
            datum_i = sentence.get(i)
            if datum_i.hasError():
                testSentence.put(i, datum_i)
        testCases.append(testSentence)
    return testCases
def generateTestCases(self):
    """Returns a list of sentences with exactly one eligible spelling error each."""
    testCases = []  # list of Sentences
    for sentence in self.corpus:
        cleanSentence = sentence.cleanSentence()
        for i in range(0, len(sentence)):
            datum_i = sentence.get(i)
            if datum_i.hasError() and datum_i.isValidTest():
                testSentence = Sentence(cleanSentence)
                testSentence.put(i, datum_i)
                testCases.append(testSentence)
    return testCases
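# A minimal, hedged usage sketch of the two generators above. The corpus class
# name (HolbrookCorpus) and the data path are assumptions made for illustration;
# only generateTestCases() and generateTestCasesAllErr() come from the methods
# shown. generateTestCases() yields one Sentence copy per eligible error (each
# copy carrying exactly one error), while generateTestCasesAllErr() keeps every
# error in a single copy of the sentence.
def demo_test_case_generation():
    corpus = HolbrookCorpus('holbrook-tagged-dev.dat')  # hypothetical class and path
    one_error_cases = corpus.generateTestCases()
    all_error_cases = corpus.generateTestCasesAllErr()
    print(len(one_error_cases), len(all_error_cases))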
def __init__(self, filepath):
    self.filepath = filepath
    # all sents in corpus
    self.sents = []
    sentence_number = 0
    sentence = Sentence(sentence_number)
    self.instances = {}
    with open(filepath) as f:
        for num, line in enumerate(f):
            # check if we have a valid token line
            if line[0].isalpha():
                token_line = line.split()
                # create new Token object
                token = Token(token_line[3], token_line[4], token_line[5],
                              token_line[-1].strip(")").strip("("), num)
                sentence.add_token(token)
            # check if line is empty - we reached the end of the current sentence
            elif not line.strip():
                self.sents.append(sentence)
                sentence_number += 1
                sentence = Sentence(sentence_number)
def __init__(self, goldPath, predictedPath=None):
    self.goldPath = goldPath
    self.predictedPath = predictedPath
    self.sents = []  # all sents in corpus
    self.sent_stats = {}
    self.numTokens = 0  # count total tokens in corpus
    self.tags = set()
    self.tokens = []
    sent = Sentence()
    if predictedPath:
        with open(goldPath) as gf, open(predictedPath) as pf:
            for gline, pline in izip(gf, pf):  # read the two files simultaneously
                if gline.strip() and pline.strip():  # check if lines are not empty
                    gtoken_tag = re.split(r'\t', gline)
                    ptoken_tag = re.split(r'\t', pline)
                    if gtoken_tag[0] == ptoken_tag[0]:
                        # create new Token object
                        token = Token(gtoken_tag[0], gtoken_tag[1].strip(), ptoken_tag[1].strip())
                        sent.addToken(token)
                        self.numTokens += 1
                    else:
                        raise Exception("Files not in sync")
                else:
                    self.sents.append(sent)
                    sent = Sentence()
    else:
        # store all sentences from the corpus
        sentences = []
        # store a sentence that consists of tokens
        sentence = []
        with open(goldPath) as gf:
            for line in gf:
                # check if line is not empty
                if line.strip():
                    # split line into token and tag as list elements
                    token_tag = re.split(r'\t', line)
                    # add a Token object to the sentence
                    sentence.append(Token(token_tag[0].strip(), token_tag[1].strip()))
                    # count total number of tokens
                    self.numTokens += 1
                else:
                    # we have reached the end of a sentence (empty line)
                    sentences.append(sentence)
                    sentence = []
        prev = "prevnotekzist"
        following = "folnotekzist"
        for j, sentence in enumerate(sentences):
            for i, token in enumerate(sentence):
                # make sure we don't go beyond the sentence length
                if i + 1 < len(sentence):
                    following = sentence[i + 1]
                # if we reached the end of the current sentence, take the first word of the next sentence
                elif j + 1 < len(sentences):
                    following = sentences[j + 1][0]
                token.setPrev(prev)
                token.setFollowing(following)
                token.getNeighborFeatures()
                # print (vars(token))
                prev = token
                sent.addToken(token)
            self.sents.append(sent)
            sent = Sentence()
def parse_mtree(self):
    if self.lang != 'en':
        raise Exception("MetricalTree parsing only works currently for English text.")
    import metricaltree as mtree
    mtree.set_paths(self.dir_mtree)
    wordtoks = self.wordtokens()
    toks = [wtok.token for wtok in wordtoks]
    pauses = mtree.pause_splitter_tokens(toks)
    # sents = [sent for pause in pauses for sent in pause]
    sents = []
    for pause in pauses:
        sents.extend(mtree.split_sentences_from_tokens(pause))
    parser = mtree.return_parser(self.dir_mtree)
    trees = list(parser.lex_parse_sents(sents, verbose=False))
    stats = parser.get_stats(trees, arto=True, format_pandas=False)
    assert len(stats) == len(wordtoks)

    sents = []
    sent = []
    sent_id = None
    for wTok, wStat in zip(wordtoks, stats):
        if sent_id != wStat['sidx']:
            sent_id = wStat['sidx']
            if sent:
                sents += [sent]
                sent = []
        sent += [wTok]
        # for k, v in wStat.items():
        #     setattr(wTok, k, v)
        if not hasattr(wTok, 'feats'):
            wTok.feats = {}
        for k, v in list(wStat.items()):
            if k in mtree.INFO_DO_NOT_STORE:
                continue
            wTok.feats[k] = v
    if sent:
        sents += [sent]
    assert len(sents) == len(trees)

    from Sentence import Sentence
    for sent, tree in zip(sents, trees):
        sentobj = Sentence(sent, tree)
        self._sentences += [sentobj]

    # create a normalized stress per line
    import numpy as np
    for line in self.lines():
        wtoks = line.children
        # norm mean
        stresses = [wtok.feats['norm_mean'] for wtok in wtoks if not np.isnan(wtok.feats['norm_mean'])]
        max_stress = float(max(stresses))
        min_stress = float(min(stresses))
        for wtok in wtoks:
            wtok.feats['norm_mean_line'] = (wtok.feats['norm_mean'] - min_stress) / (max_stress - min_stress) if max_stress else np.nan
        # mean
        stresses = [wtok.feats['mean'] for wtok in wtoks if not np.isnan(wtok.feats['mean'])]
        min_stress = float(min(stresses))
        diff = 1.0 - min_stress
        for wtok in wtoks:
            wtok.feats['mean_line'] = wtok.feats['mean'] + diff
def convert_to_obj(self):
    for sec, block in self.document.items():
        for key in block.keys():
            block[key] = Sentence(block[key])
            if (t[1] in want) and (t[0] not in stopword) and (len(t[0]) > 1):
                token.append(t[0])
        pos.extend(token)
        return pos

    def save(self, coll, oid):
        coll.update({"_id": oid},
                    {"$set": {"keyword": self.get_result()}},
                    upsert=True)


if __name__ == "__main__":
    from Sentence import Sentence
    import csv

    talk = []
    f = open('script3.csv', 'r', encoding='utf-8')
    rdr = csv.reader(f)
    for line in rdr:
        talk.append(list(line))
    f.close()

    sentences = [Sentence(i, t) for i, t in enumerate(talk)]
    keyword = Tf_Idsf(sentences)
    print(keyword.get_result())
def processLine(self, line):
    '''
    Reads a line containing misspelled words, like the following:
        <ERR targ=That's> Thats </ERR> what <ERR targ=James> Jame </ERR>
    Returns a Sentence holding a list of Datum(correct word, incorrect word).
    '''
    processed_tokens = Sentence()
    processed_tokens.append(Datum("<s>"))  # start symbol
    tokens = line.split()
    i = 0
    while i < len(tokens):
        token = tokens[i]
        # find the misspelled-word markup
        if token == "<err":
            targ = tokens[i + 1]
            targ_splits = targ.split("=")
            correct_token = targ_splits[1][:-1]  # chop off the trailing '>'
            correct_token_splits = correct_token.split()
            if len(correct_token_splits) > 2:
                # targ with multiple words
                print 'targ with multiple words: "%s"' % targ
                for correct_word in correct_token_splits:
                    processed_tokens.append(Datum(correct_word))
            elif tokens[i + 3] != '</err>':
                # a word is missing in between, so there is no incorrect_token in this case
                processed_tokens.append(Datum(correct_token))
            else:
                incorrect_token = tokens[i + 2]
                processed_tokens.append(Datum(correct_token, incorrect_token))
            # move index to one past the </ERR> symbol
            i += tokens[i:].index('</err>') + 1
        else:
            # no misspelling, regular word
            processed_tokens.append(Datum(token))
            i += 1
    processed_tokens.append(Datum("</s>"))  # stop symbol
    return processed_tokens
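# A small, hedged illustration of the <ERR> markup handled by processLine()
# above; the input line and the 'reader' instance name are invented for
# illustration. On such a line the method should emit Datum objects roughly
# equivalent to:
#   <s>, ("That's", "Thats"), what, ("James", "Jame"), said, </s>
# where a two-argument Datum pairs the correct word with the observed
# misspelling, and single-argument Datums are words that were already correct.
#
#   example_line = "<ERR targ=That's> Thats </ERR> what <ERR targ=James> Jame </ERR> said"
#   sentence = reader.processLine(example_line)   # 'reader' is a hypothetical instance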
    testSet = SentenceReader(options.testPath, True)
    tagger.decodeParalle(testSet,
                         outpath=options.outputPath,
                         numThreads=options.numThreads,
                         numPerTheads=options.numPerTheads)
    testSet.close()
elif options.mode == "display":
    print("Loading model ... ")
    with open(options.modelPath, 'r') as file_in:
        tagger = cPickle.load(file_in)
    print("Done")
    while True:
        raw_sent = raw_input("Enter the string to be segmented: ")
        if len(raw_sent) == 0:
            break
        sent = Sentence(raw_sent.decode('utf-8'))
        states = tagger.decodeBeamSearch(sent, "test")
        print states[1].getFinalResult().encode('utf-8')
elif options.mode == "eval":
    print("Loading model ... ")
    with open(options.modelPath, 'r') as file_in:
        tagger = cPickle.load(file_in)
    print("Done")
    tagger.quiet = False
    testSet = SentenceReader(options.testPath)
    print("Evaluating ...")
    accuracy = tagger.evaluate(testSet,
                               numThreads=options.numThreads,
                               numPerTheads=options.numPerTheads)
def mapMaker(parsableParas):
    G = nx.Graph()
    nodeLabels = {}
    edgeLabels = {}
    for parsablePara in parsableParas:
        parsableWords = parsablePara
        sentences = getSentences(parsableWords)
        # for each list of words in the sentence that forms a POS
        for sentenceTokens in sentences:
            # make a Sentence object out of those words
            sentence = Sentence(sentenceTokens)
            subject = sentence.structure()
            subject.title = subject.title.rstrip(".\n")
            # find the node in the graph whose title is the subject of the current sentence
            try:
                s = findNode(G, subject.title)
            # if no such node exists, add it to the graph and add an entry for it in the graph labels
            except NoSuchNodeException:
                G.add_node(subject.title)
                nodeLabels[subject.title] = [subject.title]
                s = findNode(G, subject.title)
            # add the appropriate labels for this node
            finally:
                # print subject  ## DEBUG ##
                if subject.getLabel() not in nodeLabels[subject.title]:
                    nodeLabels[subject.title].append(subject.getLabel())
            for verb in subject.paths.keys():
                dest = subject.paths[verb]
                dest.title = dest.title.rstrip('.\n')
                # find the node in the graph whose title is the object of the current sentence
                try:
                    d = findNode(G, dest.title)
                # if no such node exists, add it to the graph and add an entry for it in the graph labels
                except NoSuchNodeException:
                    G.add_node(dest.title)
                    nodeLabels[dest.title] = [dest.title]
                # add the appropriate labels for this node
                finally:
                    # print dest  ## DEBUG ##
                    d = findNode(G, dest.title)
                    if dest.getLabel() not in nodeLabels[dest.title]:
                        nodeLabels[dest.title].append(dest.getLabel())
                # add the edge label in the graph for this verb
                edgeLabels[verb.getIdentifier()] = verb.getLabel()
                # add an edge between the subject and the object
                G.add_edge(s, d)
    # nltk requires that the node labels be strings and not lists.
    # this loop turns each nodeLabels list of descriptors into a
    # newline-separated string of descriptors;
    # the next loop does the same for edge labels
    for node in nodeLabels.keys():
        nodeLabels[node] = ''.join(['\n' + label for label in nodeLabels[node]])
        nodeLabels[node] = nodeLabels[node].strip("\n")
    for e in edgeLabels.keys():
        edgeLabels[e] = edgeLabels[e].rstrip('\n')
    # draw the graph to the following specs:
    # - use the generated node labels: nouns and attributes
    # - node size is 1000 * (number of '\n' + 1); the +1 accommodates single-line labels
    # - node_shape='s' makes each node a square
    # - spectral layout displayed the graph best among the layouts tried
    nx.draw(G,
            labels=nodeLabels,
            node_size=[(nodeLabels[node].count('\n') + 1) * 1000 for node in G.nodes()],
            node_shape='s',
            pos=nx.spectral_layout(G))
    pos = nx.layout.spectral_layout(G)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edgeLabels)
    # nx.write_dot(G, '/home/ashwin/Desktop/sample')
    plt.show()
def main():
    # clean up the log on startup
    Log.clean_log(startup=True)

    def signal_handler(signal, frame):
        os._exit(0)

    if config["Other"]["INFO_MESSAGE"] != "False":
        Log.info("BiliBiliHelper Python " + version)
        Log.info("Powered By TheWanderingCoel with love❤️")
    if config["Other"]["SENTENCE"] != "False":
        Log.info(Sentence().get_sentence())

    # check the config
    ConfigCheck()

    # register the signal handler
    signal.signal(signal.SIGINT, signal_handler)

    loop = asyncio.get_event_loop()
    timer = Timer(loop)
    console = Console.Console(loop)
    area_ids = [1, 2, 3, 4, 5, 6]
    Statistics(len(area_ids))

    daily_tasks = [
        Capsule.work(), CaseJudger.work(), Coin2Silver.work(), DailyBag.work(),
        GiftSend.work(), Group.work(), Heart.work(), Silver2Coin.work(),
        SilverBox.work(), Task.work()
    ]
    server_tasks = [MonitorServer.run_forever()]
    danmu_tasks = [Danmu_Monitor.run_Danmu_Raffle_Handler(i) for i in area_ids]
    other_tasks = [rafflehandler.run()]

    api_thread = threading.Thread(target=API.work)
    api_thread.start()
    if not options.disable_console:
        console_thread = threading.Thread(target=console.cmdloop)
        console_thread.start()

    # log in once first, so the raffle module doesn't fail by running too early
    Auth.work()

    if config["Function"]["RAFFLE_HANDLER"] != "False":
        loop.run_until_complete(
            asyncio.wait(daily_tasks + server_tasks + danmu_tasks + other_tasks))
    else:
        loop.run_until_complete(asyncio.wait(daily_tasks))

    api_thread.join()
    if not options.disable_console:
        console_thread.join()
    loop.close()
def answer_where(question, inst):
    best_sen = inst.ranked_list(question)[0]
    info = Sentence(best_sen, 0)
    answer = find_tag_answer(question, info, ['LOCATION', 'ORGANIZATION'])
    return best_sen
def answer_what(question, inst):
    pattern = Sentence(question.wh_pattern(), 0)
    best_sen = inst.ranked_list(pattern)[0]
    info = Sentence(best_sen, 0)
    return best_sen
sentenceSet = " Sample"
sentenceFilename = "sentences/ Sample"


def resetSentenceFactory():
    global factory
    factory = SentenceFactory(sentenceFilename)
    # check that the data file is formatted correctly
    factory.validate()


testSentence = None
endWord = Word("Congratulations!!")
endSentence = Sentence([endWord], "!")
lostWord = Word("You are out of lives!!")
lostSentence = Sentence([lostWord], "!")


def createFish(word):
    global fishes
    global sprites
    newFish = Fish(word, fishSpeed)
    newFish.moveTo(random.randrange(-500, 0), random.randrange(200, 420))
    sprites.append(newFish)
    fishes.append(newFish)
    changeSpeed()
from DialogueManagement import DialogueManagement
from Sentence import Sentence

DM1 = DialogueManagement()
sentence = input()
sentence = Sentence(sentence)
DM1.diliverSlots(sentence)
DM1.diliverIntent(sentence)
DM1.chooseDM(sentence.intent)
# print("singer:", DM1.singer)
# print("song:", DM1.song)
# print("intent:", DM1.intent)
DM1.DM()
def getCentroid(self, a):
    selectedsentnum = 0
    selectedtranslation = ""
    total_num_outputs = len(a)
    sentences = [Sentence() for __idx0 in range(total_num_outputs)]
    self.reputation = [float() for __idx0 in range(total_num_outputs)]
    ## for-while
    i = 0
    while i < total_num_outputs:
        sentences[i] = Sentence()
        self.reputation[i] = 0.0
        i += 1
    ## for-while
    i = 0
    while i < total_num_outputs:
        sentences[i].number = i
        sentences[i].text = a[i]
        sentences[i].tokencount = SentenceTokenizer.tokencount(sentences[i].text)
        sentences[i].tokens = SentenceTokenizer.sentencetokenizer(sentences[i].text)
        sentences[i].setDistance([float() for __idx0 in range(total_num_outputs)])
        sentences[i].setSimilarity([float() for __idx0 in range(total_num_outputs)])
        i += 1
    if len(sentences) <= 1:  # if only 1 or 2 candidate translations are available
        # pick a random index (replaces the Java-style nextInt call, which does not exist in Python)
        randomchoice = int(random() * len(sentences))
        selectedtranslation = sentences[randomchoice].text
    else:  # if 3 or more candidate translations are available
        total_combinations = (total_num_outputs * (total_num_outputs - 1)) / 2
        # print ("total combo", total_combinations)
        sentencemapcount = 0
        filtercriteria = 0.4
        sentencemaps = [SentenceMap(0, 0, 0, 0, 0, 0) for __idx0 in range(total_combinations)]
        ## two loops to list the nC2 combinations
        i = 0
        while i < (len(sentences) - 1):
            ## for-while
            j = i + 1
            while j <= (len(sentences) - 1):
                sentencemaps[sentencemapcount] = SentenceMap(
                    i, j,
                    sentences[i].tokencount, sentences[j].tokencount,
                    len(sentences[i].text) - sentences[i].tokencount + 1,
                    len(sentences[j].text) - sentences[i].tokencount + 1)
                a = []
                for x in range(sentences[i].tokencount):
                    a.append([])
                    for y in range(sentences[j].tokencount):
                        a[x].append(float())
                self.mapscorematrix = a
                a = []
                for x in range(sentences[i].tokencount):
                    a.append([])
                    for y in range(sentences[j].tokencount):
                        a[x].append(float())
                self.mapmatrix = a
                a = []
                for x in range(sentences[i].tokencount):
                    a.append([])
                    for y in range(sentences[j].tokencount):
                        a[x].append(float())
                self.mapweightmatrix = a
                ## initializing the matrices
                slength1 = 0
                while slength1 < sentences[i].tokencount:
                    slength2 = 0
                    while slength2 < sentences[j].tokencount:
                        self.mapscorematrix[slength1][slength2] = 0
                        self.mapmatrix[slength1][slength2] = 0
                        self.mapweightmatrix[slength1][slength2] = 0
                        slength2 += 1
                    slength1 += 1
                """
                print "matrices"
                print self.mapscorematrix
                print self.mapmatrix
                print self.mapweightmatrix
                """
                ## set scores for the bipartite graph
                slength1 = 0
                while slength1 < sentences[i].tokencount:
                    slength2 = 0
                    while slength2 < sentences[j].tokencount:
                        # print "args"
                        # print (sentences[i].tokens[slength1], sentences[j].tokens[slength2])
                        self.mapscorematrix[slength1][slength2] = WordComparer.comparescores(
                            sentences[i].tokens[slength1], sentences[j].tokens[slength2])
                        # print "self.mapscorematrix[slength1][slength2]" + str(WordComparer.comparescores(sentences[i].tokens[slength1], sentences[j].tokens[slength2]))
                        slength2 += 1
                    slength1 += 1
                # print ("mapscorematrix", self.mapscorematrix)
                if sentences[i].tokencount <= sentences[j].tokencount:
                    debaredlist = []
                    ## for-while
                    slength1 = 0
                    while slength1 < sentences[i].tokencount:
                        index2 = -1
                        similarity = filtercriteria
                        ## for-while
                        slength2 = 0
                        while slength2 < sentences[j].tokencount:
                            if ((similarity < self.mapscorematrix[slength1][slength2])
                                    and not self.iselementpresent(debaredlist, slength2)):
                                similarity = self.mapscorematrix[slength1][slength2]
                                index2 = slength2
                            slength2 += 1
                        if index2 != -1:
                            if len(sentences[i].tokens[slength1]) > len(sentences[j].tokens[index2]):
                                self.mapweightmatrix[slength1][index2] = len(sentences[j].tokens[index2])
                            else:
                                self.mapweightmatrix[slength1][index2] = len(sentences[i].tokens[slength1])
                            self.mapmatrix[slength1][index2] = similarity
                            debaredlist.append(index2)
                        slength1 += 1
                else:
                    debaredlist = []
                    ## for-while
                    slength2 = 0
                    while slength2 < sentences[j].tokencount:
                        index1 = -1
                        similarity = filtercriteria
                        ## for-while
                        slength1 = 0
                        while slength1 < sentences[i].tokencount:
                            if ((similarity < self.mapscorematrix[slength1][slength2])
                                    and not self.iselementpresent(debaredlist, slength1)):
                                similarity = self.mapscorematrix[slength1][slength2]
                                index1 = slength1
                            slength1 += 1
                        if index1 != -1:
                            if len(sentences[i].tokens[index1]) > len(sentences[j].tokens[slength2]):
                                self.mapweightmatrix[index1][slength2] = len(sentences[j].tokens[slength2])
                            else:
                                self.mapweightmatrix[index1][slength2] = len(sentences[i].tokens[index1])
                            self.mapmatrix[index1][slength2] = similarity
                            # print "self.mapmatrix[index1][slength2] : " + str(self.mapmatrix[index1][slength2])  correct
                            debaredlist.append(index1)
                        slength2 += 1
                # print self.mapmatrix
                sentencemaps[sentencemapcount].setscorematrix(self.mapscorematrix)
                sentencemaps[sentencemapcount].setmapmatrix(self.mapmatrix)
                sentencemaps[sentencemapcount].setweightmatrix(self.mapweightmatrix)
                distance = sentencemaps[sentencemapcount].setsentencesimilarityscore()
                similarity = sentencemaps[sentencemapcount].sentencesimilarityscore
                # print (distance, similarity)
                sentences[i].setIthDistance(j, distance)
                sentences[j].setIthDistance(i, distance)
                sentences[i].setIthSimilarity(j, similarity)
                sentences[j].setIthSimilarity(i, similarity)
                sentencemapcount += 1
                j += 1
            i += 1
        ## for-while
        i = 0
        while i < total_num_outputs:
            totaldistance = 0
            totalsimilarity = 0
            ## for-while
            j = 0
            while j < total_num_outputs:
                totalsimilarity = totalsimilarity + sentences[i].similarity[j]
                totaldistance = totaldistance + sentences[i].distance[j]
                # print (totalsimilarity, totaldistance)
                j += 1
            sentences[i].averagesimilarity = totalsimilarity / (total_num_outputs - 1)
            self.reputation[i] = sentences[i].averagesimilarity
            # print "rep" + str(self.reputation[i])
            sentences[i].averagedistance = totaldistance / (total_num_outputs - 1)
            i += 1
        newdistance = 0
        newoptdistance = 1000000000
        ## for-while
        z = 0
        while z < len(sentences):
            ## for-while
            y = 0
            while y < len(sentences):
                # print sentences[z].distance[y]
                newdistance = newdistance + sentences[z].distance[y]
                y += 1
            if newdistance < newoptdistance:
                selectedsentnum = z
                selectedtranslation = sentences[z].text
                newoptdistance = newdistance
            newdistance = 0
            z += 1
    self.normalizer = 10 * (sentences[selectedsentnum].tokencount)
    # print "normalizer=" + str(self.normalizer)
    return selectedtranslation
from ModelCheck import ModelChecking
from PL_Resolution import PL_Resolution, resolutionTest

# the three people symbols, global variables
Amy = Arguments("Amy")
Bob = Arguments("Bob")
Cal = Arguments("Cal")
names = ["Amy", "Bob", "Cal"]

# literals list to use, global variables
literals = [Amy, Bob, Cal]

# model-checking class, passing in all basic symbols
check = ModelChecking(literals)

# the three atomic propositional sentences
Sentence_Amy = Sentence(unit, [Amy])
Sentence_Bob = Sentence(unit, [Bob])
Sentence_Cal = Sentence(unit, [Cal])


def modelCheckKBParta():
    """
    :return: The knowledge-base sentences for model checking of part a
    """
    sentence_2 = Sentence(conjunction, [Cal, Amy])
    KB1 = Sentence(equals, [Sentence_Amy, sentence_2], sentenceBase=True)
    sentence_3 = Sentence(unit, [Bob])
    sentence_4 = Sentence(negative, [Cal])
    KB2 = Sentence(equals, [Sentence_Bob, sentence_4], sentenceBase=True)
def processLine(self, line):
    line = line.strip()
    line = line.lower()
    line = line.replace('"', '')
    line = line.replace(',', '')
    line = line.replace('.', '')
    line = line.replace('!', '')
    line = line.replace("'", '')
    line = line.replace(":", '')
    line = line.replace(";", '')
    if line == '':
        return None
    processed_tokens = Sentence()
    processed_tokens.append(Datum("<s>"))
    tokens = line.split()
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if token == '<err':
            targ = tokens[i + 1]
            targ_splits = targ.split('=')
            correct_token = targ_splits[1][:-1]
            correct_token_splits = correct_token.split()
            if len(correct_token_splits) > 2:
                for correct_word in correct_token_splits:
                    processed_tokens.append(Datum(correct_word))
            elif tokens[i + 3] != '</err>':
                processed_tokens.append(Datum(correct_token))
            else:
                incorrect_token = tokens[i + 2]
                processed_tokens.append(Datum(correct_token, incorrect_token))
            i += tokens[i:].index('</err>') + 1
        else:
            # regular word
            processed_tokens.append(Datum(token))
            i += 1
    processed_tokens.append(Datum("</s>"))
    return processed_tokens
def modelchecking():
    # construct sentences
    # part a goal sentence
    sentence_1 = Sentence(unit, [mythical])
    sentence_2 = Sentence(unit, [immortal])
    KB1 = Sentence(implies, [sentence_1, sentence_2], sentenceBase=True)
    sentence_3 = Sentence(negative, [mythical])
    sentence_3_2 = Sentence(negative, [immortal])
    sentence_3_3 = Sentence(unit, [mammal])
    sentence_3_4 = Sentence(conjunction, [sentence_3_2, sentence_3_3], True)
    KB2 = Sentence(implies, [sentence_3, sentence_3_4], True)
    # part b goal sentence
    sentence_5 = Sentence(disjunction, [immortal, mammal])
    sentence_6 = Sentence(unit, [horned])
    KB3 = Sentence(implies, [sentence_5, sentence_6], sentenceBase=True)
    # part b goal sentence
    sentence_7 = Sentence(unit, [magical])
    KB4 = Sentence(implies, [sentence_6, sentence_7], sentenceBase=True)
    KB = [KB1, KB2, KB3, KB4]
    check = ModelChecking(literals)
    # # part a: to prove mythical, sentence_1
    # print(check.check(KB, sentence_1))
    #
    # # part b: to prove magical, sentence_7
    # print(check.check(KB, sentence_7))
    #
    # # part c: to prove horned, sentence_6
    # print(check.check(KB, sentence_6))
    check.check(KB)
print("""\033[32;1m ______ __ __ __ ______ __ __ __ __ __ ______ __ ______ ______ ______ /\ == \ /\ \ /\ \ /\ \ /\ == \ /\ \ /\ \ /\ \ /\ \_\ \ /\ ___\ /\ \ /\ == \ /\ ___\ /\ == \ \ \ __< \ \ \ \ \ \____ \ \ \ \ \ __< \ \ \ \ \ \____ \ \ \ \ \ __ \ \ \ __\ \ \ \____ \ \ _-/ \ \ __\ \ \ __< \ \_____\ \ \_\ \ \_____\ \ \_\ \ \_____\ \ \_\ \ \_____\ \ \_\ \ \_\ \_\ \ \_____\ \ \_____\ \ \_\ \ \_____\ \ \_\ \_\ \/_____/ \/_/ \/_____/ \/_/ \/_____/ \/_/ \/_____/ \/_/ \/_/\/_/ \/_____/ \/_____/ \/_/ \/_____/ \/_/ /_/ \033[0m""")

if config["Other"]["INFO_MESSAGE"] != "False":
    Log.info("BiliBiliHelper Python " + version)
    Log.info("Powered By TheWanderingCoel, kotoriのねこ and 洛水.山岭居室️")
if config["Other"]["SENTENCE"] != "False":
    Log.info(Sentence().get_sentence())

# check the config
ConfigCheck()

# register the signal handler
signal.signal(signal.SIGINT, signal_handler)

loop = asyncio.get_event_loop()
timer = Timer(loop)
console = Console.Console(loop)
area_ids = [1, 2, 3, 4, 5, 6]
Statistics(len(area_ids))
len(conf_train['fp'])

from random import sample
examples = sample(fp, 1)
example = examples[0]
print(example[0])
print(example[1].text)
print('# Truth:')
for e in example[1].entities:
    print(e)
print('# Pred:')
for e in example[2].entities:
    print(e, e.pos)

f.dictLookuper.word('Octanal')

from Sentence import Sentence
sample_sentence = Sentence('A new and efficient synthesis of a naturally occurring amide alkaloid, N-isobutyl-4,5-epoxy-2(E)-decenamide isolated from the roots of Piper nigrum has been described involving a total of nine steps. Octanal and 2-bromoacetic acid have been used as the starting materials.')
sample_sentence.pos(s)

f.bl_sifter = True
f.sifter.bl_list_pos_star = False
f.bl_refiner = True
f.filt([sample_sentence])
for e in sample_sentence.entities:
    print(e)

# def smash(sentences):
#     for sentence in sentences:
#         _smash(sentence, )
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results,
                           rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    distance = MAX_TOKENS_AWAY
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                if count % 50 == 0:
                    print multiprocessing.current_process(), "In Queue", queue.qsize(), "Total Matched: ", len(results)

                # TODO: build a cache
                t1 = query.Term('sentence', "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">")
                t3 = query.Term('sentence', "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">")

                # entity-proximity query without relational words
                q1 = spans.SpanNear2([t1, t3], slop=distance, ordered=True, mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # entity proximity considering relational words:
                # from the results above, count how many contain a relational word
                hits_with_r = 0
                total_hits = 0
                for s in hits:
                    sentence = s.get("sentence")
                    s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY,
                                 MIN_TOKENS_AWAY, CONTEXT_WINDOW, stopwords)
                    for s_r in s.relationships:
                        if r.e1.decode("utf8") == s_r.e1 and r.e2.decode("utf8") == s_r.e2:
                            total_hits += 1
                            unigrams_rel_words = s_r.between
                            bigrams_rel_words = extract_bigrams(' '.join(s_r.between))
                            if any(x in rel_words_unigrams for x in unigrams_rel_words):
                                hits_with_r += 1
                                continue
                            if any(x in rel_words_bigrams for x in bigrams_rel_words):
                                hits_with_r += 1
                                continue

                assert total_hits >= hits_with_r
                if total_hits > 0:
                    pmi = float(hits_with_r) / float(total_hits)
                    if pmi >= PMI:
                        results.append(r)
                count += 1
            except Queue.Empty:
                break
def __init__(self, test_file, lexicon):
    self.sentences = []
    lines = open(test_file, "r").readlines()
    for line in lines:
        line = line.strip()
        self.sentences.append(Sentence(line, lexicon))
def __init__(self, train_filename, test_filename, tag_num, l1, l2):
    '''
    Constructor
    '''
    self.word_count = {}
    self.word_tag_count = {}
    self.tag_uni_count = [0 for i in range(tag_num)]
    self.tag_bigram_count = [[0 for j in range(tag_num)] for i in range(tag_num)]
    self.tag_trigram_count = [[[0 for k in range(tag_num)] for j in range(tag_num)]
                              for i in range(tag_num)]
    self.sents = [Sentence(tag_num)]
    self.total_tag = 0
    self.__rare_word = {}

    lines = open(train_filename).readlines()
    for i in range(len(lines)):
        line = lines[i].decode('utf-8').strip('\n')
        if len(line) > 0:
            wt = line.split('-')
            self.sents[-1].add_word_tag(wt[1], wt[0])
        else:
            self.sents[-1].finish(self.word_count, self.word_tag_count,
                                  self.tag_uni_count, self.tag_bigram_count,
                                  self.tag_trigram_count)
            self.total_tag += len(self.sents[-1].word_tag) - 2
            self.sents.append(Sentence(tag_num))
    self.sents[-1].finish(self.word_count, self.word_tag_count,
                          self.tag_uni_count, self.tag_bigram_count,
                          self.tag_trigram_count)
    self.total_tag += len(self.sents[-1].word_tag) - 2

    # self.word_prob = {}
    for w in self.word_count.keys():
        if self.word_count[w] <= 1:
            self.__rare_word[w] = self.word_count[w]

    self.word_tag_prob = {}
    for w in self.word_tag_count.keys():
        if w in self.__rare_word:
            self.word_tag_prob['RARE'] = [0 for i in range(tag_num)]
            for i in range(tag_num):
                self.word_tag_prob['RARE'][i] = self.word_tag_count['RARE'][i] / self.tag_uni_count[i]
        else:
            self.word_tag_prob[w] = [0 for i in range(tag_num)]
            for i in range(tag_num):
                self.word_tag_prob[w][i] = self.word_tag_count[w][i] / self.tag_uni_count[i]

    # self.tag_uni_prob = [0 for i in range(tag_num)]
    # for i in range(tag_num):
    #     self.tag_uni_prob[i] = self.tag_uni_count[i] / self.total_word
    # self.tag_bigram_prob = [[0 for j in range(tag_num)] for i in range(tag_num)]
    # for i in range(tag_num):
    #     for j in range(tag_num):
    #         self.tag_bigram_prob[i][j] = (self.tag_bigram_count[i][j] / self.total_bigram) / \
    #             (self.tag_uni_prob[i])

    self.tag_trigram_prob = [[[0 for k in range(tag_num)] for j in range(tag_num)]
                             for i in range(tag_num)]
    for i in range(tag_num):
        for j in range(tag_num):
            for k in range(tag_num):
                # q(i|j,k), that is, the tags appear in the order j, k, i
                self.tag_trigram_prob[i][j][k] = l1 * (self.tag_uni_count[i] / self.total_tag) + \
                    l2 * (self.tag_bigram_count[k][i] / self.tag_uni_count[i]) + \
                    (1 - l2 - l1) * (self.tag_trigram_count[j][k][i] / self.tag_bigram_count[k][i])
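# For reference, the interpolated trigram estimate computed in the constructor
# above combines three relative-frequency terms (the notation mirrors the code,
# not an external source):
#   q(i | j, k) = l1 * count(i) / total_tag
#               + l2 * count(k, i) / count(i)
#               + (1 - l1 - l2) * count(j, k, i) / count(k, i)
# so l1 + l2 is expected to stay below 1 for the weights to form a convex combination.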
def processLine(self, line):
    line = line.strip()
    line = line.lower()
    line = line.replace('"', '')
    line = line.replace(',', '')
    line = line.replace('.', '')
    line = line.replace('!', '')
    line = line.replace("'", '')
    line = line.replace(":", '')
    line = line.replace(";", '')
    if line == '':
        return None
    processed_tokens = Sentence()
    processed_tokens.append(Datum("<s>"))  # start symbol
    tokens = line.split()
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if token == '<err':
            targ = tokens[i + 1]
            targ_splits = targ.split('=')
            correct_token = targ_splits[1][:-1]  # chop off the trailing '>'
            correct_token_splits = correct_token.split()
            if len(correct_token_splits) > 2:  # targ with multiple words
                # print 'targ with multiple words: "%s"' % targ
                for correct_word in correct_token_splits:
                    processed_tokens.append(Datum(correct_word))
            elif tokens[i + 3] != '</err>':
                processed_tokens.append(Datum(correct_token))
            else:
                incorrect_token = tokens[i + 2]
                processed_tokens.append(Datum(correct_token, incorrect_token))
            i += tokens[i:].index('</err>') + 1  # update index
        else:
            # regular word
            processed_tokens.append(Datum(token))
            i += 1
    processed_tokens.append(Datum("</s>"))
    return processed_tokens
from Sentence import Sentence

sentence = Sentence('爱的可能')
arcs = sentence.arcs
print(sentence.words)
print(sentence.postags)
print(sentence.personNames)
print('\t'.join('%s,%s' % (arc.head, arc.relation) for arc in arcs))
        return self.score

    def getFinalResult(self):
        tags = self.getActionSequence()
        wordSeq = list(self.sentence.getChars())
        for index, tag in enumerate(tags):
            if tag == 'S' or tag == 'E':
                wordSeq[index] += ' '
        return u''.join(wordSeq).strip()


if __name__ == "__main__":
    print 'initial..'
    line = u'我 是 傻逼 , 这 你 也 信 ?'
    from Sentence import Sentence
    s = Sentence(line)
    print(' '.join(s.characters))
    print(' '.join(s.tags))
    s0 = SegState(s)
    sss = SegState(s)
    print s0.sentence.tags == s.tags
    print 'fillPrimitiveUnits..'
    print s0.C_2, s0.C_1, s0.C, s0.C1, s0.C2
    print 'compareTo..'
    print s0.compareTo(sss) == 0
    print 'getUnlabeledFeatures..'
    print ' '.join(s0.getUnlabeledFeatures())
    print 'getPrevState...'
    print s0.getPrevState()
    print 'isGold..'
    print s0.IsGold()
from Sentence import Sentence

if __name__ == '__main__':
    sentence = Sentence("I'm a #Mac#. I am not #Ubuntu#. Lol")
    print(sentence.getTerms())
def sentence():
    data = {"sentence": Sentence().get_sentence()}
    return jsonify(data)
def load(self):
    self.maxPoints = 0
    f = BufferedReader(self.filename)
    # keep track of line numbers, which can be used to display
    # error messages about bad data files
    lineNumber = 0
    # scan through the entire file
    while not f.eof():
        nextLine = f.readLine()
        lineNumber += 1
        # check whether the line is a comment/whitespace or contains some
        # sentence data. Sentence data is always on its own line
        # and starts with the "Sentence:" identifier
        if nextLine.startswith("Sentence:"):
            self.maxPoints += SentenceFactory.POINTS_PER_SENTENCE
            sentenceComponents = []
            st = StringTokenizer(nextLine)
            # discard the "Sentence:" token
            st.nextToken()
            # indicate whether we are reading in words from a sentence
            # or words that belong to blanks in a sentence
            readingBlank = False
            readingBlankValid = False
            blankValidWords = []
            blankInvalidWords = []
            while st.hasNext():
                nextToken = st.nextToken()
                if readingBlank:
                    # "]" denotes that the end of the list of words for the
                    # given blank has been reached.
                    if nextToken == "]":
                        readingBlank = False
                        sentenceComponents.append(Blank(blankValidWords, blankInvalidWords))
                        blankValidWords = []
                        blankInvalidWords = []
                    # "|" separates the valid and invalid words that can
                    # be used to fill in a blank
                    elif nextToken == "|":
                        readingBlankValid = False
                    else:
                        if readingBlankValid:
                            blankValidWords.append(Word(nextToken))
                        else:
                            blankInvalidWords.append(Word(nextToken))
                else:
                    # "[" indicates the start of a blank and the words
                    # that can be used to fill it
                    if nextToken == "[":
                        self.maxPoints += SentenceFactory.POINTS_PER_CORRECT_WORD
                        readingBlank = True
                        readingBlankValid = True
                    else:
                        sentenceComponents.append(Word(nextToken))
            # the last token would be the punctuation
            punctuation = sentenceComponents[len(sentenceComponents) - 1]
            sentenceComponents.remove(punctuation)
            # create and add the sentence to the sentence factory's
            # master list of sentences
            self.sentences.append(
                SentenceData(Sentence(sentenceComponents, punctuation.toString()),
                             self.filename, nextLine, lineNumber))
        else:
            # ignore comments and whitespace
            pass
def buildcorpus(corpus, rootpath, filelimit=0):
    # rootpath = corpus.rootpath
    fileids = os.listdir(rootpath)

    hugewordlist = []
    hugewordlist.extend(corpus.words)  # will contain distinct Word instances

    numoffiles = 0
    corpus.set_corpusname(str(max(filelimit, len(fileids))) + "texts")

    for fileid in fileids:
        allwords = nltk.FreqDist()  # will contain all words in this text
        doc_id = fileid.split(".")[0]
        # corpus.inserttext(doc_id)   ##### ! should pass the Text object itself
        newtext = Text(doc_id)
        path = rootpath + os.sep + fileid
        # lines = readtextlines(path)
        # rawtext = texter.readtxtfile(path)
        rawtext = texter.readnewstext(path)
        lines = texter.splitToSentences(rawtext)
        sntindex = 0
        # each line is a sentence
        for line in lines:
            words = []  # words in this sentence
            words = line.split()
            words = texter.eliminatepunctuation(words)
            words = [word for word in words if not word.isspace()]

            for word in words:
                allwords.inc(word)
                newword = Word(word)
                newword.insertsentenceid(doc_id + "_" + str(sntindex))
                if allwords[word] <= 1:
                    # if this was not added to the huge list before, add it
                    hugewordlist.append(newword)

            sentence = Sentence(sntindex)
            sntindex = sntindex + 1
            # should the sentence store the Word itself or the word index?
            for word in words:
                index = hugewordlist.index(Word(word))
                hugewordlist[index].insertsentenceid(doc_id + "_" + str(sntindex - 1))
                sentence.insertword(index)
            newtext.insertsentence(sentence)

        if (not rawtext.isspace()) or (len(allwords) != 0):
            corpus.inserttext(newtext)
            print str(numoffiles), " : finished handling the words-snts-txts ", doc_id

            numofwords = reduce(lambda x, y: x + y, allwords.values())
            for word in hugewordlist:
                cnt = allwords[word.literal]
                # freq = cnt / float(numofwords)
                word.assigntermfreq(cnt, numofwords, doc_id)
                # hugewordlist[index].toscreen()

        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break
    # end for - docs

    numofdocs = len(fileids)
    print "computing tf*idf"
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()
        # word.toscreen()

    corpus.assignwords(hugewordlist)
    print "corpus length ", str(len(corpus.words)), " words"
    print "huges length ", str(len(hugewordlist)), " words"
    print "exiting buildcorpus()"

    print "pickle-dumping words"
    corpus.pickledumpwords()
def processLine(self, line):
    self.dict = enchant.Dict('en')
    line = line.strip()
    # line = line.lower()
    line = line.replace('"', '')
    line = line.replace(',', '')
    line = line.replace('.', ' . ')
    line = line.replace('!', ' . ')
    line = line.replace('?', ' ? ')
    # line = line.replace("'", '')
    line = line.replace(":", '')
    line = line.replace(";", ' , ')
    line = line.replace(" ", ' ')
    if line == '':
        return None
    processed_tokens = Sentence()
    processed_tokens.append(Datum("<s>"))  # start symbol
    tokens = line.split()
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if token == '':
            i += 1
            continue
        if not token.islower():
            if self.dict.check(token.lower()):
                token = token.lower()
        if token == '<ERR':
            try:
                if tokens[i + 3] == '</ERR>':
                    targ = tokens[i + 1]
                    targ_splits = targ.split('=')
                    correct_token = targ_splits[1][:-1]  # chop off the trailing '>'
                    incorrect_token = tokens[i + 2]
                else:
                    # either targ or error has more than one word
                    end = i + tokens[i:].index('</ERR>') - 1
                    for j in range(i, end):
                        if tokens[j].endswith('>'):
                            break
                    targ = tokens[i + 1]
                    targ_splits = targ.split('=')
                    if i + 1 == j:
                        # one word in targ
                        correct_token = targ_splits[1][:-1]  # chop off the trailing '>'
                    else:
                        # more than one word in targ
                        correct_token = targ_splits[1]
                        if (i + 2) <= (j - 1):
                            for k in range(i + 2, j):
                                correct_token = correct_token + ' ' + tokens[k]
                        correct_token = correct_token + ' ' + tokens[j][:-1]  # chop off the trailing '>'
                    incorrect_token = tokens[j + 1]
                    if (j + 2) <= end:
                        for k in range(j + 2, end + 1):
                            incorrect_token = incorrect_token + ' ' + tokens[k]
            except IndexError:
                print tokens
                # raise RuntimeError
                return None
                # processed_tokens.append(Datum("</s>"))
                # return processed_tokens
            # correct_token_splits = correct_token.split()
            # if len(correct_token_splits) > 2:  # targ with multiple words
            #     # print 'targ with multiple words: "%s"' % targ
            #     for correct_word in correct_token_splits:
            #         processed_tokens.append(Datum(correct_word))
            # elif tokens[i+3] != '</ERR>':
            #     processed_tokens.append(Datum(correct_token))
            # else:
            #     incorrect_token = tokens[i+2]
            #     processed_tokens.append(Datum(correct_token, incorrect_token))
            processed_tokens.append(Datum(correct_token, incorrect_token))
            i += tokens[i:].index('</ERR>') + 1  # update index
        else:
            # regular word
            processed_tokens.append(Datum(token))
            i += 1
    processed_tokens.append(Datum("</s>"))
    return processed_tokens
def main():
    sentence = Sentence()
    filters = FirstWord()
    sentence.loopSentences(filters)  # apply the filter to every sentence
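# A hedged sketch of what a filter object such as FirstWord might look like;
# the method name and the Sentence.loopSentences() calling convention are
# assumptions made for illustration, not taken from the original code.
class FirstWordSketch(object):
    def apply(self, sentence_text):
        # return the first whitespace-separated word, if any
        words = sentence_text.split()
        return words[0] if words else None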
def generate_sentences(weight, iteration_count, noun_l, verb_l, adj_l, conj_l, p_pones_l):
    sentences = []
    strings = []
    for i in range(0, iteration_count):
        if weight < 151:
            str = ""
            limit = weight
            word = []
            s = Sentence(0, 0, word)
            x = weight / 5
            y = weight / 1.5
            noun = choose_word_in_range(x, y, noun_l)
            limit -= noun.weight
            verb = choose_word_in_range(x, y, verb_l)
            limit -= verb.weight
            conj = choose_word_in_range(limit - 1, limit + 1, conj_l)
            s.words.append(noun)
            s.words.append(conj)
            s.words.append(verb)
            # to check whether we already added it
            str = noun.name
            str += conj.name
            str += verb.name
            if s.lentgth_of_sentence() == weight:
                matching = [s for s in strings if str in s]
                if matching:
                    continue
                sentences.append(s)
                strings.append(str)
        if 150 < weight < 251:
            limit = weight
            word = []
            s = Sentence(0, 0, word)
            x = weight / 5
            y = weight / 2
            noun = choose_word_in_range(x, y, noun_l)
            limit -= noun.weight
            verb = choose_word_in_range(x, y, verb_l)
            limit -= verb.weight
            s.words.append(noun)
            while True:
                if limit >= 80:
                    x = choose_word_in_range(30, 50, adj_l)
                    limit -= x.weight
                    s.words.append(x)
                else:
                    break
            if limit > 10:
                p_pone = choose_word_in_range(limit - 1, limit + 1, conj_l)
                s.words.append(p_pone)
            s.words.append(verb)
            if s.lentgth_of_sentence() == weight:
                sentences.append(s)
        if 250 < weight:
            limit = weight
            word = []
            s = Sentence(0, 0, word)
            x = weight / 25
            y = weight / 5
            noun = choose_word_in_range(x, y, noun_l)
            limit -= noun.weight
            verb = choose_word_in_range(x, y, verb_l)
            limit -= verb.weight
            s.words.append(noun)
            while True:
                if limit >= 80:
                    x = choose_word_in_range(30, 50, noun_l)
                    limit -= x.weight
                    s.words.append(x)
                else:
                    break
            if limit > 10:
                conj = choose_word_in_range(limit - 1, limit + 1, noun_l)
                s.words.append(conj)
            s.words.append(verb)
            if s.lentgth_of_sentence() == weight:
                sentences.append(s)
    return sentences