Example #1
   def search_sentences_c(self, max_words=None):
      if max_words:
         self._MAX_WORDS = int(max_words)
      else:
         self._MAX_WORDS = None

      start_node, end_node = self.get_start_and_end_node()

      import gsflc
      self.links.sort(cmp=Link.cmp_id)
      links_c = []
      for link in self.links:
         links_c.append([self.nodes.index(link.s), self.nodes.index(link.e)])

      sentences = gsflc.search(self.nodes.index(start_node),
                               self.nodes.index(end_node),
                               len(self.nodes),
                               links_c,
                               self._MAX_WORDS)

      self.sentences = []

      for sentence in sentences:
         new_sentence = Sentence()
         for link_index in sentence:
            new_sentence.add(self.links[link_index])
         self.sentences.insert(0, new_sentence)

      self.sentences.sort(cmp=Sentence.cmp_score, reverse=True)

      self.sentences_ready = self.sentences
      return self.sentences
	def on_message(self, message):
		# NOTE: eval() on an untrusted client message is unsafe; json.loads() would be
		# preferable if the payload is JSON. The message is evaluated once and reused.
		data = eval(message)

		if 'request' in data['type']:
			SocketHandler.send_to_all({
				'type': 'request',
				'message': data
				})
			return

		SocketHandler.send_to_other(self, {
			'type': 'user',
			'id': id(self),
			'message': data,
			})

		SocketHandler.send_to_self(self, {
			'type': 'self',
			'id': id(self),
			'message': data,
			})

		sentence = Sentence(message)
		termlist = sentence.getTerms()
		if len(termlist) != 0:
			newmessage = {
				"termlist": termlist,
				"parent": data['parent']
			}
			SocketHandler.send_to_all({
				'type': 'term',
				'id': id(self),
				'message': newmessage,
				})
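From the keys accessed above, the handler assumes the incoming message evaluates to a dict carrying at least a 'type' field and, when terms are extracted, a 'parent' field, e.g. {'type': 'chat', 'parent': 3} (illustrative values only, not taken from the source).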
Example #3
 def fromJSON(self, file):
     with open(file, 'r') as data:
         _json = json.load(data)
     self.sentences = list()
     for sentence in _json:
         _sentence = Sentence(None, None)
         _sentence = _sentence.fromJSON(_json[sentence])
         self.sentences.append(_sentence)
     return self
 def generateTestCasesAllErr(self):
     """Returns a list of sentences with all error"""
     testCases = [] # list of Sentences
     for sentence in self.corpus:
         cleanSentence = sentence.cleanSentence()
         testSentence = Sentence(cleanSentence)
         for i in range(0, len(sentence)):
             datum_i = sentence.get(i)
             if datum_i.hasError():
                 testSentence.put(i, datum_i)
         testCases.append(testSentence)
     return testCases    
Example #5
 def generateTestCases(self):  
   """Returns a list of sentences with exactly 1 elligible spelling error"""
   testCases = [] # list of Sentences
   for sentence in self.corpus:
     cleanSentence = sentence.cleanSentence()
     for i in range(0, len(sentence)):
       datum_i = sentence.get(i)
       if datum_i.hasError() and datum_i.isValidTest():
         testSentence = Sentence(cleanSentence)
         testSentence.put(i, datum_i)
         testCases.append(testSentence)
   return testCases
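Note the contrast with generateTestCasesAllErr above: here a fresh test sentence is built inside the inner loop, so a corpus sentence with several eligible errors yields one test case per error, each carrying exactly one injected mistake.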
	def __init__(self, filepath):
		self.filepath = filepath
		# all sents in corpus
		self.sents = []
		sentence_number = 0 
		sentence = Sentence(sentence_number)
		self.instances = {}

		with open(filepath) as f:
			for num, line in enumerate(f):
				# check if we have a valid token line
				if line[0].isalpha():
					token_line = line.split()
					# create new Token object
					token = Token(token_line[3], token_line[4], token_line[5], token_line[-1].strip(")").strip("("), num) 
					sentence.add_token(token)
				# check if line is empty - we reached end of current sentence
				elif not line.strip(): 
					self.sents.append(sentence)
					sentence_number += 1
					sentence = Sentence(sentence_number)
Example #7
	def __init__(self, goldPath, predictedPath=None):
		self.goldPath = goldPath
		self.predictedPath = predictedPath
		self.sents = [] # all sents in corpus
		self.sent_stats = {}
		self.numTokens = 0 # count total tokens in corpus
		self.tags = set()
		self.tokens = []
		sent = Sentence()



		if predictedPath:
			with open(goldPath) as gf, open(predictedPath) as pf:
				for gline,pline in izip(gf, pf): # open two files simultaneously
					if gline.strip() and pline.strip(): # check if lines not empty
						gtoken_tag = re.split(r'\t', gline)
						ptoken_tag = re.split(r'\t', pline)
						if gtoken_tag[0] == ptoken_tag[0]:
							token = Token(gtoken_tag[0], gtoken_tag[1].strip(), ptoken_tag[1].strip()) # create new Token object
							sent.addToken(token)
							self.numTokens += 1 
						else:
							raise Exception("Files not in sync")
					else:
						self.sents.append(sent)
						sent = Sentence()
		else:
			# store all sentences from corpus
			sentences = []
			# store a sentence that consists of tokens
			sentence = []
			with open(goldPath) as gf:
				for line in gf: 
					# check if lines not empty
					if line.strip(): 
						# split line into token and tag as list elements
						token_tag = re.split(r'\t', line)
						# add a token object into sentence
						sentence.append(Token(token_tag[0].strip(), token_tag[1].strip()))
						# count total number of tokens
						self.numTokens += 1 
					else:
						# we have reached end of sentence (empty line)
						sentences.append(sentence)
						sentence = []

			prev = "prevnotekzist"
			following = "folnotekzist"
			for j, sentence in enumerate(sentences):
				for i, token in enumerate(sentence):
					# make sure we don't go beyond sentence length
					if i+1 < len(sentence):
						following = sentence[i+1]
					# if we reached end of current sentence - take following word as first word of next sentence
					elif j+1 < len(sentences):
						following = sentences[j+1][0]
					token.setPrev(prev)
					token.setFollowing(following)
					token.getNeighborFeatures()
					# print (vars(token))
					prev = token
					sent.addToken(token)
				self.sents.append(sent)
				sent = Sentence()
Example #8
	def parse_mtree(self):
		if self.lang!='en': raise Exception("MetricalTree parsing only works currently for English text.")

		import metricaltree as mtree
		mtree.set_paths(self.dir_mtree)

		wordtoks = self.wordtokens()
		toks = [wtok.token for wtok in wordtoks]

		pauses = mtree.pause_splitter_tokens(toks)

		#sents = [sent for pause in pauses for sent in pause]
		sents=[]
		for pause in pauses:
			sents.extend(mtree.split_sentences_from_tokens(pause))
		parser = mtree.return_parser(self.dir_mtree)
		trees = list(parser.lex_parse_sents(sents, verbose=False))
		stats = parser.get_stats(trees,arto=True,format_pandas=False)
		assert len(stats)==len(wordtoks)

		sents = []
		sent = []
		sent_id=None
		for wTok,wStat in zip(wordtoks,stats):
			if sent_id!=wStat['sidx']:
				sent_id=wStat['sidx']
				if sent: sents+=[sent]
				sent=[]

			sent+=[wTok]
			#for k,v in wStat.items():
			#	setattr(wTok,k,v)
			if not hasattr(wTok,'feats'): wTok.feats={}
			for k,v in list(wStat.items()):
				if k in mtree.INFO_DO_NOT_STORE: continue
				wTok.feats[k]=v

		if sent: sents+=[sent]
		assert len(sents) == len(trees)

		from Sentence import Sentence
		for sent,tree in zip(sents,trees):
			sentobj = Sentence(sent, tree)
			self._sentences+=[sentobj]

		# create a normalized stress per line
		import numpy as np
		for line in self.lines():
			wtoks = line.children

			# norm mean
			stresses = [wtok.feats['norm_mean'] for wtok in wtoks if not np.isnan(wtok.feats['norm_mean'])]
			max_stress = float(max(stresses))
			min_stress = float(min(stresses))
			for wtok in wtoks:
				# guard against a zero stress range (all stresses equal), which would otherwise divide by zero
				wtok.feats['norm_mean_line'] = (wtok.feats['norm_mean']-min_stress)/(max_stress-min_stress) if max_stress != min_stress else np.nan

			# mean
			stresses = [wtok.feats['mean'] for wtok in wtoks if not np.isnan(wtok.feats['mean'])]
			min_stress = float(min(stresses))
			diff = 1.0 - min_stress
			for wtok in wtoks:
				wtok.feats['mean_line']=wtok.feats['mean'] + diff
Example #9
 def convert_to_obj(self):
     for sec, block in self.document.items():
         for key in block.keys():
             block[key] = Sentence(block[key])
Example #10
                if (t[1] in want) and (t[0]
                                       not in stopword) and (len(t[0]) > 1):
                    token.append(t[0])

            pos.extend(token)

        return pos

    def save(self, coll, oid):
        coll.update({"_id": oid}, {"$set": {
            "keyword": self.get_result()
        }},
                    upsert=True)


if __name__ == "__main__":

    from Sentence import Sentence

    import csv
    talk = []
    with open('script3.csv', 'r', encoding='utf-8') as f:
        rdr = csv.reader(f)
        for line in rdr:
            talk.append(list(line))

    sentences = [Sentence(i, t) for i, t in enumerate(talk)]
    keyword = Tf_Idsf(sentences)
    print(keyword.get_result())
Example #11
 def processLine(self, line):
     '''
     Reads a line containing misspelled words marked up like the following:
     <ERR targ=That's> Thats </ERR> what <ERR targ=James> Jame </ERR>
     Returns a Sentence containing a list of Datum(correct word, incorrect word) entries
     '''
     processed_tokens = Sentence()
     processed_tokens.append(Datum("<s>"))  # start symbol
     tokens = line.split()
     i = 0
     while i < len(tokens):
         token = tokens[i]
         # find out misspell word place
         if token == "<err":
             targ = tokens[i + 1]
             targ_splits = targ.split("=")
             correct_token = targ_splits[1][:-1]  # chop off the trailing '>'
             correct_token_splits = correct_token.split()
             if len(correct_token_splits) > 2:  # targ with multiple words
                 print 'targ with multiple words: "%s"' % targ
                 for correct_word in correct_token_splits:
                     processed_tokens.append(Datum(correct_word))
             # if miss one word in between, so no incorrect_token in this case
             elif tokens[i + 3] != '</err>':
                 processed_tokens.append(Datum(correct_token))
             else:
                 incorrect_token = tokens[i + 2]
                 processed_tokens.append(
                     Datum(correct_token, incorrect_token))
             # move index to one after </ERR> symbol
             i += tokens[i:].index('</err>') + 1
         else:
             # No mis-spell, correct sentence
             processed_tokens.append(Datum(token))
             i += 1
     processed_tokens.append(Datum("</s>"))  # stop symbol
     return processed_tokens
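For reference, a minimal standalone sketch of the same <ERR targ=...> parsing idea using plain tuples instead of the Sentence/Datum classes; parse_err_markup is a hypothetical helper invented for illustration and assumes well-formed, whitespace-tokenized markup:

def parse_err_markup(line):
    """Collect (correct, observed) pairs from '<ERR targ=X> Y </ERR>' markup."""
    pairs = []
    tokens = line.split()
    i = 0
    while i < len(tokens):
        if tokens[i].lower() == "<err":
            correct = tokens[i + 1].split("=", 1)[1].rstrip(">")
            close = next(j for j in range(i, len(tokens)) if tokens[j].lower() == "</err>")
            # the token just before the closing tag is the observed (misspelled) form;
            # if the closing tag follows immediately, no observed form was recorded
            observed = tokens[close - 1] if close - i > 2 else None
            pairs.append((correct, observed))
            i = close + 1
        else:
            i += 1
    return pairs

print(parse_err_markup("<ERR targ=That's> Thats </ERR> what <ERR targ=James> Jame </ERR>"))
# [("That's", 'Thats'), ('James', 'Jame')]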
Example #12
        testSet = SentenceReader(options.testPath, True)
        tagger.decodeParalle(testSet,
                             outpath=options.outputPath,
                             numThreads=options.numThreads,
                             numPerTheads=options.numPerTheads)
        testSet.close()
    elif options.mode == "display":
        print("Loading model ... ")
        with open(options.modelPath, 'r') as file_in:
            tagger = cPickle.load(file_in)
        print("Done")
        while True:
            raw_sent = raw_input("请输入待分词的字符串:")  # prompt: "enter the string to be segmented:"
            if len(raw_sent) == 0:
                break
            sent = Sentence(raw_sent.decode('utf-8'))
            states = tagger.decodeBeamSearch(sent, "test")
            print states[1].getFinalResult().encode('utf-8')

    elif options.mode == "eval":
        print("Loading model ... ")
        with open(options.modelPath, 'r') as file_in:
            tagger = cPickle.load(file_in)
        print("Done")

        tagger.quiet = False
        testSet = SentenceReader(options.testPath)
        print("Evaluating ...")
        accuracy = tagger.evaluate(testSet,
                                   numThreads=options.numThreads,
                                   numPerTheads=options.numPerTheads)
Example #13
def mapMaker(parsableParas):

    G = nx.Graph()

    nodeLabels = {}
    edgeLabels = {}

    for parsablePara in parsableParas:
        parsableWords = parsablePara
        sentences = getSentences(parsableWords)

        for sentenceTokens in sentences:  # for each list of set of words in the sentence that forms a POS

            sentence = Sentence(
                sentenceTokens)  # make a sentence object out of those words

            subject = sentence.structure()
            subject.title = subject.title.rstrip(".\n")

            # find the node in the graph which has, as its title, the subject of the current sentence
            try:
                s = findNode(G, subject.title)

            # if such a node doesn't exist, add it to the graph and add an entry for it in the graph labels
            except NoSuchNodeException:
                G.add_node(subject.title)
                nodeLabels[subject.title] = [subject.title]
                s = findNode(G, subject.title)

            # add the appropriate labels for this node
            finally:
                #				print subject	## DEBUG ##
                if subject.getLabel() not in nodeLabels[subject.title]:
                    nodeLabels[subject.title].append(subject.getLabel())

            for verb in subject.paths.keys():

                dest = subject.paths[verb]
                dest.title = dest.title.rstrip('.\n')

                # find the node in the graph which has, as its title, the object of the current sentence
                try:
                    d = findNode(G, dest.title)

                # if such a node doesn't exist, add it to the graph and add an entry for it in the graph labels
                except NoSuchNodeException:
                    G.add_node(dest.title)
                    nodeLabels[dest.title] = [dest.title]

                # add the appropriate labels for this node
                finally:
                    #					print dest	## DEBUG ##
                    d = findNode(G, dest.title)
                    if dest.getLabel() not in nodeLabels[dest.title]:
                        nodeLabels[dest.title].append(dest.getLabel())

                # add the edge label in the graph for this verb
                edgeLabels[verb.getIdentifier()] = verb.getLabel()

                # add an edge between the subject and the object
                G.add_edge(s, d)

    # nltk requires that the node labels be strings and not lists.
    # this for loop takes the nodeLabels dict and turns the list of descriptors into a newline-separated string of descriptors
    # the next loop does the same for edge labels
    for node in nodeLabels.keys():
        nodeLabels[node] = ''.join(
            ['\n' + label for label in nodeLabels[node]])
        nodeLabels[node] = nodeLabels[node].strip("\n")

    for e in edgeLabels.keys():
        edgeLabels[e] = edgeLabels[e].rstrip('\n')

    # draw the graph to the following specs:
    # use the generated node labels: nouns and attributes
    # the size of the node needs to be 1000 * (number of '\n' +1). The reason for the +1 is to accommodate single-line labels
    # node_shape='s' describes that each node should be a square
    # spectral layout is the one layout that makes the most sense to display the graph. Many layouts were tried in a trial-and-error fashion
    nx.draw(G,
            labels=nodeLabels,
            node_size=[(nodeLabels[node].count('\n') + 1) * 1000
                       for node in G.nodes()],
            node_shape='s',
            pos=nx.spectral_layout(G))
    pos = nx.layout.spectral_layout(G)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edgeLabels)
    #	nx.write_dot(G, '/home/ashwin/Desktop/sample')
    plt.show()
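The drawing comments above boil down to one square node per subject/object, sized by the number of label lines, with edge labels drawn from a separate dict. A minimal self-contained sketch of that scheme on a toy graph (assumes networkx and matplotlib are installed; the node names, labels and verbs are invented for illustration, and the edge labels are keyed by (u, v) pairs, which is what draw_networkx_edge_labels expects):

import networkx as nx
import matplotlib.pyplot as plt

G = nx.Graph()
G.add_edge('cat', 'mat')
G.add_edge('cat', 'dog')

nodeLabels = {'cat': 'cat\nfurry\nsmall', 'mat': 'mat', 'dog': 'dog\nloud'}
edgeLabels = {('cat', 'mat'): 'sat on', ('cat', 'dog'): 'chased'}

pos = nx.spectral_layout(G)
nx.draw(G,
        labels=nodeLabels,
        node_size=[(nodeLabels[n].count('\n') + 1) * 1000 for n in G.nodes()],
        node_shape='s',
        pos=pos)
nx.draw_networkx_edge_labels(G, pos, edge_labels=edgeLabels)
plt.show()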
Example #14
def main():
    # clean up the logs on startup
    Log.clean_log(startup=True)

    def signal_handler(signal, frame):
        os._exit(0)

    if config["Other"]["INFO_MESSAGE"] != "False":
        Log.info("BiliBiliHelper Python " + version)
        Log.info("Powered By TheWanderingCoel with love❤️")

    if config["Other"]["SENTENCE"] != "False":
        Log.info(Sentence().get_sentence())

    # check the config
    ConfigCheck()

    # register the signal handler
    signal.signal(signal.SIGINT, signal_handler)

    loop = asyncio.get_event_loop()

    timer = Timer(loop)
    console = Console.Console(loop)

    area_ids = [
        1,
        2,
        3,
        4,
        5,
        6,
    ]
    Statistics(len(area_ids))

    daily_tasks = [
        Capsule.work(),
        CaseJudger.work(),
        Coin2Silver.work(),
        DailyBag.work(),
        GiftSend.work(),
        Group.work(),
        Heart.work(),
        Silver2Coin.work(),
        SilverBox.work(),
        Task.work()
    ]
    server_tasks = [MonitorServer.run_forever()]
    danmu_tasks = [Danmu_Monitor.run_Danmu_Raffle_Handler(i) for i in area_ids]
    other_tasks = [rafflehandler.run()]

    api_thread = threading.Thread(target=API.work)
    api_thread.start()

    if not options.disable_console:
        console_thread = threading.Thread(target=console.cmdloop)
        console_thread.start()

    # log in once first, so the raffle module does not fail by starting too quickly
    Auth.work()

    if config["Function"]["RAFFLE_HANDLER"] != "False":
        loop.run_until_complete(
            asyncio.wait(daily_tasks + server_tasks + danmu_tasks +
                         other_tasks))
    else:
        loop.run_until_complete(asyncio.wait(daily_tasks))

    api_thread.join()

    if not options.disable_console:
        console_thread.join()

    loop.close()
Example #15
def answer_where(question, inst):
    best_sen = inst.ranked_list(question)[0]
    info = Sentence(best_sen,0)
    answer = find_tag_answer(question,info, ['LOCATION', 'ORGANIZATION'])
    return best_sen
Example #16
def answer_what(question,inst):
    pattern = Sentence(question.wh_pattern(),0)
    best_sen = inst.ranked_list(pattern)[0]
    info = Sentence(best_sen,0)
    return best_sen
sentenceSet = " Sample"
sentenceFilename = "sentences/ Sample"


def resetSentenceFactory():
    global factory
    factory = SentenceFactory(sentenceFilename)

    #check that the data file is formatted correctly
    factory.validate()


testSentence = None

endWord = Word("Congratulations!!")
endSentence = Sentence([endWord], "!")

lostWord = Word("You are out of lives!!")
lostSentence = Sentence([lostWord], "!")


def createFish(word):
    global fishes
    global sprites
    newFish = Fish(word, fishSpeed)
    newFish.moveTo(random.randrange(-500, 0), random.randrange(200, 420))
    sprites.append(newFish)
    fishes.append(newFish)
    changeSpeed()

Example #18
from DialogueManagement import DialogueManagement
from Sentence import Sentence

DM1 = DialogueManagement()
sentence = input()
sentence = Sentence(sentence)
DM1.diliverSlots(sentence)
DM1.diliverIntent(sentence)
DM1.chooseDM(sentence.intent)

# print("singer:",DM1.singer)
# print("song:",DM1.song)
# print("intent:",DM1.intent)
DM1.DM()
Example #19
 def getCentroid(self, a):
     selectedsentnum = 0
     selectedtranslation = ""
     total_num_outputs = len(a)
     sentences = [Sentence() for __idx0 in range(total_num_outputs)]
     self.reputation = [float() for __idx0 in range(total_num_outputs)]
     ## for-while
     i = 0
     while i < total_num_outputs:
         sentences[i] = Sentence()
         self.reputation[i] = 0.0
         i += 1
     ## for-while
     i = 0
     while i < total_num_outputs:
         sentences[i].number = i
         sentences[i].text = a[i]
         sentences[i].tokencount = SentenceTokenizer.tokencount(sentences[i].text)
         sentences[i].tokens = SentenceTokenizer.sentencetokenizer(sentences[i].text)
         sentences[i].setDistance([float() for __idx0 in range(total_num_outputs)])
         sentences[i].setSimilarity([float() for __idx0 in range(total_num_outputs)])
         i += 1
         
     if len(sentences) <= 1:#//if only 1 or 2 candidate translations available
         # NOTE: the Java-style random()/nextInt() call is not valid Python;
         # random.randrange gives the equivalent choice (assumes `import random` at module level)
         randomchoice = random.randrange(len(sentences))
         selectedtranslation = sentences[randomchoice].text
     else:#//if 3 or more candidate translations are available
         total_combinations = (total_num_outputs * (total_num_outputs - 1)) / 2
         #print ("total combo",total_combinations)
         sentencemapcount = 0
         filtercriteria = 0.4
         sentencemaps = [SentenceMap(0,0,0,0,0,0) for __idx0 in range(total_combinations)]
         ## Two for loop to list nC2 combinations
         i = 0
         while i < (len(sentences) - 1):
             ## for-while
             j = i + 1
             while j <= (len(sentences) - 1):
                 sentencemaps[sentencemapcount] = SentenceMap(i, j, sentences[i].tokencount, sentences[j].tokencount, len(sentences[i].text) - sentences[i].tokencount + 1, len(sentences[j].text) - sentences[i].tokencount + 1)
                 a = []
                 for x in range(sentences[i].tokencount):
                     a.append([])
                     for y in range(sentences[j].tokencount):
                         a[x].append(float())
                 self.mapscorematrix = a
                 
                 a = []
                 for x in range(sentences[i].tokencount):
                     a.append([])
                     for y in range(sentences[j].tokencount):
                         a[x].append(float())
                 self.mapmatrix = a
                 
                 a = []
                 for x in range(sentences[i].tokencount):
                     a.append([])
                     for y in range(sentences[j].tokencount):
                         a[x].append(float())
                 self.mapweightmatrix = a
                 ## initializing the matrices 
                 slength1 = 0
                 while slength1 < (sentences[i].tokencount):
                     slength2 = 0
                     while slength2 < (sentences[j].tokencount):
                         self.mapscorematrix[slength1][slength2] = 0
                         self.mapmatrix[slength1][slength2] = 0
                         self.mapweightmatrix[slength1][slength2] = 0
                         slength2 += 1
                     slength1 += 1
                 """
                 print "matrices"
                 print self.mapscorematrix
                 print self.mapmatrix
                 print self.mapweightmatrix
                 """
                 ## set scores for the bipartite graph
                 slength1 = 0
                 while slength1 < sentences[i].tokencount:
                     slength2 = 0
                     while slength2 < sentences[j].tokencount:
                         #print "args"
                         #print (sentences[i].tokens[slength1], sentences[j].tokens[slength2])
                         self.mapscorematrix[slength1][slength2] = WordComparer.comparescores(sentences[i].tokens[slength1], sentences[j].tokens[slength2])
                         #print "self.mapscorematrix[slength1][slength2]" + str(WordComparer.comparescores(sentences[i].tokens[slength1], sentences[j].tokens[slength2]))
                         slength2 += 1
                     slength1 += 1
                 #print ("mapscorematrix",self.mapscorematrix)
                 if sentences[i].tokencount <= sentences[j].tokencount:
                     debaredlist = []
                     ## for-while
                     slength1 = 0
                     while slength1 < sentences[i].tokencount:
                         index2 = -1
                         similarity = filtercriteria
                         ## for-while
                         slength2 = 0
                         while slength2 < sentences[j].tokencount:
                             if ((similarity < self.mapscorematrix[slength1][slength2]) and not (self.iselementpresent(debaredlist, slength2))):
                                 similarity = self.mapscorematrix[slength1][slength2]
                                 index2 = slength2
                             slength2 += 1
                         if (index2 != -1):
                             if len(sentences[i].tokens[slength1]) > len(sentences[j].tokens[index2]):
                                 self.mapweightmatrix[slength1][index2] = len(sentences[j].tokens[index2])
                             else:
                                 self.mapweightmatrix[slength1][index2] = len(sentences[i].tokens[slength1])
                             self.mapmatrix[slength1][index2] = similarity
                             debaredlist.append(index2)
                         slength1 += 1
                 else:
                     debaredlist = []
                     ## for-while
                     slength2 = 0
                     while slength2 < sentences[j].tokencount:
                         index1 = -1
                         similarity = filtercriteria
                         ## for-while
                         slength1 = 0
                         while slength1 < sentences[i].tokencount:
                             if ((similarity < self.mapscorematrix[slength1][slength2]) and not (self.iselementpresent(debaredlist, slength1))):
                                 similarity = self.mapscorematrix[slength1][slength2]
                                 index1 = slength1
                             slength1 += 1
                         if (index1 != -1):
                             if len(sentences[i].tokens[index1]) > len(sentences[j].tokens[slength2]):
                                 self.mapweightmatrix[index1][slength2] = len(sentences[j].tokens[slength2])
                             else:
                                 self.mapweightmatrix[index1][slength2] = len(sentences[i].tokens[index1])
                             self.mapmatrix[index1][slength2] = similarity
                             #print "self.mapmatrix[index1][slength2] : " + str(self.mapmatrix[index1][slength2])correct
                             debaredlist.append(index1)
                         slength2 += 1
                 #print self.mapmatrix
                 sentencemaps[sentencemapcount].setscorematrix(self.mapscorematrix)
                 sentencemaps[sentencemapcount].setmapmatrix(self.mapmatrix)
                 sentencemaps[sentencemapcount].setweightmatrix(self.mapweightmatrix)
                 distance = sentencemaps[sentencemapcount].setsentencesimilarityscore()
                 similarity = sentencemaps[sentencemapcount].sentencesimilarityscore
                 #print (distance,similarity)
                 sentences[i].setIthDistance(j, distance)
                 sentences[j].setIthDistance(i, distance)
                 sentences[i].setIthSimilarity(j, similarity)
                 sentences[j].setIthSimilarity(i, similarity)
                 sentencemapcount += 1
                 j += 1
             i += 1
         ## for-while
         i = 0
         while i < total_num_outputs:
             totaldistance = 0
             totalsimilarity = 0
             ## for-while
             j = 0
             while j < total_num_outputs:
                 totalsimilarity = totalsimilarity + sentences[i].similarity[j]
                 totaldistance = totaldistance + sentences[i].distance[j]
                 #print (totalsimilarity,totaldistance)
                 j += 1
             sentences[i].averagesimilarity = totalsimilarity / (total_num_outputs - 1)
             self.reputation[i] = sentences[i].averagesimilarity
             #print "rep" + str(self.reputation[i])
             sentences[i].averagedistance = totaldistance / (total_num_outputs - 1)
             i += 1
             
         newdistance = 0
         newoptdistance = 1000000000
         ## for-while
         z = 0
         while z < len(sentences):
             ## for-while
             y = 0
             while y < len(sentences):
                 #print sentences[z].distance[y]
                 newdistance = newdistance + sentences[z].distance[y]
                 y += 1
                 
             if newdistance < newoptdistance:
                 selectedsentnum = z
                 selectedtranslation = sentences[z].text
                 newoptdistance = newdistance
             newdistance = 0
             z += 1
     
     self.normalizer = 10*(sentences[selectedsentnum].tokencount)
     #print "normalizer=" + str(self.normalizer)
     return selectedtranslation
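The selection loop at the end reduces to picking the sentence whose total distance to all the others is smallest; a compact sketch of just that step on hypothetical numbers, independent of the Sentence and SentenceMap classes:

distances = [
    [0.0, 0.2, 0.5],
    [0.2, 0.0, 0.3],
    [0.5, 0.3, 0.0],
]
centroid = min(range(len(distances)), key=lambda z: sum(distances[z]))
print(centroid)  # 1 -> the second sentence has the smallest total distance (0.5)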
Example #20
from ModelCheck import ModelChecking
from PL_Resolution import PL_Resolution, resolutionTest

# All three people symbols, global variables
Amy = Arguments("Amy")
Bob = Arguments("Bob")
Cal = Arguments("Cal")
names = ["Amy", "Bob", "Cal"]
# literals list to use,global variables
literals = [Amy, Bob, Cal]

# Part Model Check class, passing in all basic symbols
check = ModelChecking(literals)

# All Three atomic Propositional sentences
Sentence_Amy = Sentence(unit, [Amy])
Sentence_Bob = Sentence(unit, [Bob])
Sentence_Cal = Sentence(unit, [Cal])


def modelCheckKBParta():
    """
    :return: The Knowledge base sentences of modelchecking of part a
    """
    sentence_2 = Sentence(conjunction, [Cal, Amy])
    KB1 = Sentence(equals, [Sentence_Amy, sentence_2], sentenceBase=True)

    sentence_3 = Sentence(unit, [Bob])
    sentence_4 = Sentence(negative, [Cal])
    KB2 = Sentence(equals, [Sentence_Bob, sentence_4], sentenceBase=True)
Example #21
 def processLine(self, line):
     line = line.strip()
     line = line.lower()
     line = line.replace('"', '')
     line = line.replace(',', '')
     line = line.replace('.', '')
     line = line.replace('!', '')
     line = line.replace("'", '')
     line = line.replace(":", '')
     line = line.replace(";", '')
     if line == '':
         return None
     processed_tokens = Sentence()
     processed_tokens.append(Datum("<s>"))
     tokens = line.split()
     i = 0
     while i < len(tokens):
         token = tokens[i]
         if token == '<err':
             targ = tokens[i + 1]
             targ_splits = targ.split('=')
             correct_token = targ_splits[1][:-1]
             correct_token_splits = correct_token.split()
             if len(correct_token_splits) > 2:
                 for correct_word in correct_token_splits:
                     processed_tokens.append(Datum(correct_word))
             elif tokens[i + 3] != '</err>':
                 processed_tokens.append(Datum(correct_token))
             else:
                 incorrect_token = tokens[i + 2]
                 processed_tokens.append(
                     Datum(correct_token, incorrect_token))
             i += tokens[i:].index('</err>') + 1
         else:  # regular word
             processed_tokens.append(Datum(token))
             i += 1
     processed_tokens.append(Datum("</s>"))
     return processed_tokens
Example #22
def modelchecking():
    # construct sentence
    # part a goal sentence
    sentence_1 = Sentence(unit, [mythical])
    sentence_2 = Sentence(unit, [immortal])
    KB1 = Sentence(implies, [sentence_1, sentence_2], sentenceBase=True)

    sentence_3 = Sentence(negative, [mythical])
    sentence_3_2 = Sentence(negative, [immortal])
    sentence_3_3 = Sentence(unit, [mammal])
    sentence_3_4 = Sentence(conjunction, [sentence_3_2, sentence_3_3], True)
    KB2 = Sentence(implies, [sentence_3, sentence_3_4], True)

    # part b goal sentence
    sentence_5 = Sentence(disjunction, [immortal, mammal])
    sentence_6 = Sentence(unit, [horned])
    KB3 = Sentence(implies, [sentence_5, sentence_6], sentenceBase=True)
    # part b goal sentence
    sentence_7 = Sentence(unit, [magical])
    KB4 = Sentence(implies, [sentence_6, sentence_7], sentenceBase=True)

    KB = [KB1, KB2, KB3, KB4]
    check = ModelChecking(literals)

    # # part a: to prove mythical, sentence1
    # print(check.check(KB, sentence_1))
    #
    # # part b: to prove magical, sentence7
    # print(check.check(KB, sentence_7))
    #
    # # part c:to prove horned,sentence6
    # print(check.check(KB, sentence_6))

    check.check(KB)
Example #23
print("""\033[32;1m
 ______     __     __         __     ______     __     __         __     __  __     ______     __         ______   ______     ______    
/\  == \   /\ \   /\ \       /\ \   /\  == \   /\ \   /\ \       /\ \   /\ \_\ \   /\  ___\   /\ \       /\  == \ /\  ___\   /\  == \   
\ \  __<   \ \ \  \ \ \____  \ \ \  \ \  __<   \ \ \  \ \ \____  \ \ \  \ \  __ \  \ \  __\   \ \ \____  \ \  _-/ \ \  __\   \ \  __<   
 \ \_____\  \ \_\  \ \_____\  \ \_\  \ \_____\  \ \_\  \ \_____\  \ \_\  \ \_\ \_\  \ \_____\  \ \_____\  \ \_\    \ \_____\  \ \_\ \_\ 
  \/_____/   \/_/   \/_____/   \/_/   \/_____/   \/_/   \/_____/   \/_/   \/_/\/_/   \/_____/   \/_____/   \/_/     \/_____/   \/_/ /_/ 
\033[0m""")

if config["Other"]["INFO_MESSAGE"] != "False":
    Log.info("BiliBiliHelper Python " + version)

Log.info("Powered By TheWanderingCoel, kotoriのねこ and 洛水.山岭居室️")

if config["Other"]["SENTENCE"] != "False":
    Log.info(Sentence().get_sentence())

# check the config
ConfigCheck()

# register the signal handler
signal.signal(signal.SIGINT, signal_handler)

loop = asyncio.get_event_loop()

timer = Timer(loop)
console = Console.Console(loop)

area_ids = [1,2,3,4,5,6,]
Statistics(len(area_ids))
Example #24
len(conf_train['fp'])
from random import sample
examples = sample(fp, 1)
example = examples[0]
print(example[0])
print(example[1].text)
print('# Truth:')
for e in example[1].entities:
    print(e)
print('# Pred:')
for e in example[2].entities:
    print(e, e.pos)

f.dictLookuper.word('Octanal')
from Sentence import Sentence
sample_sentence = Sentence('A new and efficient synthesis of a naturally occurring amide alkaloid, N-isobutyl-4,5-epoxy-2(E)-decenamide isolated from the roots of Piper nigrum has been described involving a total of nine steps. Octanal and 2-bromoacetic acid have been used as the starting materials.')
sample_sentence.pos(s)
f.bl_sifter = True
f.sifter.bl_list_pos_star = False
f.bl_refiner = True

f.filt([sample_sentence])


for e in sample_sentence.entities:
    print(e)

# def smash(sentences):
#     for sentence in sentences:
#         _smash(sentence, )
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results,
                           rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    distance = MAX_TOKENS_AWAY
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                if count % 50 == 0:
                    print multiprocessing.current_process(), "In Queue", queue.qsize(), "Total Matched: ", len(results)

                #TODO: add a cache

                t1 = query.Term(
                    'sentence',
                    "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">")
                t3 = query.Term(
                    'sentence',
                    "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">")

                # Entities proximity query without relational words
                q1 = spans.SpanNear2([t1, t3],
                                     slop=distance,
                                     ordered=True,
                                     mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational words
                # From the results above count how many contain a relational word
                hits_with_r = 0
                total_hits = 0

                for s in hits:
                    sentence = s.get("sentence")
                    s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY,
                                 MIN_TOKENS_AWAY, CONTEXT_WINDOW, stopwords)
                    for s_r in s.relationships:
                        if r.e1.decode("utf8") == s_r.e1 and r.e2.decode(
                                "utf8") == s_r.e2:
                            total_hits += 1
                            unigrams_rel_words = s_r.between
                            bigrams_rel_words = extract_bigrams(' '.join(
                                s_r.between))
                            if any(x in rel_words_unigrams
                                   for x in unigrams_rel_words):
                                hits_with_r += 1
                                continue

                            if any(x in rel_words_bigrams
                                   for x in bigrams_rel_words):
                                hits_with_r += 1
                                continue

                assert total_hits >= hits_with_r

                if total_hits > 0:
                    pmi = float(hits_with_r) / float(total_hits)
                    if pmi >= PMI:
                        results.append(r)
                count += 1
            except Queue.Empty:
                break
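For intuition (hypothetical numbers): if 12 of 40 matched co-occurrences for a candidate pair contain a relational unigram or bigram, the score is 12/40 = 0.3, so the candidate r is kept only when the PMI threshold is 0.3 or lower.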
Example #26
	def __init__(self,test_file,lexicon):
		self.sentences = []
		lines = open(test_file,"r").readlines()
		for line in lines:
			line = line.strip()
			self.sentences.append(Sentence(line,lexicon))
Example #27
    def __init__(self, train_filename, test_filename, tag_num, l1, l2):
        '''
        Constructor
        '''
        self.word_count = {}
        self.word_tag_count = {}
        self.tag_uni_count = [0 for i in range(tag_num)]
        self.tag_bigram_count = [[0 for j in range(tag_num)]
                                 for i in range(tag_num)]
        self.tag_trigram_count = [[[0 for k in range(tag_num)]
                                   for j in range(tag_num)]
                                  for i in range(tag_num)]
        self.sents = [Sentence(tag_num)]
        self.total_tag = 0
        self.__rare_word = {}

        lines = open(train_filename).readlines()
        for i in range(len(lines)):
            line = lines[i].decode('utf-8').strip('\n')
            if len(line) > 0:
                wt = line.split('-')
                self.sents[-1].add_word_tag(wt[1], wt[0])
            else:
                self.sents[-1].finish(self.word_count, self.word_tag_count,
                                      self.tag_uni_count,
                                      self.tag_bigram_count,
                                      self.tag_trigram_count)
                self.total_tag += len(self.sents[-1].word_tag) - 2
                self.sents.append(Sentence(tag_num))
        self.sents[-1].finish(self.word_count, self.word_tag_count,
                              self.tag_uni_count, self.tag_bigram_count,
                              self.tag_trigram_count)
        self.total_tag += len(self.sents[-1].word_tag) - 2
        # self.word_prob = {}

        for w in self.word_count.keys():
            if self.word_count[w] <= 1:
                self.__rare_word[w] = self.word_count[w]

        self.word_tag_prob = {}
        for w in self.word_tag_count.keys():
            if w in self.__rare_word:
                self.word_tag_prob['RARE'] = [0 for i in range(tag_num)]
                for i in range(tag_num):
                    self.word_tag_prob['RARE'][i] = self.word_tag_count[
                        'RARE'][i] / self.tag_uni_count[i]
            else:
                self.word_tag_prob[w] = [0 for i in range(tag_num)]
                for i in range(tag_num):
                    self.word_tag_prob[w][
                        i] = self.word_tag_count[w][i] / self.tag_uni_count[i]
        # self.tag_uni_prob = [0 for i in range(tag_num)]
        # for i in range(tag_num):
        #    self.tag_uni_prob[i] = self.tag_uni_count[i] / self.total_word
        # self.tag_bigram_prob = [[0 for j in range(tag_num)] for i in range(tag_num)]
        # for i in range(tag_num):
        #    for j in range(tag_num):
        #        self.tag_bigram_prob[i][j] = (self.tag_bigram_count[i][j] / self.total_bigram) / \
        #                (self.tag_uni_prob[i])
        self.tag_trigram_prob = [[[0 for k in range(tag_num)]
                                  for j in range(tag_num)]
                                 for i in range(tag_num)]
        for i in range(tag_num):
            for j in range(tag_num):
                for k in range(tag_num):
                    # q(i|j,k) that is the order of the tags is j,k,i
                    self.tag_trigram_prob[i][j][k] = l1 * (self.tag_uni_count[i] / self.total_tag) + \
                                                    l2 * (self.tag_bigram_count[k][i] / self.tag_uni_count[i]) + \
                                                    (1 - l2 - l1) * (self.tag_trigram_count[j][k][i] / self.tag_bigram_count[k][i])
 def processLine(self, line):
     line = line.strip()
     line = line.lower()
     line = line.replace('"', '')
     line = line.replace(',', '')
     line = line.replace('.', '')
     line = line.replace('!', '')
     line = line.replace("'", '')
     line = line.replace(":", '')
     line = line.replace(";", '')
     if line == '':
         return None
     processed_tokens = Sentence()
     processed_tokens.append(Datum("<s>"))  #start symbol
     tokens = line.split()
     i = 0
     while i < len(tokens):
         token = tokens[i]
         if token == '<err':
             targ = tokens[i + 1]
             targ_splits = targ.split('=')
             correct_token = targ_splits[1][:-1]  # chop off the trailing '>'
             correct_token_splits = correct_token.split()
             if len(correct_token_splits) > 2:  # targ with multiple words
                 #print 'targ with multiple words: "%s"' % targ
                 for correct_word in correct_token_splits:
                     processed_tokens.append(Datum(correct_word))
             elif tokens[i + 3] != '</err>':
                 processed_tokens.append(Datum(correct_token))
             else:
                 incorrect_token = tokens[i + 2]
                 processed_tokens.append(
                     Datum(correct_token, incorrect_token))
             i += tokens[i:].index('</err>') + 1  # update index
         else:  # regular word
             processed_tokens.append(Datum(token))
             i += 1
     processed_tokens.append(Datum("</s>"))
     return processed_tokens
from Sentence import Sentence

sentence = Sentence('爱的可能')
arcs = sentence.arcs

print(sentence.words)
print(sentence.postags)
print(sentence.personNames)
print('\t'.join('%s,%s' % (arc.head, arc.relation) for arc in arcs))
Example #30
        return self.score

    def getFinalResult(self):
        tags = self.getActionSequence()
        wordSeq = list(self.sentence.getChars())
        for index, tag in enumerate(tags):
            if tag == 'S' or tag == 'E':
                wordSeq[index] += ' '
        return u''.join(wordSeq).strip()


if __name__ == "__main__":
    print 'initial..'
    line = u'我 是 傻逼 , 这 你 也 信 ?'
    from Sentence import Sentence
    s = Sentence(line)
    print(' '.join(s.characters))
    print(' '.join(s.tags))
    s0 = SegState(s)
    sss = SegState(s)
    print s0.sentence.tags == s.tags
    print 'fillPrimitiveUnits..'
    print s0.C_2, s0.C_1, s0.C, s0.C1, s0.C2
    print 'compareTo..'
    print s0.compareTo(sss) == 0
    print 'getUnlabeledFeatures..'
    print ' '.join(s0.getUnlabeledFeatures())
    print 'getPrevState...'
    print s0.getPrevState()
    print 'isGold..'
    print s0.IsGold()
Example #31
from Sentence import Sentence

if __name__ == '__main__':
	sentence = Sentence("I'm a #Mac#. I am not #Ubuntu#. Lol")
	print (sentence.getTerms())
def mapMaker(parsableParas):

    G = nx.Graph()

    nodeLabels = {}
    edgeLabels = {}

    for parsablePara in parsableParas:
        parsableWords = parsablePara
        sentences = getSentences(parsableWords)

        for sentenceTokens in sentences:  # for each list of set of words in the sentence that forms a POS

            sentence = Sentence(sentenceTokens)  # make a sentence object out of those words

            subject = sentence.structure()
            subject.title = subject.title.rstrip(".\n")

            # find the node in the graph which has, as its title, the subject of the current sentence
            try:
                s = findNode(G, subject.title)

                # if such a node doesn't exist, add it to the graph and add an entry for it in the graph labels
            except NoSuchNodeException:
                G.add_node(subject.title)
                nodeLabels[subject.title] = [subject.title]
                s = findNode(G, subject.title)

                # add the appropriate labels for this node
            finally:
                # 				print subject	## DEBUG ##
                if subject.getLabel() not in nodeLabels[subject.title]:
                    nodeLabels[subject.title].append(subject.getLabel())

            for verb in subject.paths.keys():

                dest = subject.paths[verb]
                dest.title = dest.title.rstrip(".\n")

                # find the node in the graph which has, as its title, the object of the current sentence
                try:
                    d = findNode(G, dest.title)

                    # if such a node doesn't exist, add it to the graph and add an entry for it in the graph labels
                except NoSuchNodeException:
                    G.add_node(dest.title)
                    nodeLabels[dest.title] = [dest.title]

                    # add the appropriate labels for this node
                finally:
                    # 					print dest	## DEBUG ##
                    d = findNode(G, dest.title)
                    if dest.getLabel() not in nodeLabels[dest.title]:
                        nodeLabels[dest.title].append(dest.getLabel())

                        # add the edge label in the graph for this verb
                edgeLabels[verb.getIdentifier()] = verb.getLabel()

                # add an edge between the subject and the object
                G.add_edge(s, d)

                # nltk requires that the node labels be strings and not lists.
                # this for loop takes the nodeLabels dict and turns the list of descriptors into a newline-separated string of descriptors
                # the next loop does the same for edge labels
    for node in nodeLabels.keys():
        nodeLabels[node] = "".join(["\n" + label for label in nodeLabels[node]])
        nodeLabels[node] = nodeLabels[node].strip("\n")

    for e in edgeLabels.keys():
        edgeLabels[e] = edgeLabels[e].rstrip("\n")

        # draw the graph to the following specs:
        # use the generated node labels: nouns and attributes
        # the size of the node needs to be 1000 * (number of '\n' +1). The reason for the +1 is to accommodate single-line labels
        # node_shape='s' describes that each node should be a square
        # spectral layout is the one layout that makes the most sense to display the graph. Many layouts were tried in a trial-and-error fashion
    nx.draw(
        G,
        labels=nodeLabels,
        node_size=[(nodeLabels[node].count("\n") + 1) * 1000 for node in G.nodes()],
        node_shape="s",
        pos=nx.spectral_layout(G),
    )
    pos = nx.layout.spectral_layout(G)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edgeLabels)
    # 	nx.write_dot(G, '/home/ashwin/Desktop/sample')
    plt.show()
Example #34
 def sentence():
     data = {"sentence": Sentence().get_sentence()}
     return jsonify(data)
Example #35
    def load(self):
        self.maxPoints = 0
        f = BufferedReader(self.filename)

        #keep track of line numbers, which can be used to display
        #error messages about bad data files
        lineNumber = 0

        #scan through the entire file
        while (not f.eof()):
            nextLine = f.readLine()
            lineNumber += 1

            #check if the line is a comment/whitespace or if it contains some
            #sentence data. Sentence data is always on its own line
            #and starts with the "Sentence:" identifier
            if (nextLine.startswith("Sentence:")):
                self.maxPoints += SentenceFactory.POINTS_PER_SENTENCE
                sentenceComponents = []
                st = StringTokenizer(nextLine)

                #discard the "Sentence:" token
                st.nextToken()

                #indicate whether we are reading in words from a sentence
                #or words that belong to blanks in a sentence
                readingBlank = False
                readingBlankValid = False
                blankValidWords = []
                blankInvalidWords = []

                while (st.hasNext()):
                    nextToken = st.nextToken()
                    if (readingBlank):

                        #"]" denotes that the end of the list of words for the
                        #given blank has been reached.
                        if (nextToken == "]"):
                            readingBlank = False
                            sentenceComponents.append(
                                Blank(blankValidWords, blankInvalidWords))
                            blankValidWords = []
                            blankInvalidWords = []

                        #"|" separates the valid and invalid words that can
                        #be used to fill in a blank
                        elif (nextToken == "|"):
                            readingBlankValid = False
                        else:
                            if (readingBlankValid):
                                blankValidWords.append(Word(nextToken))
                            else:
                                blankInvalidWords.append(Word(nextToken))
                    else:

                        #"[" indicates the start of a blank and the words
                        #that can be used to fill it
                        if (nextToken == "["):
                            self.maxPoints += SentenceFactory.POINTS_PER_CORRECT_WORD
                            readingBlank = True
                            readingBlankValid = True
                        else:
                            sentenceComponents.append(Word(nextToken))

                #last token would be the punctuation
                punctuation = sentenceComponents[len(sentenceComponents) - 1]
                sentenceComponents.remove(punctuation)

                #create and add the sentence to the sentence factory's
                #master list of sentences
                self.sentences.append(
                    SentenceData(
                        Sentence(sentenceComponents, punctuation.toString()),
                        self.filename, nextLine, lineNumber))

            else:
                #ignore comments and whitespace
                pass
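Working backwards from the tokenizer above, a sentence line in the data file is expected to look roughly like the following (an illustrative reconstruction, not copied from an actual data file: whitespace-separated tokens, "[ valid words | invalid words ]" marking a blank, and the final token taken as punctuation):

Sentence: The cat [ sat slept | flew barked ] on the mat .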
Example #36
    def getCentroid(self, a):
        selectedsentnum = 0
        selectedtranslation = ""
        total_num_outputs = len(a)
        sentences = [Sentence() for __idx0 in range(total_num_outputs)]
        self.reputation = [float() for __idx0 in range(total_num_outputs)]
        ## for-while
        i = 0
        while i < total_num_outputs:
            sentences[i] = Sentence()
            self.reputation[i] = 0.0
            i += 1
        ## for-while
        i = 0
        while i < total_num_outputs:
            sentences[i].number = i
            sentences[i].text = a[i]
            sentences[i].tokencount = SentenceTokenizer.tokencount(sentences[i].text)
            sentences[i].tokens = SentenceTokenizer.sentencetokenizer(sentences[i].text)
            sentences[i].setDistance([float() for __idx0 in range(total_num_outputs)])
            sentences[i].setSimilarity([float() for __idx0 in range(total_num_outputs)])
            i += 1

        if len(sentences) <= 1:  # //if only 1 or 2 candidate translations available
            # NOTE: the Java-style random()/nextInt() call is not valid Python;
            # random.randrange gives the equivalent choice (assumes `import random` at module level)
            randomchoice = random.randrange(len(sentences))
            selectedtranslation = sentences[randomchoice].text
        else:  # //if 3 or more candidate translations are available
            total_combinations = (total_num_outputs * (total_num_outputs - 1)) / 2
            # print ("total combo",total_combinations)
            sentencemapcount = 0
            filtercriteria = 0.4
            sentencemaps = [SentenceMap(0, 0, 0, 0, 0, 0) for __idx0 in range(total_combinations)]
            ## Two for loop to list nC2 combinations
            i = 0
            while i < (len(sentences) - 1):
                ## for-while
                j = i + 1
                while j <= (len(sentences) - 1):
                    sentencemaps[sentencemapcount] = SentenceMap(
                        i,
                        j,
                        sentences[i].tokencount,
                        sentences[j].tokencount,
                        len(sentences[i].text) - sentences[i].tokencount + 1,
                        len(sentences[j].text) - sentences[i].tokencount + 1,
                    )
                    a = []
                    for x in range(sentences[i].tokencount):
                        a.append([])
                        for y in range(sentences[j].tokencount):
                            a[x].append(float())
                    self.mapscorematrix = a

                    a = []
                    for x in range(sentences[i].tokencount):
                        a.append([])
                        for y in range(sentences[j].tokencount):
                            a[x].append(float())
                    self.mapmatrix = a

                    a = []
                    for x in range(sentences[i].tokencount):
                        a.append([])
                        for y in range(sentences[j].tokencount):
                            a[x].append(float())
                    self.mapweightmatrix = a
                    ## initializing the matrices
                    slength1 = 0
                    while slength1 < (sentences[i].tokencount):
                        slength2 = 0
                        while slength2 < (sentences[j].tokencount):
                            self.mapscorematrix[slength1][slength2] = 0
                            self.mapmatrix[slength1][slength2] = 0
                            self.mapweightmatrix[slength1][slength2] = 0
                            slength2 += 1
                        slength1 += 1
                    """
                    print "matrices"
                    print self.mapscorematrix
                    print self.mapmatrix
                    print self.mapweightmatrix
                    """
                    ## set scores for the bipartite graph
                    slength1 = 0
                    while slength1 < sentences[i].tokencount:
                        slength2 = 0
                        while slength2 < sentences[j].tokencount:
                            # print "args"
                            # print (sentences[i].tokens[slength1], sentences[j].tokens[slength2])
                            self.mapscorematrix[slength1][slength2] = WordComparer.comparescores(
                                sentences[i].tokens[slength1], sentences[j].tokens[slength2]
                            )
                            # print "self.mapscorematrix[slength1][slength2]" + str(WordComparer.comparescores(sentences[i].tokens[slength1], sentences[j].tokens[slength2]))
                            slength2 += 1
                        slength1 += 1
                    # print ("mapscorematrix",self.mapscorematrix)
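                    # greedy one-to-one alignment: walk the shorter sentence's tokens and pair each
                    # with its best-scoring, not-yet-matched token of the longer sentence; pairs
                    # scoring at or below filtercriteria are discarded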
                    if sentences[i].tokencount <= sentences[j].tokencount:
                        debaredlist = []
                        ## for-while
                        slength1 = 0
                        while slength1 < sentences[i].tokencount:
                            index2 = -1
                            similarity = filtercriteria
                            ## for-while
                            slength2 = 0
                            while slength2 < sentences[j].tokencount:
                                if (similarity < self.mapscorematrix[slength1][slength2]) and not (
                                    self.iselementpresent(debaredlist, slength2)
                                ):
                                    similarity = self.mapscorematrix[slength1][slength2]
                                    index2 = slength2
                                slength2 += 1
                            if index2 != -1:
                                if len(sentences[i].tokens[slength1]) > len(sentences[j].tokens[index2]):
                                    self.mapweightmatrix[slength1][index2] = len(sentences[j].tokens[index2])
                                else:
                                    self.mapweightmatrix[slength1][index2] = len(sentences[i].tokens[slength1])
                                self.mapmatrix[slength1][index2] = similarity
                                debaredlist.append(index2)
                            slength1 += 1
                    else:
                        debaredlist = []
                        ## for-while
                        slength2 = 0
                        while slength2 < sentences[j].tokencount:
                            index1 = -1
                            similarity = filtercriteria
                            ## for-while
                            slength1 = 0
                            while slength1 < sentences[i].tokencount:
                                if (similarity < self.mapscorematrix[slength1][slength2]) and not (
                                    self.iselementpresent(debaredlist, slength1)
                                ):
                                    similarity = self.mapscorematrix[slength1][slength2]
                                    index1 = slength1
                                slength1 += 1
                            if index1 != -1:
                                if len(sentences[i].tokens[index1]) > len(sentences[j].tokens[slength2]):
                                    self.mapweightmatrix[index1][slength2] = len(sentences[j].tokens[slength2])
                                else:
                                    self.mapweightmatrix[index1][slength2] = len(sentences[i].tokens[index1])
                                self.mapmatrix[index1][slength2] = similarity
                                # print "self.mapmatrix[index1][slength2] : " + str(self.mapmatrix[index1][slength2])
                                debaredlist.append(index1)
                            slength2 += 1
                    # print self.mapmatrix
                    sentencemaps[sentencemapcount].setscorematrix(self.mapscorematrix)
                    sentencemaps[sentencemapcount].setmapmatrix(self.mapmatrix)
                    sentencemaps[sentencemapcount].setweightmatrix(self.mapweightmatrix)
                    distance = sentencemaps[sentencemapcount].setsentencesimilarityscore()
                    similarity = sentencemaps[sentencemapcount].sentencesimilarityscore
                    # print (distance,similarity)
                    sentences[i].setIthDistance(j, distance)
                    sentences[j].setIthDistance(i, distance)
                    sentences[i].setIthSimilarity(j, similarity)
                    sentences[j].setIthSimilarity(i, similarity)
                    sentencemapcount += 1
                    j += 1
                i += 1
            ## for-while
            i = 0
            while i < total_num_outputs:
                totaldistance = 0
                totalsimilarity = 0
                ## for-while
                j = 0
                while j < total_num_outputs:
                    totalsimilarity = totalsimilarity + sentences[i].similarity[j]
                    totaldistance = totaldistance + sentences[i].distance[j]
                    # print (totalsimilarity,totaldistance)
                    j += 1
                sentences[i].averagesimilarity = totalsimilarity / (total_num_outputs - 1)
                self.reputation[i] = sentences[i].averagesimilarity
                # print "rep" + str(self.reputation[i])
                sentences[i].averagedistance = totaldistance / (total_num_outputs - 1)
                i += 1

            newdistance = 0
            newoptdistance = 1000000000
            ## for-while
            z = 0
            while z < len(sentences):
                ## for-while
                y = 0
                while y < len(sentences):
                    # print sentences[z].distance[y]
                    newdistance = newdistance + sentences[z].distance[y]
                    y += 1

                if newdistance < newoptdistance:
                    selectedsentnum = z
                    selectedtranslation = sentences[z].text
                    newoptdistance = newdistance
                newdistance = 0
                z += 1

        self.normalizer = 10 * (sentences[selectedsentnum].tokencount)
        # print "normalizer=" + str(self.normalizer)
        return selectedtranslation
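
The selection loop above simply picks the candidate whose summed distance to every other candidate is smallest. A minimal standalone sketch of that idea, with made-up candidate names and distances (not taken from the code above):

# minimal sketch: choose the candidate translation whose total distance
# to all other candidates is smallest (illustrative data only)
candidates = ["translation A", "translation B", "translation C"]
distance = [
    [0.0, 0.2, 0.5],
    [0.2, 0.0, 0.3],
    [0.5, 0.3, 0.0],
]
totals = [sum(row) for row in distance]
best = totals.index(min(totals))
print(candidates[best])  # translation B, with the smallest total distance (0.5)
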
def buildcorpus(corpus, rootpath, filelimit = 0):
    
    #rootpath = corpus.rootpath
    fileids = os.listdir(rootpath)
    
    hugewordlist = []   
    hugewordlist.extend(corpus.words)   # will contain distinct Word instances

    numoffiles = 0
    
    corpus.set_corpusname(str(max(filelimit, len(fileids)))+"texts")
    
    for fileid in fileids:
    
        
        allwords = nltk.FreqDist()    # will contain all words in this text
        
        doc_id = fileid.split(".")[0]
        # corpus.inserttext(doc_id)    ##### !  should pass the Text object itself, not just its id
        newtext = Text(doc_id)
        
        path = rootpath + os.sep + fileid
        #lines = readtextlines(path)
    
        #rawtext = texter.readtxtfile(path)
        rawtext = texter.readnewstext(path)
        lines = texter.splitToSentences(rawtext)
        
        sntindex = 0
        # each line is a sentence
        for line in lines:
            words = line.split()   # words in this sentence
            words = texter.eliminatepunctuation(words)
            words = [word for word in words if not word.isspace()]

            for word in words:
                allwords.inc(word)
                
                
                newword = Word(word)
                newword.insertsentenceid(doc_id+"_"+str(sntindex))
                
                if allwords[word] <= 1:    # if this was not added to the hugelist before, add it
                    hugewordlist.append(newword)
                
                    
            sentence = Sentence(sntindex)
            sntindex = sntindex + 1
            
            # should the sentence store Word objects or word indices?
            for word in words:
                index = hugewordlist.index(Word(word))
                hugewordlist[index].insertsentenceid(doc_id+"_"+str(sntindex-1))
                sentence.insertword(index)
                
            newtext.insertsentence(sentence)
            
        if (not rawtext.isspace()) or (len(allwords) != 0):   
            corpus.inserttext(newtext)    
            
            print str(numoffiles)," : finished handling the words-snts-txts ",doc_id 
    
                
            numofwords = reduce(lambda x,y : x+y, allwords.values())
            
            for word in hugewordlist:
                cnt =  allwords[word.literal]
                #freq = cnt / float(numofwords)
                word.assigntermfreq(cnt, numofwords, doc_id)
                #hugewordlist[index].toscreen()
        
        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break       

        
    # end for - docs
    

    numofdocs = len(fileids)
    print "computing tf*idf"
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()
        #word.toscreen()
        
    corpus.assignwords(hugewordlist)
    print "corpus length ",str(len(corpus.words))," words"
    print "huges length ",str(len(hugewordlist))," words"
    print "exiting buildcorpus()"
    
    print "pickle-dumping words"
    corpus.pickledumpwords()
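
buildcorpus() delegates the actual scoring to the Word class, which is not shown in this snippet. A minimal sketch of what assigntermfreq / computeinvdocfreq / computeTFIDF presumably compute, assuming standard tf*idf (the real Word class may well differ):

import math

class WordSketch(object):
    """Hypothetical stand-in for the Word objects used by buildcorpus()."""
    def __init__(self, literal):
        self.literal = literal
        self.tf = {}       # doc_id -> relative term frequency in that document
        self.idf = 0.0
        self.tfidf = {}    # doc_id -> tf * idf

    def assigntermfreq(self, count, numofwords, doc_id):
        # relative frequency of the word in one document
        self.tf[doc_id] = count / float(numofwords)

    def computeinvdocfreq(self, numofdocs):
        # number of documents the word actually occurred in
        docfreq = len([f for f in self.tf.values() if f > 0]) or 1
        self.idf = math.log(float(numofdocs) / docfreq)

    def computeTFIDF(self):
        self.tfidf = dict((d, f * self.idf) for d, f in self.tf.items())
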
Exemple #38
0
 def processLine(self, line):
  line = line.strip()
  line = line.lower() 
  line = line.replace('"','') 
  line = line.replace(',', '')
  line = line.replace('.','') 
  line = line.replace('!','') 
  line = line.replace("'",'') 
  line = line.replace(":",'') 
  line = line.replace(";",'') 
  if line == '':
    return None
  processed_tokens = Sentence() 
  processed_tokens.append(Datum("<s>")) #start symbol
  tokens = line.split()
  i = 0
  while i < len(tokens):
    token = tokens[i]
    if token == '<err':
      targ = tokens[i+1]
      targ_splits = targ.split('=')
      correct_token = targ_splits[1][:-1] # chop off the trailing '>'
      correct_token_splits = correct_token.split()
      if len(correct_token_splits) > 2: # targ with multiple words
        #print 'targ with multiple words: "%s"' % targ
        for correct_word in correct_token_splits:
          processed_tokens.append(Datum(correct_word))
      elif tokens[i+3] != '</err>':
        processed_tokens.append(Datum(correct_token))
      else:
        incorrect_token = tokens[i+2]
        processed_tokens.append(Datum(correct_token, incorrect_token))    
      i += tokens[i:].index('</err>') + 1 # update index
    else: # regular word
      processed_tokens.append(Datum(token))
      i += 1
  processed_tokens.append(Datum("</s>"))
  return processed_tokens
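
For reference, a line in the expected tagging format, run through the processLine() above, yields a Datum sequence like this (illustrative input; Sentence and Datum come from the surrounding project):

# input line (lowercased and stripped of punctuation by processLine):
#   "She went to <err targ=their> ther </err> house"
# resulting Datum sequence:
#   <s>, she, went, to, Datum("their", "ther"), house, </s>
# i.e. the tagged span collapses to a single Datum holding the correct and the
# misspelled form; plain words become one-argument Datum objects.
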
    def processLine(self, line):
        self.dict = enchant.Dict('en')
        line = line.strip()
        #line = line.lower() 
        line = line.replace('"','') 
        line = line.replace(',', '')
        line = line.replace('.', ' . ') 
        line = line.replace('!', ' . ') 
        line = line.replace('?', ' ? ')
        #line = line.replace("'",'') 
        line = line.replace(":",'') 
        line = line.replace(";",' , ') 
        line = line.replace("  ",' ')
        if line == '':
            return None
        processed_tokens = Sentence() 
        processed_tokens.append(Datum("<s>")) #start symbol
        tokens = line.split()
        i = 0
        while i < len(tokens):
            token = tokens[i]
            if token == '':
                i += 1
                continue
            if not token.islower():
                if self.dict.check(token.lower()):
                    token = token.lower()
            if token == '<ERR':
                try:
                    if tokens[i+3] == '</ERR>':
                        targ = tokens[i+1]
                        targ_splits = targ.split('=')
                        correct_token = targ_splits[1][:-1] # chop off the trailing '>'
                        incorrect_token = tokens[i+2]
                    else:
                        #either targ or error has more than one word
                        end = i + tokens[i:].index('</ERR>') - 1
                        for j in range(i, end):
                            if tokens[j].endswith('>'):
                                break
                            
                        targ = tokens[i+1]
                        targ_splits = targ.split('=')
                        if i+1 == j:# one word in targ
                            correct_token = targ_splits[1][:-1] # chop off the trailing '>'
                        else:# more than one word in targ
                            correct_token = targ_splits[1]
                            if (i+2) <= (j-1):
                                for k in range(i+2, j):
                                    correct_token = correct_token + ' ' + tokens[k]
                            correct_token = correct_token + ' ' + tokens[j][:-1] # chop off the trailing '>'
                        
                        incorrect_token = tokens[j+1]
                        if (j+2) <= end:
                            for k in range(j+2, end+1):
                                incorrect_token = incorrect_token + ' ' + tokens[k]
                except IndexError:
                    print tokens
                    #raise RuntimeError
                    return None
#                    processed_tokens.append(Datum("</s>"))
#                    return processed_tokens

                processed_tokens.append(Datum(correct_token, incorrect_token))         
                i += tokens[i:].index('</ERR>') + 1 # update index
            else: # regular word
                processed_tokens.append(Datum(token))
                i += 1
        processed_tokens.append(Datum("</s>"))
        return processed_tokens
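
Unlike the first variant, this processLine() also handles targ and error spans that contain more than one word. An illustrative trace (assuming the pyenchant check lowercases "He"):

# input line:
#   "He said <ERR targ=a lot> alot </ERR> more"
# resulting Datum sequence:
#   <s>, he, said, Datum("a lot", "alot"), more, </s>
# i.e. the two-word correction "a lot" is attached to the single misspelled token "alot".
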
def main():
	sentence = Sentence()
	filters = FirstWord()
	sentence.loopSentences(filters) #apply the filter to every sentence
def generate_sentences(weight, iteration_count, noun_l, verb_l, adj_l, conj_l,
                       p_pones_l):
    sentences = []
    strings = []
    for i in range(0, iteration_count):
        if (weight < 151):
            key = ""  # string key used for duplicate detection
            limit = weight
            word = []
            s = Sentence(0, 0, word)
            x = weight / 5
            y = weight / 1.5
            noun = choose_word_in_range(x, y, noun_l)
            limit -= noun.weight
            verb = choose_word_in_range(x, y, verb_l)
            limit -= verb.weight
            conj = choose_word_in_range(limit - 1, limit + 1, conj_l)
            s.words.append(noun)
            s.words.append(conj)
            s.words.append(verb)
            # build a key to check whether we already added this sentence
            key = noun.name
            key += conj.name
            key += verb.name

            if (s.lentgth_of_sentence() == weight):
                matching = [t for t in strings if key in t]
                if (matching):
                    continue
                sentences.append(s)
                strings.append(key)

        if (150 < weight < 251):
            limit = weight
            word = []
            s = Sentence(0, 0, word)
            x = weight / 5
            y = weight / 2
            noun = choose_word_in_range(x, y, noun_l)
            limit -= noun.weight
            verb = choose_word_in_range(x, y, verb_l)
            limit -= verb.weight

            s.words.append(noun)
            while True:
                if (limit >= 80):
                    x = choose_word_in_range(30, 50, adj_l)
                    limit -= x.weight
                    s.words.append(x)
                else:
                    break
            if (limit > 10):
                p_pone = choose_word_in_range(limit - 1, limit + 1, p_pones_l)
                s.words.append(p_pone)
            s.words.append(verb)

            if (s.lentgth_of_sentence() == weight):
                sentences.append(s)

        if (250 < weight):
            limit = weight
            word = []
            s = Sentence(0, 0, word)
            x = weight / 25
            y = weight / 5
            noun = choose_word_in_range(x, y, noun_l)
            limit -= noun.weight
            verb = choose_word_in_range(x, y, verb_l)
            limit -= verb.weight

            s.words.append(noun)
            while True:
                if (limit >= 80):
                    x = choose_word_in_range(30, 50, noun_l)
                    limit -= x.weight
                    s.words.append(x)
                else:
                    break
            if (limit > 10):
                conj = choose_word_in_range(limit - 1, limit + 1, conj_l)
                s.words.append(conj)
            s.words.append(verb)
            if (s.lentgth_of_sentence() == weight):
                sentences.append(s)

    return sentences
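
generate_sentences() relies on a choose_word_in_range() helper that is not shown here. A plausible minimal sketch, assuming each word object exposes a .weight attribute and the helper picks randomly among words whose weight falls inside the given range (the real helper may behave differently):

import random

def choose_word_in_range(low, high, word_list):
    # hypothetical helper: random word whose weight lies in [low, high];
    # fall back to the whole list if nothing is in range
    candidates = [w for w in word_list if low <= w.weight <= high]
    return random.choice(candidates or word_list)
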