Example no. 1
    def handle_response(self, response):
        soup = BeautifulSoup(response.decode('utf-8', 'ignore'))

        table = soup.find('table', {'border': '1'})
        is_going_on = False
        for tr in table.findAll('tr'):
            is_going_on = True
            for td in tr.findAll('td'):
                td_value = str(td)
                word = re.search(r'(.*)kelime=(.+)&c(.*)',
                                 td_value).group(2)

                is_letter = len(word) == 1
                is_proper_name = word[0].isupper()
                is_phrase = ' ' in word

                w = Word(word, is_letter, is_proper_name, is_phrase)

                if DictionaryConfig.detailed_log:
                    print 'word consumed #', len(DictionaryService.regular_words) + 1, ':', w.get_value()

                if w.is_regular():
                    DictionaryService.regular_words.append(w)
                elif w.is_proper_name:
                    DictionaryService.proper_words.append(w)
                elif w.is_phrase:
                    DictionaryService.phrases.append(w)

                DictionaryService.fs.write(w.formatted_value() + "\n")

        return is_going_on
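For orientation, a minimal Word that would satisfy the calls handle_response makes above (get_value, is_regular, formatted_value and the three flags); this is a hypothetical sketch, not the scraper's actual class:

class Word:
    def __init__(self, value, is_letter, is_proper_name, is_phrase):
        self.value = value
        self.is_letter = is_letter
        self.is_proper_name = is_proper_name
        self.is_phrase = is_phrase

    def get_value(self):
        return self.value

    def is_regular(self):
        # assumption: "regular" means neither a single letter, a proper name, nor a phrase
        return not (self.is_letter or self.is_proper_name or self.is_phrase)

    def formatted_value(self):
        return self.value.strip()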
Example no. 2
    def test_IsWordLegal_IllegalWordBadCollisionsOneLetterAnchorDown_ReturnsFalse(
            self):

        # Arrange
        board = Board()
        t = Tile('t', -1, -1.0, -1)
        e = Tile('e', -1, -1.0, -1)
        s = Tile('s', -1, -1.0, -1)
        i = Tile('i', -1, -1.0, -1)
        n = Tile('n', -1, -1.0, -1)
        g = Tile('g', -1, -1.0, -1)

        hand = Hand("test",
                    [t, e, s, t, i, n, g, t, e, s, t, i, n, g, s, i, t])
        testing = Word([t, e, s, t, i, n, g])
        sit = Word([s, i, t])

        # Act
        board.PlaceWord(testing, board.GetAnchors()[0], hand, 0, 'down')
        board.PlaceWord(sit, board.GetAnchors()[3], hand, 1, 'across')
        results = board.IsWordLegal(testing, board.GetAnchors()[6], 3, 'down')

        # Assert
        #board.PrintBoard()
        self.assertFalse(results[0])
        self.assertEqual(results[1],
                         'word creates an invalid word when placed')
Example no. 3
 def add(self, word, documentID, occurences):
     if word in self.cache.keys():
         self.cache[word].add(documentID, occurences)
     else:
         wordInstance = Word(word, self.directory)
         wordInstance.add(documentID, occurences)
         self.cache[word] = wordInstance
Example no. 5
 def __init__(self):
     print("Word_data_Based_on : . https://github.com/KKuTu-Korea/KKuTu")
     print("-----------------------------------------------------------")
     self.word = Word()
     self.words = {}  # word dict
     self.thirdword = ['', '', '']
     self.usedword = []
Example no. 6
    def test_AnchorSearch_FindingWordsWithScript_ReturnsCorrectWords(self):

        # Arrange
        words = Words()
        selectedKeys = []
        s = Tile('S', 1, 0.09480520300163461, 3)
        c = Tile('C', 3, 0.04049934678472928, 29)
        r = Tile('R', 1, 0.07098146383333229, 11)
        i = Tile('I', 1, 0.0885545324304026, 5)
        p = Tile('P', 3, 0.029410465329100584, 41)
        t = Tile('T', 1, 0.06566549066880407, 17)
        word = Word([s, c, r, i, p, t])
        allWordsContainWord = True
        noWordsContainWord = True

        # Act
        selectedWords = words.AnchorSearch(word)
        for key in selectedWords.GetDict():
            selectedKeys.append(key)
            for element in selectedWords.GetDict().get(key):
                if word.GetString() not in element:
                    allWordsContainWord = False
        for key in words.GetDict():
            if key not in selectedKeys:
                for element in words.GetDict().get(key):
                    if word.GetString() in element:
                        noWordsContainWord = False
                        print(element, "contains the word", word.GetString())

        # Assert
        self.assertTrue(allWordsContainWord)
        self.assertTrue(noWordsContainWord)
Example no. 7
def filter_corpus_for_relevance(filein, fileout):
    print "Converting ", filein, " to ", fileout
    print "\tReading ", filein
    inpt = open(filein, 'r')
    content = filter(lambda x: not x == "",
                     inpt.readlines()[0].replace("\n", "").split(" "))
    inpt.close()

    print "\tNow filtering and writing relevant words"
    print "\t", len(content), "words to go..."
    outpt = open(fileout, 'w')
    cache = dict()
    for i in xrange(len(content)):
        if i % 1000000 == 0:
            print "\t\tIteration", i
        word = content[i]
        if word not in cache:
            word_object = Word(word)
            if word_object.relevant():
                cache[word] = word_object.lemma()
            else:
                cache[word] = False
        token = cache[word]
        if not token == False:
            outpt.write(token + " ")
    outpt.close()
    print "Done!"
Example no. 8
    def words(self, word, line):
        str = ''
        for w in word:
            if w in self.symbols and self.reading_string == False:
                if str != '':
                    if str.lower() in self.table:
                        self.channel_values.append(Word(
                            str, str.lower(), line))
                        #print(Word(str, self.table.get(str.lower()), line).toString())
                    else:
                        self.channel_values.append(
                            Word(str, 'identifier', line))
                        #print(Word(str, Tag.IDENTIFIER, line).toString())
                    str = ''
                self.channel_values.append(Token(w, w, line))
                #print(Token(w, self.symbols.get(w), line).toString())
            else:
                str += w
                if w == "'" and self.reading_string == False:
                    self.reading_string = True
                elif w == "'" and self.reading_string:
                    self.big_string += str
                    self.channel_values.append(String(self.big_string, line))
                    #print(String(self.big_string, line).toString())
                    self.reading_string = False
                    str = ''
                    self.big_string = ''

        if self.reading_string:
            self.big_string += str + ' '
        else:
            if str != '':
                self.channel_values.append(Word(str, 'identifier', line))
Example no. 9
def split_word(sentence):
    sentence = format_sentence(sentence)
    words = sentence.split(' ')
    word_array = []
    hash_table = get_hashtable()

    i = 0
    len_ = len(words)

    while i < len_:
        if i + 2 < len_ and is_exist((words[i] + ' ' + words[i + 1] + ' ' + words[i + 2]).encode('utf-8'), hash_table)\
                == u'\u2713':
            word_array.append(
                Word(words[i] + '_' + words[i + 1] + '_' + words[i + 2],
                     u'\u2713'))
            i += 2
        elif i + 1 < len_ and is_exist(
            (words[i] + ' ' + words[i + 1]).encode('utf-8'),
                hash_table) == u'\u2713':
            word_array.append(Word(words[i] + '_' + words[i + 1], u'\u2713'))
            i += 1
        else:
            if is_exist(words[i].encode('utf-8'), hash_table) == u'\u2713':
                word_array.append(Word(words[i], u'\u2713'))
            else:
                if words[i] != '\n':
                    word_array.append(Word('~' + words[i], u'\u274C'))
                else:
                    word_array.append(Word(words[i], u'\u2713'))
        i += 1

    return word_array
Example no. 10
	def get(self,word,stress_ambiguity=True):
		# python3 unnecessary:
		#if type(word)==str:
		#	word=word.decode('utf-8',errors='ignore')

		(word,punct)=gleanPunc(word)

		if self.has(word):
			words=self.dict['Word'][word.lower()]
		elif self.getprep:
			words=self.getprep(word,config=self.config)
		else:
			return [Word(word,[],None)]

		if not words:
			return [Word(word,[],None)]

		if type(words)==list:
			if type(words[0])==tuple:	# New word needs to be built
				wordobjs=[]
				for wordtuple in words:
					wrd=wordtuple[:2]
					attrs=wordtuple[2] if len(wordtuple)>2 else {}
					wordobj=self.make(wrd,word)
					for _k,_v in list(attrs.items()): setattr(wordobj,_k,_v)
					wordobjs+=[wordobj]
				self.dict['Word'][word.lower()]=wordobjs
				return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
			else:
				wordobjs=words
		else:
			wordobjs=[words]

		return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
Example no. 11
def filter_corpus_for_relevance(filein, fileout):
	print "Converting ", filein, " to ", fileout
	print "\tReading ", filein
	inpt = open(filein, 'r')
	content = filter(lambda x : not x == "", inpt.readlines()[0].replace("\n", "").split(" "))
	inpt.close()

	print "\tNow filtering and writing relevant words"
	print "\t", len(content), "words to go..."
	outpt = open(fileout, 'w')
	cache = dict()
	for i in xrange(len(content)):
		if i % 1000000 == 0:
			print "\t\tIteration", i
		word = content[i]
		if word not in cache:
			word_object = Word(word)
			if word_object.relevant():
				cache[word] = word_object.lemma()
			else:
				cache[word] = False
		token = cache[word]
		if not token == False:
			outpt.write(token + " ")
	outpt.close()
	print "Done!"
Example no. 12
    def pseudo2real(self, pseudo_words, increment=False):
        """ Convert a pseudo sentence to a real sentence.
            If increment is True, we update the occurrence count.
        """
        #Word._connection.debug=True
        self.words = []
        for pword in pseudo_words:
            try:
                real_word = Word.byAppeared_name(
                    pword['appeared_name'].encode('utf-8'))
                #Do we increment?
                if increment: real_word.increment()
            except SQLObjectNotFound:
                #We don't have the word yet
                try:
                    main_type = MainType.byName(
                        pword['main_type'].encode('utf-8'))
                except SQLObjectNotFound:
                    main_type = MainType(name=pword['main_type'])
                try:
                    sub_type = SubType.byName(
                        pword['sub_type'].encode('utf-8'))
                except SQLObjectNotFound:
                    sub_type = SubType(name=pword['sub_type'])

                # We create a new word object
                real_word = Word(appeared_name=pword['appeared_name'],
                                 appeared_reading=pword['appeared_reading'],
                                 base_name=pword['base_name'],
                                 base_reading=pword['base_reading'],
                                 main_type=main_type.id,
                                 sub_type=sub_type.id)
            self.words.append(real_word)
Example no. 13
	def setOb(self, ob):
		Word.setOb(self, ob)
		self.autogenJSON()
		for typ in typs:
			if self.rats[typ]:
				self.rats[typ] *= 10
				self.onlyTyp = typ
Example no. 14
	def make(self,stressedipasylls_text,token):
		stressedipa=stressedipasylls_text[0]
		sylls_text=stressedipasylls_text[1]

		stress=stressedipa2stress(stressedipa)
		(prom_stress,prom_strength)=getStrengthStress(stress)
		syllphons=self.ipa2phons(stressedipa)

		sylls=[]

		for i in range(len(syllphons)):
			syllbody=self.use('SyllableBody',syllphons[i])
			syll=self.use('Syllable',(syllbody,prom_strength[i],prom_stress[i]))
			#print token,i,syllbody,syll,syllphons,stressedipa,stress,prom_stress,prom_strength
			sylls.append(syll)

		word=Word(token,sylls,sylls_text)
		word.ipa=stressedipa
		word.stress=stress
		word.lang=self.lang

		# when is word broken?
		if not word.ipa:
			word.broken=True


		return word
Example no. 15
    def test_case_value(self):
        letters = Letter.from_raw(test.letters)
        grid = Grid(test.grid)
        word = Word(grid, letters, Letter.from_raw(test.wordons))
        print(repr(word))

        assert word.value == test.value
Example no. 16
    def new_word(self):
        word1 = self.text1.get().lower().lstrip().rstrip()
        word2 = self.text2.get().lower().lstrip().rstrip()

        if len(word1) == 0 or len(word2) == 0:
            self.text2.insert(0.0, "Заполните все поля")  # "Fill in all fields"
            return 0
        elif (word1[0] in ABV_english and word2[0] in ABV_english) or (word1[0] in ABV_russian and word2[0] in ABV_russian):
            self.text3.delete(0.0, END)
            self.text3.insert(0.0, "Так ниизяяя, подумайте дважды")  # "No way, think twice"
        else:
            languages = ['russian', 'english'] if word1[0] in ABV_russian else ['english', 'russian']
            
            word = Word(word=languages[0], translate=languages[1])
            result = word.set(word1, word2)

            if result == -1:
                self.text3.delete(0.0, END)
                self.text3.insert(0.0, "Такой вид перевода уже имеется")  # "This translation already exists"
            else:
                file = open('new_words.txt', 'a')
                file.write(word1+'-'+word2+'\n')
                file.close()
                
                self.text3.delete(0.0, END)
                self.text3.insert(0.0, "Перевод записан!")  # "Translation saved!"
Example no. 17
    def __init__(self, path, source='text'):
        self.words = []
        if source == 'umarker':  #xml from uncertaintymaker code
            import untangle
            template = untangle.parse(path)
            for word in template.transcription.words:
                if not word.word.cdata in ["[sil]", "[NOISE]", "[SPEECH]"]:
                    self.words.append(
                        Word(word.word.cdata, word.timeStart.cdata,
                             word.timeEnd.cdata))

        elif source == 'ldc-wrd':
            wrd_file = open(path, 'r')
            for line in wrd_file:
                elements = line.split(' ')
                word = elements[2].rstrip()
                time_from = self.convert_time(elements[0],
                                              from_time='ldc',
                                              to_time='ms')
                time_to = self.convert_time(elements[1],
                                            from_time='ldc',
                                            to_time='ms')
                self.words.append(Word(word, time_from, time_to))
            wrd_file.close()
        elif source == 'text':
            path = path.encode('ascii', 'ignore')
            for word in word_tokenize(path):
                if not word in ['"', "'", ".", "!", '``', '`', "''", '""']:
                    self.words.append(Word(word, '', ''))
Example no. 18
 def FindBestMove(self, hand, board):
     #Return word, anchor, anchorindex, direction
     #Get words for each anchor, find best word, compare best words sequentially
     anchors = board.GetAnchors()
     random.shuffle(anchors)
     bestWord = Word()
     bestWord.SetScore(min)
     for anchor in anchors:
         # get list of possible words for each anchor
         # match words to hand
         words = self.MatchWords(hand, anchor, board)
         # check for case no legal move is found
         if words is not None :
             # set scores for words, find best word
             for word in words.keys():
                 word.SetScore(self.heuristic.ScoreWord(word, hand))
                 if word.GetScore() > bestWord.GetScore() :
                     bestWord = word
                     bestAnchor = anchor
                     bestIndex = words[word][0]
                     bestDirection = words[word][1]
     # check for case no legal move is found
     if bestWord.GetScore() is min:
         raise Exception("BRI: No valid word options found!")
     return bestWord, bestAnchor, bestIndex, bestDirection
Example no. 19
def prepareWordBufferForDecode(buffer):
    """Add to every word of the buffer features GOVREF and LABELREF.

    GOVEREF is a copy of feature GOV and LABELREF a copy of LABEL
    GOV and LABEL are set to initialization values
    """
    for word in buffer.array:
        word.setFeat('GOV', str(Word.invalidGov()))
        word.setFeat('LABEL', Word.invalidLabel())
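A hedged sketch of how prepareWordBufferForDecode might be exercised, with the function above in scope; Word, WordBuffer and the invalidGov()/invalidLabel() values below are made-up stand-ins, not the parser's real classes:

class Word:
    def __init__(self, feats=None):
        self.feats = dict(feats or {})

    def setFeat(self, name, value):
        self.feats[name] = value

    def getFeat(self, name):
        return self.feats.get(name)

    @staticmethod
    def invalidGov():
        return 0   # assumption: 0 marks "no governor assigned yet"

    @staticmethod
    def invalidLabel():
        return ''  # assumption: empty string marks "no label assigned yet"


class WordBuffer:
    def __init__(self, words):
        self.array = words


buf = WordBuffer([Word({'GOV': '2', 'LABEL': 'suj'})])
prepareWordBufferForDecode(buf)
# every word now has GOV == '0' and LABEL == ''; the reference values are
# expected to live under GOVREF/LABELREF elsewhere in the pipeline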
Example no. 20
 def readAllMcfFile(self):
     tokens = []
     for ligne in self.mcfFile:
         ligne = ligne.rstrip()
         tokens = ligne.split("\t")
         w = Word()
         for i in range(0, len(tokens)):
             w.setFeat(self.mcd.getColName(i), tokens[i])
         self.addWord(w)
     self.mcfFile.close()
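Hypothetical minimal stand-ins for the two objects readAllMcfFile leans on, setFeat on Word and getColName on the column description; the real classes in that parser carry more state:

class Mcd:
    # maps column positions of a tab-separated .mcf line to feature names
    def __init__(self, col_names):
        self._col_names = col_names

    def getColName(self, i):
        return self._col_names[i]


class Word:
    def __init__(self):
        self._feats = {}

    def setFeat(self, name, value):
        self._feats[name] = value

    def getFeat(self, name):
        return self._feats.get(name)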
Example no. 21
    def find_word(self):
        word = self.text1.get().lower().lstrip().rstrip()

        _word = Word()
        result = _word._translate(word)

        if result:
            self.text3.insert(0.0, result)
        else:
            message = "Подходящих слов нет\n\nПОРА БЫ ИХ ДОБАВИТЬ!!!"  # "No matching words / TIME TO ADD THEM!!!"
            self.text3.insert(0.0, message)
Example no. 22
 def parse_matrix(self, matrix):
     self.__check_format(matrix)
     for i in xrange(len(matrix)):
         words_coordinates = self.__extract_words_coordinates(matrix[i])
         for (x1, x2) in words_coordinates:
             self.horizontal_words.append(Word(i, x1, x2))
     for j in xrange(len(matrix[0])):
         vertical_list = [list[j] for list in matrix]
         words_coordinates = self.__extract_words_coordinates(vertical_list)
         for (y1, y2) in words_coordinates:
             self.vertical_words.append(Word(j, y1, y2))
     self.__build_tree(self.horizontal_words[0], False)
Example no. 23
    def readNextWord(self):
        line = self.mcfFile.readline()
        if line == "":
            return None

        line = line.rstrip()
        tokens = line.split("\t")
        w = Word()
        for i in range(0, len(tokens)):
            w.setFeat(self.mcd.getColName(i), tokens[i])
        self.addWord(w)
        return w
Example no. 24
def read_input():
    input_file = open("input.txt", "r")
    line1 = input_file.readline().rstrip("\n").split("OR")
    for i in line1:
        a.add_word(Word(i.strip()))

    num = int(input_file.readline())
    for i in range(num):
        line = input_file.readline().rstrip("\n").split("OR")
        new_sen = Sentence()
        for j in line:
            new_sen.add_word(Word(j.strip()))
        KB.append(new_sen)
Example no. 25
    def calculate_by_severity(self):  # Score based on the "severity" of each word
        dic = self.curtail_dictionary()
        counter = 0
        weight = 0
        wd = Word()

        for key, occurrences in dic.items():
            validated = wd.word_validation(key)
            weight = wd.categorize_word(validated)
            counter += occurrences * weight

        return counter
Example no. 26
def toCSVEntry(str):
    elements = str.split('\t')
    rel = elements[1]
    source = elements[2]
    target = elements[3]

    sourceForm = ''
    targetForm = ''
    sourceCategory = ''
    targetCategory = ''

    if source.endswith('/n') or source.endswith('/v') or source.endswith(
            '/a') or source.endswith('/s') or source.endswith('/r'):
        sourceForm = source[:-2].replace('/c/fr/', '').replace('_',
                                                               ' ').strip()
        sourceCategory = source[len(source) - 1:]
    else:
        sourceForm = source.replace('/c/fr/', '').replace('_', ' ').strip()

    if target.endswith('/n') or target.endswith('/v') or target.endswith(
            '/a') or target.endswith('/s') or target.endswith('/r'):
        targetForm = target[:-2].replace('/c/fr/', '').replace('_',
                                                               ' ').strip()
        targetCategory = target[len(target) - 1:]
    else:
        targetForm = target.replace('/c/fr/', '').replace('_', ' ').strip()

    line = sourceCategory + "\t" + source + "\t" + sourceForm + "\t" + rel + "\t" + targetCategory + "\t" + target + "\t" + targetForm + "\n"

    idSource = toId(sourceForm, sourceCategory)
    idTarget = toId(targetForm, targetCategory)

    sourceWord = None
    targetWord = None

    try:
        sourceWord = words[idSource]
    except KeyError:
        sourceWord = Word(sourceForm, sourceCategory)
        words[idSource] = sourceWord

    try:
        targetWord = words[idTarget]
    except KeyError:
        targetWord = Word(targetForm, targetCategory)
        words[idTarget] = targetWord

    sourceWord.add_relation(rel, targetWord)
    #sourceWord.print_relations_count()
    return line
Example no. 27
def ranked(*words, directory='lookup', func=lambda *x: 1):
    if directory == 'stemming':
        words = list(map(lambda x: stem(x), words))

    query = []
    processed = []
    for w in words:
        if w in processed:
            continue
        query.append(words.count(w))
        processed.append(w)

    query = np.array(list(map(lambda x: x / sum(query), query)))

    documents = set()
    # query = np.array([1/len(words) for i in range(len(words))])
    heap = []

    Words = []
    DFt = []
    for word in processed:
        WordInstance = Word(word, directory)
        Words.append(WordInstance)
        documents = documents.union(WordInstance.documents())
        DFt.append(len(WordInstance.documents()))

    N = len(documents)
    iDFt = list(map(lambda X: np.log10(N / X), DFt))

    for document in documents:
        vector = []
        for WordInstance in Words:
            ## calculate the TF(t,d)
            count = WordInstance.count(document)
            if count > 0:
                vector.append(1 + np.log10(count))
            else:
                vector.append(0)

        ## tf-idf weighting
        for i in range(len(iDFt)):
            vector[i] = vector[i] * iDFt[i]

        vector = np.array(vector)

        score = func(query, vector)

        heapq.heappush(heap, (score, document))

    return heap
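The per-document weights computed above are standard log-scaled tf-idf. As a self-contained helper (a sketch for reference, not part of the original module):

import numpy as np

def tfidf_weight(tf, df, n_docs):
    # w(t, d) = (1 + log10(tf)) * log10(N / df); 0 when the term is absent
    if tf == 0 or df == 0:
        return 0.0
    return (1 + np.log10(tf)) * np.log10(n_docs / df)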
Example no. 28
    def curtail_dictionary(self):
        aux_dic = {}
        dic = self.get_dictionary()
        wd = Word()

        for key in dic:
            word = wd.word_validation(key)
            occu = dic[key]

            if word in aux_dic:
                aux_dic[word] += occu
            else:
                aux_dic[word] = occu

        return aux_dic
Example no. 29
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts into
    a Hash_Set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        None. Each Word object in hash_set contains the number of comparisons
        required to insert that Word object from fv into hash_set.
    -------------------------------------------------------
    """
    fv.seek(0)
    lines = fv.readlines()

    for line in lines:
        #print("[{}]".format(line.rstrip()))
        words = line.split(' ')
        for word in words:
            if word.isalpha():
                #print("Word: {}".format(word))
                # Ignoring any punctuation and words with punctuation
                _word = Word(word.lower())
                hash_set.insert(_word)
    return
Example no. 30
 def MatchWords(self, hand, anchor, board):
     # match available tiles in hand to possible words for a certain anchor
     anchorWords = anchor.GetPossibleWords()
     handTiles = hand.PeekHand()
     anchorTile = anchor.GetTile()
     if anchorTile.GetLetter() == " ":
         handTiles.append(anchorTile)
     tiles = handTiles
     totalHand = Word(tiles)
     options = anchorWords.WordSearch(totalHand)
     optionsCleaned = dict()
     direction = anchor.GetDirection()
     timeStart = time.time()
     shuffledOptions = list(options.GetDict().values())
     random.shuffle(shuffledOptions)
     #print(shuffledOptions)
     for strWordList in shuffledOptions:
         for strWord in strWordList:
             if (len(strWord) <= len(handTiles)) :
                 word = self.MakeItWord(strWord)
                 if anchor.GetLetter() == " ":
                     indices = [int(len(strWord)/2)]
                 else:
                     indices = [i for i, a in enumerate(word.GetString()) if a == anchor.GetLetter() ]
                 for i in indices:
                     if board.IsWordLegal(word, anchor, i, direction):
                         optionsCleaned[word] = (i, direction)
         timeDiff = time.time() - timeStart
         if (timeDiff > 5):
             break
     return optionsCleaned
Example no. 31
def comparison_total(hash_set):
    """
    -------------------------------------------------------
    Sums the comparison values of all Word objects in hash_set.
    -------------------------------------------------------
    Parameters:
        hash_set - a hash set of Word objects (Hash_Set)
    Returns:
        total - the total of all comparison fields in the Hash_Set
            Word objects (int)
        max_word - the word having the most comparisons (Word)
    -------------------------------------------------------
    """

    total = 0
    max_word = Word('a')

    for word in hash_set:

        total += word.comparisons

        if word.comparisons > max_word.comparisons:

            max_word = word

    return total, max_word
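A minimal, hypothetical Word/Hash_Set pair consistent with how insert_words and comparison_total above use them (the real classes behind these snippets differ in detail): Word carries a comparisons counter that Hash_Set.insert fills in.

class Word:
    def __init__(self, text):
        self.text = text
        self.comparisons = 0

    def __eq__(self, other):
        return self.text == other.text

    def __hash__(self):
        return hash(self.text)


class Hash_Set:
    def __init__(self, slots=32):
        self._table = [[] for _ in range(slots)]

    def insert(self, word):
        bucket = self._table[hash(word) % len(self._table)]
        for existing in bucket:
            word.comparisons += 1     # one comparison per entry inspected
            if existing == word:
                return False          # duplicate, not inserted
        bucket.append(word)
        return True

    def __iter__(self):
        for bucket in self._table:
            for word in bucket:
                yield word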
Example no. 32
 def addWordToList(self, word, fWord):
     if self.wordInList(word):
         return False
     else:
         newWord = Word(word, fWord)
         self.wordList.append(newWord)
         return True
Example no. 33
    def extractTopics(self, text_list, threshold=0.5):
        """ Return a list of Words which seems to be the current topic """
        if not text_list: return []
        #Make pseudo sentences from the text_list
        pCandidates = []
        for message in text_list:
            pSentence = self.parser.parseSentence(message)
            if not pSentence: return []
            #pick out only the Candidates from the pseudo words
            candidates = [x for x in pSentence if x['main_type'] in [u'名詞']]  # u'名詞' = "noun"
            #print 'candidates:%s' % candidates
            #pCandidates is a list of pseudo words which are candidates of the topic
            if candidates: pCandidates.extend(candidates)
            #print 'pCandidates:%s' % pCandidates
        #Next we make a dictionary of occurences of this particular candidates
        #Convert the pseudo words to real words
        true_candidates = Sentence()
        true_candidates.pseudo2real(pCandidates)
        #print 'true_candidates:%s' % true_candidates.words
        #make a dictionary {word_id:occurence}
        sample_words = {}
        for w in true_candidates:
            if sample_words.has_key(int(w.id)):
                sample_words[int(w.id)] += 1
            else:
                sample_words[int(w.id)] = 1

        if len(sample_words) < 1: return []
        #print 'sample_words:%s' % sample_words

        #We make the actual occurences from the database
        base_words = {}
        for word_id in sample_words.keys():
            base_words[word_id] = Word.get(word_id).occurence

        sample_count = 0
        for k, v in sample_words.iteritems():
            sample_count += v

        base_count = 0
        for k, v in base_words.iteritems():
            base_count += v

        scores = {}
        for w in sample_words.keys():
            scores[w] = self.score(float(sample_words[w]), float(sample_count),
                                   float(base_words[w]), float(base_count))

        items = [(v, k) for k, v in scores.items()]
        items.sort()
        items.reverse()  # so largest is first
        candidate_keywords = [x[1] for x in items if x[0] > threshold]
        #Fallback - if no topics are found, we choose a random noun
        # Yucky, but makes Juna more talkative.
        if (not candidate_keywords) and (candidates):
            choice = [candidates[int(random.random() * len(candidates))]]
            s = Sentence()
            s.pseudo2real(choice)
            candidate_keywords = [s[0].id]
        return candidate_keywords
Example no. 34
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts into
    a Hash_Set.
    Each Word object in hash_set contains the number of comparisons
    required to insert that Word object from file_variable into hash_set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        None
    -------------------------------------------------------
    """
    lines = fv.read()
    words = lines.split()

    for word in words:

        if word.isalpha():

            k = Word(word.lower())
            hash_set.insert(k)

    return
Example no. 35
def oracle(c):
    if (c.getStack().isEmpty()):
        return ('SHIFT', '')

    s0_index = c.getStack().top()
    b0_index = c.getBuffer().getCurrentIndex()
    #    print("s0_index = ", s0_index)
    s0_gov_index = int(
        c.getBuffer().getWord(s0_index).getFeat('GOVREF')) + s0_index
    s0_label = c.getBuffer().getWord(s0_index).getFeat('LABELREF')
    #    print('s0_index = ', s0_index, 'b0_index = ', b0_index, 's0_gov_index = ', s0_gov_index, 'b0_gov_index = ', b0_gov_index, 's0 label =', s0_label)

    if (s0_gov_index == b0_index):
        return ('LEFT', c.getBuffer().getWord(s0_index).getFeat('LABELREF'))

    if (b0_index < c.getBuffer().getLength()):
        b0_gov_index = int(
            c.getBuffer().getWord(b0_index).getFeat('GOVREF')) + b0_index
        if (b0_gov_index == s0_index):
            return ('RIGHT',
                    c.getBuffer().getWord(b0_index).getFeat('LABELREF'))

    if ((c.getStack().getLength() > 1)
            and check_all_dependents_of_word_in_ref_are_in_hyp(c, s0_index)
            and  # word on top must have all its dependents
        (int(c.getBuffer().getWord(c.getStack().top()).getFeat('GOV')) !=
         Word.invalidGov())):  # word on top of the stack has a governor
        return ('REDUCE', '')

    #print("no movement possible return SHIFT")
    if not c.getBuffer().endReached():
        return ('SHIFT', '')
    print("The machine is stucked")
    exit(1)
Example no. 36
    def test_case_find(self):
        letters = Letter.from_raw(test.letters)
        grid = Grid(test.grid)
        words = Word.find_all(letters, grid, test.language)

        assert len(words) == test.wordcount
        assert str(words[0]) == test.topword
        assert words[0].value == test.topvalue
Example no. 37
 def loadFromParser(self, parsedSentence):
     #Read the parseTree
     parseTree = parsedSentence['parsetree']
     #Read the Dependencies
     dependencies = parsedSentence['dependencies']
     #Read the Words
     wordsInJson = parsedSentence['words']
     words = map(lambda w : Word.loadFromParser(w), wordsInJson)
     return self(dependencies, words, parseTree)
Example no. 38
 def deserialize(rep):
     navi = Navigator(rep["systemId"])
     wordsGenerated = []
     for word in rep["wordsGenerated"]:
         wordsGenerated.append(Word.deserialize(word))
     navi.wordsGenerated = wordsGenerated
     navi.length = rep["length"]
     navi.cutoff_length = rep["cutoff_length"]
     return navi
Example no. 39
 def createWord(word_str, pof_line):
     """
     Parses part of speech line
     :rtype: Word
     """
     knownPofs = [Word.NOUN, Word.VERB, Word.ADJ]
     ignored = ['OBJECT', 'ADJ_DIST', 'PREFIX']
     match = re.search(ur'^\[([\w]+):(\w+).*\]$', pof_line)
     if not match:
         raise Exception('Unknown line format: %s' % format(pof_line))
     pof = match.group(1)
     if pof not in knownPofs and pof not in ignored:
         print pof_line
         raise Exception('Unknown pof: %s' % format(pof))
     if pof in ignored:
         return None
     word = Word(word_str, '', pof, '')
     word.word_form = match.group(2).capitalize()
     return word
Example no. 40
	def make(self,stressedipasylls_text,token):
		stressedipa=stressedipasylls_text[0]
		sylls_text=stressedipasylls_text[1]
		
		stress=self.stressedipa2stress(stressedipa)
		(prom_stress,prom_strength)=self.getStrengthStress(stress)
		syllphons=self.ipa2phons(stressedipa)
		
		sylls=[]
		for i in range(len(syllphons)):
			syllbody=self.use('SyllableBody',syllphons[i])
			syll=self.use('Syllable',(syllbody,prom_strength[i],prom_stress[i]))
			sylls.append(syll)
		word=Word(token,sylls,sylls_text)
		word.stress=stress
		word.lang=self.lang
		
		
		return word
Example no. 41
 def setOb(self, ob):
     Word.setOb(self, ob)
     if "translations" in ob:
         for text in ob["translations"]:
             if not (text in self.rl.ru):
                 ruWord = RuWord({"text": text}, self.rl, 1)
                 self.rl.add(ruWord)
             self.trs.append(self.rl.ru[text])
     oneTyp = False
     for typ in typs:
         if typ in ob:
             if oneTyp:
                 oneTyp = False
                 break
             else:
                 oneTyp = typ
     if oneTyp:
         self.oneTyp = oneTyp
         self.rats[oneTyp] *= 10
Example no. 42
 def tokenize(self, sent, pos_tags):
     head = Word()
     head.word = [sent[0].lower()]
     head.actual = [sent[0]]
     head.pos = pos_tags[0]
     curr = head
     length = len(sent)
     for i in xrange(1, len(sent)):
         new_word = Word()
         new_word.left = curr
         curr.right = new_word
         new_word.word = [sent[i].lower()]
         new_word.actual = [sent[i]]
         new_word.pos = pos_tags[i]
         curr = new_word
     return head, length
Example no. 43
def testValidator():
    val=WordValidator()
    word=Word("","","")
    try:
        val.wordempty(word.get_id(),word.get_lang(),word.get_word())
    except ValueError:
        assert True
        
    word1=Word("1","","")
    try:
        val.wordempty(word1.get_id(),word1.get_lang(),word1.get_word())
    except ValueError:
        assert True    
Example no. 44
 def loadFromParser(self, parsedSentence, tokenCountBeforeThisSentence = 0, docId = 0):
     #Read the parseTree
     parseTree = parsedSentence['parsetree']
     #Read the Dependencies
     dependencies = parsedSentence['dependencies']
     #Read the Words
     wordsInJson = parsedSentence['words']
     words = map(lambda w : Word.loadFromParser(w), wordsInJson)
     startIndex = 0;
     if tokenCountBeforeThisSentence != 0:
         startIndex = tokenCountBeforeThisSentence + 1;
     return self(dependencies, words, parseTree, startIndex, startIndex + len(words) - 1, docId)
Example no. 45
    def play(self):
        if not self.my_turn:
            return

        words = Word.find_all(self.letters, self.grid, self.locale)
        if len(words) == 0:
            self.swap()
        else:
            for word in words:
                if self.submit_word(word):
                    return
            self.swap()
Example no. 46
    def getWordsAndTheirParses(self, words):

        for word in words:
            lines = word.split("\n")
            firstLine = lines[0]
            splittedFirstLine = firstLine.split(":")
            if len(splittedFirstLine) < 2:
                continue
            wordText = splittedFirstLine[0]

            unique_serialized_word = self.upsertWord(wordText)

            correctMorphParseNumber = splittedFirstLine[1]  # get correct
            correctMorphParseFull = lines[int(correctMorphParseNumber)]

            unique_serialized_word.addParse(correctMorphParseFull)

           # if "Punc" not in correctMorphParseFull:
            w = Word(wordText)  # add all words to all_words_in_corpus
            w.correct_parse = Parse(correctMorphParseFull)
            self.all_words_in_corpus.append(w)
Example no. 47
def load_vectors(filename):

	def normalizeString(vec):
		vec = [ float(x) for x in vec]
		total = sqrt( sum([v**2 for v in vec]) )
		new_vec = []
		for v in vec:
			new_vec.append(v/total)
		return tuple(new_vec)
	
	print "\tLoading projections..."
	f = open(filename,'r')
	f.readline()
	content = [ filter( lambda x : not x in ["\n",""], l.replace("\n", "").split(" ")) for l in f.readlines() ]
	content = [ (l[0], normalizeString(l[1:])) for l in content ]
	content = filter(lambda x : not x[1] == None, content)
	words = dict()
	for (word, vector) in content:
		wordClass = Word(word)
		# print word, wordClass
		if wordClass.relevant():
			# print "Keeping: ", word, wordClass.pos()
			words[word.lower()] = vector
	return words
Example no. 48
    def apply(self, word, other_word, words):
        rule_changed_any_tag = False
        condition = (other_word.assigned_parse.tag == self.tag_b)
        tag_a_is_at_least_one_tag_of_morph_parses_of_word = False

        parse_to_assign_to_word = None
        unique_word = Word.find_word_by_text(word.text, words)
        for parse in unique_word.parses:

            if parse.tag == self.tag_a:
                tag_a_is_at_least_one_tag_of_morph_parses_of_word = True
                parse_to_assign_to_word = parse
                break

        if condition is True and tag_a_is_at_least_one_tag_of_morph_parses_of_word is True:
            if word.assigned_parse != parse_to_assign_to_word:
                word.assigned_parse = parse_to_assign_to_word
                rule_changed_any_tag = True
                #print(word.text + " changed." + self.text)
        return rule_changed_any_tag
Example no. 49
def tag_word(word, tags):
    '''A helper function to tag words.'''
    
    # Make the word a Word class. Cleanse it.
    word = Word(word)
    word.cleanse()
    
    # Open the tagging file. Match each tag with word in question. Once found, replace word
    # with tagged word.
    with codecs.open(tags, "r", "utf-8") as tags:   
        for tagline in tags:
            tagline = tagline.strip().split(', ') 
            if word.clean().lower() == tagline[0]:
                replacement = tagline[1] + word.__str__() + tagline[2]
                return replacement
    
    # If the word's not in the tag list, close tagging file and return the word as is.
    tags.close()
    return word.__str__()
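tag_word only needs a Word that can cleanse itself, report the cleaned form, and echo the original; a hypothetical minimum (the real class and its cleansing rules are not shown in this snippet) could look like:

class Word:
    def __init__(self, raw):
        self.raw = raw
        self._clean = raw

    def cleanse(self):
        # assumption: strip surrounding punctuation, keep inner characters
        self._clean = self.raw.strip('.,;:!?()"\'')

    def clean(self):
        return self._clean

    def __str__(self):
        return self.raw

# a line of the tags file is assumed to look like:  word, <open-tag>, <close-tag>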
Example no. 50
    def _checkSpellingInteractive(self, page):
        """Interactively goes through all wrong words in a page.

        All we do here is save doReplace = True if we want to replace it, while
        doReplace will do the actual replacement.
        """

        title = page.title()
        text = page.get()
        words = page.words
        for w in words: 
            w.doReplace = False

        # Go through all wrong words in this page
        for w in words:
            smallword = w.word

            # Check if on ignore list -> continue
            if self.pm.checkIsIgnored(title, smallword):
                continue

            bigword = Word(w.bigword)
            loc = w.location

            # Try to find replacement site
            w.site = text.find(bigword.word, loc)
            if w.site == -1: 
                w.site = text.find(bigword.word)
            if w.site == -1: 
                pywikibot.output(u"Not found any more in %s: %s" % (
                title, bigword.word))
                continue

            # We now have a potential site for replacement
            sugg = w.correctword
            w.LocAdd = len(bigword)

            # Check if the word has been replaced in the meantime with the
            # correct suggestion
            if len(text) > loc + len(sugg) and \
              text[w.site:w.site+len(sugg)].lower() == sugg.lower():
                continue
            if smallword == sugg: 
                continue

            # Adjust case
            if smallword[0].isupper(): 
                sugg = sugg[0].upper() + sugg[1:]

            # Print the two words
            pywikibot.output(u"Replace \03{lightred}\"%s\"" % smallword +
              "\03{default} \nby      \03{lightgreen}\"%s\"\03{default}" % sugg)

            # Print context
            pywikibot.output(u"    %s" % text[max(0,w.site-55):w.site+len(w)+55])
            choice = pywikibot.inputChoice('', ['Yes', 'yes', 'No', 'no',
               'No but dont save', 'never replace', 'Replace by something else',
               '<--!sic!-->',
               'Exit and go to next site'], ['y', '\\', 'n', ']', 'b', 'v', 'r', 's', 'x'])

            # Evaluate user choice
            if choice == 'b':
                continue
            if choice in ('v'): 
                self.pm.markCorrectWord(smallword)
                continue
            if choice in ('n', ']'): 
                self.pm.markCorrectWordPerPage(title, smallword)
                continue
            if choice == 'x': 
                return
            if choice == 'r':
                w.replacement = pywikibot.input(u"What should I replace \"%s\" by?"
                                              % bigword.word)
                w.doReplace = True
            if choice == 's':
                w.replacement = bigword.word + '<--!sic!-->'
                w.doReplace = True

            if choice in ( 'y','\\'):
                w.replacement = bigword.replace(sugg)
                w.doReplace = True

                self.pm.markReplaced(smallword, sugg)
Example no. 51
    def get_most_likely_morph_parse(self, word_text):
        serialized_word = Word.find_word_by_text(word_text, self.words)
        if serialized_word is None or len(serialized_word.parses) == 0:
            return None

        return serialized_word.parses[0]  # to get the most likely
Example no. 52
def analyze_sentence(sentence):
    sentence = sentence.strip()
    if sentence == "":
        return
    nouns = []
    verbs = []
    adjectives = []
    current_word = start_word
    current_grammer = start_grammer
    previous_word = start_word
    previous_grammer = start_grammer

    for i in sentence.split():
        # See if word exists in our dictionary
        # If not create it
        if word_dictionary.has_key(i) == True:
            current_word = word_dictionary[i]
            current_grammer = current_word.getGrammer()
        else:
            # Get the grammer
            grammer = summartUtil.getWordFunction(i)

            # See if that grammer type exists, if not create it
            if grammer_dictionary.has_key(grammer) == True:
                current_grammer = grammer_dictionary[grammer]
            else:
                current_grammer = Word(grammer)
                grammer_dictionary[grammer] = current_grammer

            current_word = Word(i, current_grammer)
            word_dictionary[i] = current_word

            # Increase how much the word and grammer have been used
        current_word.increaseUsage()
        current_grammer.increaseUsage()

        # Link words together as valid words that can be next to
        # eachother
        previous_word.addPostWord(current_word)
        current_word.addPreWord(previous_word)

        previous_grammer.addPostWord(current_grammer)
        current_grammer.addPreWord(previous_grammer)

        previous_word = current_word
        previous_grammer = current_grammer

        # Check if the word is black listed meaning we don't want
        # any contextual linking
        if i in BLACK_LIST:
            continue

            # Link the context based on what type of word it is
        if current_word.getGrammer().getWord() == "noun":
            nouns.append(current_word)
        elif current_word.getGrammer().getWord() == "verb":
            verbs.append(current_word)
        elif current_word.getGrammer().getWord() == "adjective":
            adjectives.append(current_word)

    current_word.addPostWord(end_word)
    end_word.addPreWord(current_word)
    current_grammer.addPostWord(end_grammer)
    end_grammer.addPreWord(current_grammer)

    for i in nouns:
        current_word = i
        for j in nouns:
            if i == j:
                continue

            current_word.addRelated(j)

        for j in verbs:
            current_word.addRelated(j)
            j.addRelated(current_word)

        for j in adjectives:
            current_word.addAdjective(j)
            j.addRelated(current_word)
Example no. 53
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import summartUtil
from Word import Word
from Markov import Markov
import sys

DEBUG = False

# Words to not type
BLACK_LIST = ["the", "has", "hasn't", "have", "havn't", "a", "an", "is", "it", "to", "its"]

# The grammer a sentence can start with
start_grammer = Word(" ")
# The grammer a sentence can end with
end_grammer = Word(".")
# All the grammer we know about
grammer_dictionary = {start_grammer.getWord(): start_grammer, end_grammer.getWord(): end_grammer}
# The words that can start a sentence
start_word = Word(" ", start_grammer)
# The words that can end a sentence
end_word = Word(".", end_grammer)
# All the words we know about
word_dictionary = {start_word.getWord(): start_word, end_word.getWord(): end_word}


def analyze_sentence(sentence):
    sentence = sentence.strip()
    if sentence == "":
        return
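A hypothetical minimal Word for this analyzer, covering just the methods analyze_sentence (Example no. 52) calls; the real Word.py in that project tracks more, e.g. the data used for the Markov generation step:

class Word:
    def __init__(self, word, grammer=None):
        self.word = word
        self.grammer = grammer   # the part-of-speech Word; None for grammer entries themselves
        self.usage = 0
        self.pre_words, self.post_words = set(), set()
        self.related, self.adjectives = set(), set()

    def getWord(self):
        return self.word

    def getGrammer(self):
        return self.grammer

    def increaseUsage(self):
        self.usage += 1

    def addPreWord(self, other):
        self.pre_words.add(other)

    def addPostWord(self, other):
        self.post_words.add(other)

    def addRelated(self, other):
        self.related.add(other)

    def addAdjective(self, other):
        self.adjectives.add(other)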
Example no. 54
 def __init__(self, ob, rl):
     self.trs = []
     Word.__init__(self, ob, rl)
Example no. 55
def buildcorpus(corpus, rootpath, filelimit = 0):
    
    #rootpath = corpus.rootpath
    fileids = os.listdir(rootpath)
    
    hugewordlist = []   
    hugewordlist.extend(corpus.words)   # will contain distinct Word instances

    numoffiles = 0
    
    corpus.set_corpusname(str(max(filelimit, len(fileids)))+"texts")
    
    for fileid in fileids:
    
        
        allwords = nltk.FreqDist()    # will contain all words in this text
        
        doc_id = fileid.split(".")[0]
        # corpus.inserttext(doc_id)    ##### !  should pass the Text object itself
        newtext = Text(doc_id)
        
        path = rootpath + os.sep + fileid
        #lines = readtextlines(path)
    
        #rawtext = texter.readtxtfile(path)
        rawtext = texter.readnewstext(path)
        lines = texter.splitToSentences(rawtext)
        
        sntindex = 0
        # each line is a sentence
        for line in lines:
            words = []   # words in this sentence
            words = line.split()
            words = texter.eliminatepunctuation(words)
            words = [word for word in words if not word.isspace()]
            
            
            
            for word in words:
                allwords.inc(word)
                
                
                newword = Word(word)
                newword.insertsentenceid(doc_id+"_"+str(sntindex))
                
                if allwords[word] <= 1:    # if this was not added to the hugelist before, add it
                    hugewordlist.append(newword)
                
                    
            sentence = Sentence(sntindex)
            sntindex = sntindex + 1
            
            # should we store a Word or a word index in the sentence?
            for word in words:
                index = hugewordlist.index(Word(word))
                hugewordlist[index].insertsentenceid(doc_id+"_"+str(sntindex-1))
                sentence.insertword(index)
                
            newtext.insertsentence(sentence)
            
        if (not rawtext.isspace()) or (len(allwords) != 0):   
            corpus.inserttext(newtext)    
            
            print str(numoffiles)," : finished handling the words-snts-txts ",doc_id 
    
                
            numofwords = reduce(lambda x,y : x+y, allwords.values())
            
            for word in hugewordlist:
                cnt =  allwords[word.literal]
                #freq = cnt / float(numofwords)
                word.assigntermfreq(cnt, numofwords, doc_id)
                #hugewordlist[index].toscreen()
        
        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break       

        
    # end for - docs
    

    numofdocs = len(fileids)
    print "computing tf*idf"
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()
        #word.toscreen()
        
    corpus.assignwords(hugewordlist)
    print "corpus length ",str(len(corpus.words))," words"
    print "huges length ",str(len(hugewordlist))," words"
    print "exiting buildcorpus()"
    
    print "pickle-dumping words"
    corpus.pickledumpwords()
Example no. 56
    def spellcheck_blacklist(self, text, badDict, return_for_db=False,
                             return_words=False, title=None, verbose=False,
                             range_level="full"):
        """ Checks a single text against the words in the blacklist and returns
        a list of wrong words.
        """

        loc = 0 # the current location in the text we parse
        old_loc = 0
        curr_r = 0
        ranges = self.forbiddenRanges(text, level=range_level)

        ranges = sorted(ranges)
        wrongWords = []
        prepare = []
        j = 0

        # Regex to find next word: look for any whitespace or control
        # characters followed by a "word" stopping at the next whitespace or
        # control character.
        wordsearch = re.compile(r'([\s\=\<\>\_/-]*)([^\s\=\<\>\_/\-|]+)')
        if self._testcase_compat:
            wordsearch = re.compile(r'([\s\=\<\>\_/-]*)([^\s\=\<\>\_/\-]+)')

        while True:

            if verbose:
                print "== Start wordsearch at location", loc

            match = wordsearch.search(text,loc)
            LocAdd = 0
            j = j + 1

            if not match:
                # No more words on this page
                break

            if verbose:
                print j, "Check '%s'" % text[ match.start():match.end()].encode("utf8"), "at loc", loc

            # Check if we are in forbidden range
            curr_r, loc, in_nontext = self.check_in_ranges(ranges, 
                                       match.start(), match.end(), curr_r, loc)

            if verbose:
                print "    -> moved loc pointer to ", loc, "skip is", in_nontext

            if in_nontext:
                continue

            # Split the words up at special places like &nbsp; or a dash
            spl = re.split('&nbsp;', match.group(2))
            if len(spl) > 1: 
                LocAdd = 5
            elif len(spl) == 1:
                spl = re.split(u'–', spl[0])

            loc_start = loc + len(match.group(1)) # start of the word
            ww = spl[0]
            LocAdd += len(ww) + 1
            bigword = Word(ww)
            smallword = bigword.derive()

            if verbose:
                print "    ==> smallword", smallword.encode("utf8")

            done = False
            for r in ranges:
                # If the end of the range coincides with the start of the word
                # we might not have a full word -> rather discard it.
                if r[1] == loc_start:
                    loc += LocAdd
                    done = True
                    if verbose:
                        print "    we are done with ", smallword.encode("utf8"), "due to range", r

            if done:
                continue

            # We advance the location by the characters skipped (group 1)
            loc += len(match.group(1))
            if range_level != "none" or self._testcase_compat:
                done = self._text_skip(text, loc, smallword, title, return_for_db)
            if verbose:
                print "    new loc (after accounting for skipped chars)", loc, "which is '%s'" % match.group(1)

            ###################################
            #use this code to insert into the database
            if return_for_db:
                if not done:
                    if verbose: print "    ===> append word for db: ", smallword.encode("utf8")

                    wrongWords.append(smallword)

            else:
                ###################################
                #here we check whether it is wrong
                if not done and smallword.lower() in badDict \
                   and not smallword == '' and not smallword.isupper():

                    if not smallword == badDict[smallword.lower()]:
                        if return_words:
                            wrongWords.append(
                                WrongWord(wrong_word = smallword,
                                          location = loc, 
                                          bigword = bigword.word,
                                          correctword = badDict[smallword.lower()]
                                ) 
                            )
                        else:
                            wrongWords.append([smallword, bigword, loc, badDict[smallword.lower()],
                                text[max(0, loc-100):min(loc+100, len(text))] ])

            # We advance the location by the characters of the word (group 2)
            loc += LocAdd
            if verbose and len(text) > loc:
                print "    new loc (after accounting for word)", loc, "we are at", text[loc].encode("utf8")

        return wrongWords
Example no. 57
	def __init__(self, ob, rl, after=0):
		Word.__init__(self, ob, rl)
		self.after = after