Exemple #1
0
    def make_ngrams(self, text):
        """Transform a string into a list of 4-grams, using multiple cores.

        The text is cut into chunks with ``timbl.divide_iterable`` (a list of
        words for approach ``'w'``, the raw string for approach ``'l'``).
        Every chunk is windowed in its own process and the per-chunk results
        are concatenated in chunk order.

        Args:
            text: The string to turn into n-grams.

        Returns:
            One flat list holding the n-grams of all chunks, in chunk order.

        Raises:
            ValueError: If ``self.approach`` is neither ``'w'`` nor ``'l'``.
        """
        # Pick the windowing function and the chunking once, up front, so an
        # unsupported approach fails with a clear error instead of an
        # unbound-variable error further down.
        if self.approach == 'w':
            window = timbl.window_string
            substrings = timbl.divide_iterable(text.split(), 10, 3)
        elif self.approach == 'l':
            window = timbl.window_string_letters
            substrings = timbl.divide_iterable(text, 10, 15)
        else:
            raise ValueError('unknown approach: %r' % (self.approach,))

        result = multiprocessing.Queue()

        def worker(nr, string, result):
            # Only the first chunk is windowed with the extra True flag
            # (presumably marking the start of the text - confirm in timbl).
            if nr == 0:
                ngrams = window(string, True)
            else:
                ngrams = window(string)

            # The chunk number travels along so the parent can restore order.
            result.put((nr, ngrams))

        # Start one worker per chunk and remember it, so the number of
        # expected results is known no matter how many chunks were produced.
        # NOTE: a nested function as process target requires the 'fork'
        # start method, exactly as before.
        processes = []
        for n, i in enumerate(substrings):
            t = multiprocessing.Process(target=worker, args=[n, i, result])
            t.start()
            processes.append(t)

        # Block until every started worker has delivered its result. The
        # queue must be drained before joining, otherwise a worker holding a
        # large result could never finish writing it.
        resultlist = [result.get() for _ in processes]

        for t in processes:
            t.join()

        # Sort by chunk number and merge the results
        resultlist.sort(key=lambda x: x[0])
        end_result = []
        for _, ngrams in resultlist:
            end_result.extend(ngrams)

        return end_result
Exemple #2
0
    def make_ngrams(self, text):
        """Transform a string into a list of 4-grams, using multiple cores.

        Depending on ``self.approach`` the text is chunked as a word list
        (``'w'``) or as a plain string (``'l'``) by ``timbl.divide_iterable``.
        Each chunk is handed to a separate process; the parent gathers the
        per-chunk n-gram lists and glues them together in chunk order.

        Args:
            text: The string to turn into n-grams.

        Returns:
            A flat list with the n-grams of every chunk, in chunk order.

        Raises:
            ValueError: If ``self.approach`` is neither ``'w'`` nor ``'l'``.
        """
        approach = self.approach
        if approach not in ('w', 'l'):
            # Fail early and readably; previously this surfaced as an
            # unbound local variable.
            raise ValueError('unknown approach: %r' % (approach,))

        result = multiprocessing.Queue()

        def worker(nr, string, result):
            if approach == 'w':
                windower = timbl.window_string
            else:
                windower = timbl.window_string_letters

            # The first chunk (nr == 0) is the only one windowed with the
            # additional True argument (meaning defined by timbl - presumably
            # a start-of-text marker, confirm there).
            ngrams = windower(string, True) if nr == 0 else windower(string)

            # Tag the result with the chunk number for later ordering.
            result.put((nr, ngrams))

        if approach == 'w':
            substrings = timbl.divide_iterable(text.split(), 10, 3)
        else:
            substrings = timbl.divide_iterable(text, 10, 15)

        # Starts the workers; keep the handles so we know how many results
        # to expect and can join the processes afterwards.
        # NOTE: using a nested function as target depends on the 'fork'
        # start method, as in the original code.
        started = []
        for n, i in enumerate(substrings):
            t = multiprocessing.Process(target=worker, args=[n, i, result])
            t.start()
            started.append(t)

        # Wait until all results are in: one blocking get per started worker
        # instead of polling for a fixed count of 10 with a sleep.
        resultlist = []
        for _ in started:
            resultlist.append(result.get())

        # Join only after the queue has been drained; joining first can
        # deadlock while a worker is still flushing a large result.
        for t in started:
            t.join()

        # Sort and merge the results
        end_result = []
        for _, ngrams in sorted(resultlist, key=lambda x: x[0]):
            end_result += ngrams

        return end_result
Exemple #3
0
    def attenuate_string_multicore(self, string, lex):
        """Replace infrequent words in string with #DUMMY, using multiple cores.

        The string is split on whitespace, the word list is cut into chunks
        with ``timbl.divide_iterable`` and every chunk is processed in its
        own process. The processed chunks are put back together in their
        original order.

        Args:
            string: The text to attenuate; it is split on whitespace.
            lex: Mapping indexed by word length. A word is kept when it is
                contained in ``lex[len(word)]``.

        Returns:
            The words of ``string`` in their original order, separated by
            single spaces, with every word that is not found in ``lex``
            replaced by ``'#DUMMY'``.
        """
        # Prepare input and output
        words = string.split()
        result = multiprocessing.Queue()

        # The actual work
        def dummify(word):
            try:
                known = word in lex[len(word)]
            except KeyError:
                # No entry at all for words of this length. This also
                # applies to '_' when lex has no entry for length 1.
                return '#DUMMY'

            # '_' is kept even when it is not in the lexicon.
            if known or word == '_':
                return word
            return '#DUMMY'

        # Starts the workers
        def worker(nr, words, result):
            wordtotal = len(words)
            attenuated = []

            for n, i in enumerate(words):
                attenuated.append(dummify(i))

                # Report progress of the first worker
                if nr == 0 and n % 100000 == 0:
                    print('  ', n / wordtotal)

            # Join once at the end instead of growing a string word by word;
            # the chunk number lets the parent restore the order.
            result.put((nr, ' '.join(attenuated)))

        substrings = timbl.divide_iterable(words, 10)

        # Keep the process handles so the number of expected results is
        # known regardless of how many chunks were produced.
        # NOTE: a nested function as process target requires the 'fork'
        # start method, exactly as before.
        processes = []
        for n, i in enumerate(substrings):
            t = multiprocessing.Process(target=worker, args=[n, i, result])
            t.start()
            processes.append(t)

        # Wait until all results are in: block on the queue once per started
        # worker. Drain before joining, otherwise a worker still flushing a
        # large result could never terminate.
        resultlist = [result.get() for _ in processes]

        for t in processes:
            t.join()

        # Sort and merge the results. Empty chunks are skipped so the words
        # stay separated by exactly one space.
        resultlist.sort(key=lambda x: x[0])
        actual_result = [x[1] for x in resultlist if x[1]]

        return ' '.join(actual_result)
Exemple #4
0
    def attenuate_string_multicore(self, string, lex):
        """Replace infrequent words in string with #DUMMY, using multiple cores.

        Splits ``string`` on whitespace, distributes the words over worker
        processes (one per chunk returned by ``timbl.divide_iterable``) and
        reassembles the processed chunks in their original order.

        Args:
            string: The text to attenuate; it is split on whitespace.
            lex: Mapping indexed by word length. A word counts as known when
                it is contained in ``lex[len(word)]``.

        Returns:
            A string with the words of ``string`` in their original order,
            joined by single spaces, where every unknown word has been
            replaced by ``'#DUMMY'``.
        """
        # Prepare input and output
        result = multiprocessing.Queue()
        substrings = timbl.divide_iterable(string.split(), 10)

        # The actual work
        def dummify(word):
            try:
                bucket = lex[len(word)]
            except KeyError:
                # lex has nothing for this word length; note that this
                # branch also catches '_' if lex lacks an entry for length 1.
                return '#DUMMY'

            # Unknown words become the dummy, except for the '_' token.
            if word not in bucket and word != '_':
                return '#DUMMY'
            return word

        # Starts the workers
        def worker(nr, words, result):
            wordtotal = len(words)
            pieces = []

            for n, i in enumerate(words):
                pieces.append(dummify(i))

                # Report progress of the first worker
                if nr == 0 and n % 100000 == 0:
                    print('  ', n / wordtotal)

            # Build the chunk string in one go (linear instead of repeated
            # string concatenation) and tag it with the chunk number.
            result.put((nr, ' '.join(pieces)))

        # NOTE: passing a nested function as target depends on the 'fork'
        # start method, as in the original code.
        started = []
        for n, i in enumerate(substrings):
            t = multiprocessing.Process(target=worker, args=[n, i, result])
            t.start()
            started.append(t)

        # Wait until all results are in. Exactly one result is expected per
        # started worker, so block on the queue instead of polling for a
        # fixed count of 10.
        resultlist = []
        for _ in started:
            resultlist.append(result.get())

        # Join after draining the queue; the reverse order can deadlock on
        # large results.
        for t in started:
            t.join()

        # Sort and merge the results, leaving out empty chunks so no double
        # spaces appear where chunks meet.
        ordered = sorted(resultlist, key=lambda x: x[0])
        return ' '.join(chunk for _, chunk in ordered if chunk)