def make_ngrams(self, text):
    """Transform a string into a list of 4-grams, using multiple processes.

    The text is cut into chunks with ``timbl.divide_iterable``; every chunk
    is windowed in its own process and the per-chunk results are
    concatenated in chunk order.

    Args:
        text: The input string. With ``self.approach == 'w'`` it is split
            on whitespace and windowed with ``timbl.window_string``; with
            ``'l'`` the raw string is windowed with
            ``timbl.window_string_letters``.

    Returns:
        One flat list holding the windowing output of every chunk, in
        chunk order.

    Raises:
        ValueError: If ``self.approach`` is neither ``'w'`` nor ``'l'``.
    """
    # Pick the windowing function and the chunking once, up front.
    # NOTE(review): the 10 is presumably the number of chunks and the third
    # argument the overlap between chunks - confirm against timbl.
    if self.approach == 'w':
        window = timbl.window_string
        substrings = timbl.divide_iterable(text.split(), 10, 3)
    elif self.approach == 'l':
        window = timbl.window_string_letters
        substrings = timbl.divide_iterable(text, 10, 15)
    else:
        raise ValueError('unknown approach: %r' % (self.approach,))

    result = multiprocessing.Queue()

    # NOTE(review): a nested function cannot be pickled, so using it as a
    # Process target relies on the 'fork' start method.
    def worker(nr, string, result):
        # Only the first chunk passes the extra True flag (presumably
        # "this is the start of the text" - confirm against timbl).
        if nr == 0:
            ngrams = window(string, True)
        else:
            ngrams = window(string)
        result.put((nr, ngrams))

    # Start one worker per chunk and remember it, so that we wait for
    # exactly as many results as there are workers instead of assuming 10.
    processes = []
    for n, chunk in enumerate(substrings):
        process = multiprocessing.Process(target=worker,
                                          args=[n, chunk, result])
        process.start()
        processes.append(process)

    # A blocking get() returns as soon as a result is available; no polling
    # of Queue.empty() (documented as unreliable) and no sleeping needed.
    resultlist = [result.get() for _ in processes]

    # The queue is drained, so joining cannot deadlock; this reaps the
    # finished workers.
    for process in processes:
        process.join()

    # Workers finish in arbitrary order: restore chunk order, then merge.
    resultlist.sort(key=lambda x: x[0])
    end_result = []
    for _, ngrams in resultlist:
        end_result += ngrams
    return end_result
def make_ngrams(self, text):
    """Transform a string into a list of 4-grams, using multiple processes.

    The text is cut into chunks with ``timbl.divide_iterable``; every chunk
    is windowed in its own process and the per-chunk results are
    concatenated in chunk order.

    Args:
        text: The input string. With ``self.approach == 'w'`` it is split
            on whitespace and windowed with ``timbl.window_string``; with
            ``'l'`` the raw string is windowed with
            ``timbl.window_string_letters``.

    Returns:
        One flat list holding the windowing output of every chunk, in
        chunk order.

    Raises:
        ValueError: If ``self.approach`` is neither ``'w'`` nor ``'l'``.
    """
    # Pick the windowing function and the chunking once, up front.
    # NOTE(review): the 10 is presumably the number of chunks and the third
    # argument the overlap between chunks - confirm against timbl.
    if self.approach == 'w':
        window = timbl.window_string
        substrings = timbl.divide_iterable(text.split(), 10, 3)
    elif self.approach == 'l':
        window = timbl.window_string_letters
        substrings = timbl.divide_iterable(text, 10, 15)
    else:
        raise ValueError('unknown approach: %r' % (self.approach,))

    result = multiprocessing.Queue()

    # NOTE(review): a nested function cannot be pickled, so using it as a
    # Process target relies on the 'fork' start method.
    def worker(nr, string, result):
        # Only the first chunk passes the extra True flag (presumably
        # "this is the start of the text" - confirm against timbl).
        if nr == 0:
            ngrams = window(string, True)
        else:
            ngrams = window(string)
        result.put((nr, ngrams))

    # Start one worker per chunk and remember it, so that we wait for
    # exactly as many results as there are workers instead of assuming 10.
    processes = []
    for n, chunk in enumerate(substrings):
        process = multiprocessing.Process(target=worker,
                                          args=[n, chunk, result])
        process.start()
        processes.append(process)

    # A blocking get() returns as soon as a result is available; no polling
    # of Queue.empty() (documented as unreliable) and no sleeping needed.
    resultlist = [result.get() for _ in processes]

    # The queue is drained, so joining cannot deadlock; this reaps the
    # finished workers.
    for process in processes:
        process.join()

    # Workers finish in arbitrary order: restore chunk order, then merge.
    resultlist.sort(key=lambda x: x[0])
    end_result = []
    for _, ngrams in resultlist:
        end_result += ngrams
    return end_result
def attenuate_string_multicore(self, string, lex):
    """Replace infrequent words in string with #DUMMY, using multiple processes.

    The string is split on whitespace, the words are cut into chunks with
    ``timbl.divide_iterable`` and every chunk is processed in its own
    process. A word is kept when it is found in ``lex[len(word)]`` or when
    it is ``'_'``; otherwise it becomes ``'#DUMMY'``. A word whose length
    is not a key of ``lex`` always becomes ``'#DUMMY'``.

    Args:
        string: The whitespace-separated text to attenuate.
        lex: Mapping from word length to a container of known words of
            that length.

    Returns:
        The attenuated words joined by single spaces, in original order.
    """
    words = string.split()
    result = multiprocessing.Queue()

    # The actual work: decide for a single word whether it survives.
    def dummify(word):
        try:
            known = word in lex[len(word)]
        except KeyError:
            # No lexicon entry at all for words of this length.
            return '#DUMMY'
        if known or word == '_':
            return word
        return '#DUMMY'

    # NOTE(review): a nested function cannot be pickled, so using it as a
    # Process target relies on the 'fork' start method.
    def worker(nr, words, result):
        wordtotal = len(words)
        attenuated = []
        for n, word in enumerate(words):
            attenuated.append(dummify(word))
            # Report progress of the first worker
            if nr == 0 and n % 100000 == 0:
                print(' ', n / wordtotal)
        # Join once at the end instead of growing a string word by word.
        result.put((nr, ' '.join(attenuated)))

    # Start one worker per chunk and remember it, so that we wait for
    # exactly as many results as there are workers instead of assuming 10.
    # NOTE(review): the 10 is presumably the number of chunks - confirm
    # against timbl.divide_iterable.
    processes = []
    for n, chunk in enumerate(timbl.divide_iterable(words, 10)):
        process = multiprocessing.Process(target=worker,
                                          args=[n, chunk, result])
        process.start()
        processes.append(process)

    # A blocking get() returns as soon as a result is available; no polling
    # of Queue.empty() (documented as unreliable) and no sleeping needed.
    resultlist = [result.get() for _ in processes]

    # The queue is drained, so joining cannot deadlock; this reaps the
    # finished workers.
    for process in processes:
        process.join()

    # Workers finish in arbitrary order: restore chunk order, then merge.
    # Empty chunks are skipped so that words are always separated by
    # exactly one space, also at chunk boundaries.
    resultlist.sort(key=lambda x: x[0])
    return ' '.join(part for _, part in resultlist if part)
def attenuate_string_multicore(self, string, lex):
    """Replace infrequent words in string with #DUMMY, using multiple processes.

    The string is split on whitespace, the words are cut into chunks with
    ``timbl.divide_iterable`` and every chunk is processed in its own
    process. A word is kept when it is found in ``lex[len(word)]`` or when
    it is ``'_'``; otherwise it becomes ``'#DUMMY'``. A word whose length
    is not a key of ``lex`` always becomes ``'#DUMMY'``.

    Args:
        string: The whitespace-separated text to attenuate.
        lex: Mapping from word length to a container of known words of
            that length.

    Returns:
        The attenuated words joined by single spaces, in original order.
    """
    words = string.split()
    result = multiprocessing.Queue()

    # The actual work: decide for a single word whether it survives.
    def dummify(word):
        try:
            known = word in lex[len(word)]
        except KeyError:
            # No lexicon entry at all for words of this length.
            return '#DUMMY'
        if known or word == '_':
            return word
        return '#DUMMY'

    # NOTE(review): a nested function cannot be pickled, so using it as a
    # Process target relies on the 'fork' start method.
    def worker(nr, words, result):
        wordtotal = len(words)
        attenuated = []
        for n, word in enumerate(words):
            attenuated.append(dummify(word))
            # Report progress of the first worker
            if nr == 0 and n % 100000 == 0:
                print(' ', n / wordtotal)
        # Join once at the end instead of growing a string word by word.
        result.put((nr, ' '.join(attenuated)))

    # Start one worker per chunk and remember it, so that we wait for
    # exactly as many results as there are workers instead of assuming 10.
    # NOTE(review): the 10 is presumably the number of chunks - confirm
    # against timbl.divide_iterable.
    processes = []
    for n, chunk in enumerate(timbl.divide_iterable(words, 10)):
        process = multiprocessing.Process(target=worker,
                                          args=[n, chunk, result])
        process.start()
        processes.append(process)

    # A blocking get() returns as soon as a result is available; no polling
    # of Queue.empty() (documented as unreliable) and no sleeping needed.
    resultlist = [result.get() for _ in processes]

    # The queue is drained, so joining cannot deadlock; this reaps the
    # finished workers.
    for process in processes:
        process.join()

    # Workers finish in arbitrary order: restore chunk order, then merge.
    # Empty chunks are skipped so that words are always separated by
    # exactly one space, also at chunk boundaries.
    resultlist.sort(key=lambda x: x[0])
    return ' '.join(part for _, part in resultlist if part)