def align_audio(payload: PayLoad):
    """Force-align an audio file with its transcript and return the sync map.

    Fetches the audio object described by *payload* (bucket/sub-dir/file name),
    prepares the transcript text, runs forced alignment, and cleans the working
    directory before responding.

    Raises:
        HTTPException: 404 when the object cannot be fetched from storage.
    """
    try:
        load_audio(payload.bucket_id, payload.sub_dir, payload.file_name)
    except ClientError as exc:
        # Chain the original ClientError so the cause stays visible in logs
        # while clients still receive a plain 404.
        raise HTTPException(status_code=404, detail="Item not found") from exc
    prepare_text(payload.text)
    sync_map = force_align()
    clean_dir()
    response = Response(alignment=sync_map, file_name=payload.file_name)
    return response
def main():
    """Score every token of each dataset row for lexical ambiguity.

    For each TSV split, scores each non-stopword token of the last column with
    the configured scoring function (stopwords score 0, '[SEP]' is skipped) and
    writes the original row plus the list of (word, score) pairs to a sibling
    file named with ``output_extension``.
    """
    # scoring_fn = wordnet_lesk_ambiguity
    # output_extension = "_wordnet_amb_clean"
    scoring_fn = csi_lesk_ambiguity
    output_extension = "_csi_amb"
    # A set gives O(1) membership tests in the per-token loop below
    # (stopwords.words returns a list).
    sw = set(stopwords.words("english"))
    filepaths = [
        'data/short_jokes/train.tsv',
        'data/short_jokes/dev.tsv',
        'data/short_jokes/test.tsv'
    ]  # change for each dataset
    for filepath in filepaths:
        out_path = filepath.replace(".tsv", f"{output_extension}.tsv")
        # newline='' is required by the csv module: without it the writer
        # emits blank lines on Windows and embedded newlines are mangled.
        with open(filepath, "r", newline="") as in_f, \
                open(out_path, "w", newline="") as out_f:
            reader = csv.reader(in_f)
            writer = csv.writer(out_f)
            for row in reader:
                scores = []
                # The blank marker "_____" is replaced so the text splits cleanly.
                text = row[-1].replace("_____", " ")
                text = utils.prepare_text(text).split()
                for word in text:
                    if word != "[SEP]":
                        word = word.lower()
                        if word in sw:
                            scores.append((word, 0))
                        else:
                            scores.append((word, scoring_fn(text, word)))
                out_row = row + [scores]
                writer.writerow(out_row)
def server_discover(answer, magic="fna349fn", listen_ip="0.0.0.0", port=50000, password=None, disable_hidden=False):
    """Run a blocking UDP discovery server until interrupted.

    1. answer - payload sent back to clients whose discovery request is valid
    2. magic - shared magic string that requests must carry
    3. listen_ip / port - UDP bind address for the datagram endpoint
    4. password - optional shared secret, normalized via prepare_text() before use
    5. disable_hidden - forwarded to DiscoverServerProtocol

    Blocks in loop.run_forever() until KeyboardInterrupt, then closes the
    transport and the event loop.

    NOTE(review): the protocol is configured via class attributes (the event
    loop instantiates DiscoverServerProtocol itself), so two servers in one
    process would clobber each other's settings — confirm single-instance use.
    """
    my_ip = gethostbyname(gethostname())
    log.info("Starting UDP server")

    # Prepare password
    if password:
        password = prepare_text(password)

    # Setup Protocol (shared state via class attributes, see NOTE above)
    DiscoverServerProtocol.magic = magic
    DiscoverServerProtocol.my_ip = my_ip
    DiscoverServerProtocol.password = password
    DiscoverServerProtocol.disable_hidden = disable_hidden
    DiscoverServerProtocol.answer = answer

    # Start running
    loop = asyncio.get_event_loop()
    listen = loop.create_datagram_endpoint(DiscoverServerProtocol, local_addr=(listen_ip, port), allow_broadcast=True)
    transport, protocol = loop.run_until_complete(listen)
    try:
        loop.run_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Always release the socket and the loop, even on unexpected errors.
        log.info("Shutdown server")
        transport.close()
        loop.close()
def predict(original_text, threshold=0.3):
    """Run the model on *original_text* and rebuild the tokenized text.

    1. original_text - raw input string (whitespace-split into words)
    2. threshold - decision boundary: probabilities > threshold become 1,
       the rest become 0 (default 0.3, the previously hard-coded value)
    3. returns the text reassembled by predictions_to_text()
    """
    split_text = original_text.split()
    prepared = prepare_text(split_text, char_to_int)
    predictions = model.predict(prepared)
    # Binarize in place; the '>' pass must run first so the '<=' pass only
    # sees the values that were not already set to 1.
    predictions[predictions > threshold] = 1
    predictions[predictions <= threshold] = 0
    tokenized = predictions_to_text(split_text, predictions)
    return tokenized
def discover(magic="fna349fn", port=50000, password=None, timeout=5):
    """Broadcast a UDP discovery request and wait for the first server reply.

    1. magic - shared magic string prepended to the outgoing message
    2. port - UDP broadcast port
    3. password - optional shared secret used by crypt()/decrypt()
    4. timeout - seconds to wait for a reply

    Returns (ok_details, "ip:port:") on success. Implicitly returns None when
    the reply does not start with the magic, or when an #ERROR# reply matches
    neither known error kind.

    Raises TimeOutException, TimeStampException or PasswordMagicException.
    """
    log.info("Looking for a server discovery")

    # Prepare password
    if password:
        password = prepare_text(password)

    # Build message; the timestamp lets the server reject stale requests
    msg = "%s%s" % (magic, datetime.datetime.now().timestamp())

    # BUG FIX: the socket was never closed (leaked on both the success and
    # the timeout path). The context manager releases it on every path.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:  # create UDP socket
        s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)  # this is a broadcast socket
        s.sendto(crypt(msg, password), ('<broadcast>', port))
        s.settimeout(timeout)
        try:
            data, addr = s.recvfrom(1024)  # wait for a packet
        except socket.timeout:
            log.info("No servers found")
            raise TimeOutException("No servers found")

    msg = decrypt(data, password)

    # Get a correlates response
    if msg.startswith(magic):
        msg_details = msg[len(magic):]
        log.debug("Got service announcement from '%s' with response: %s" % ("%s:%s" % addr, msg_details))
        if msg_details.startswith("#ERROR#"):
            error_details = msg_details[len("#ERROR#"):]
            log.debug("Response from server: %s" % error_details)
            if "timestamp" in error_details:
                raise TimeStampException(error_details)
            elif "password" in error_details:
                raise PasswordMagicException(error_details)
        else:
            # NOTE(review): assumes every non-error reply starts with "#OK#".
            ok_details = msg_details[len("#OK#"):]
            return ok_details, "%s:%s:" % addr
def main(input_data="all"):
    """Read the infreemation API JSON dumps, pre-process them, and pickle the result.

    1. input_data - "all" or "latest" (currently unused by the body; kept for
       interface compatibility — TODO confirm intended use)

    Loads infreemation-dump-1..5.json, normalizes the published requests into
    one DataFrame, derives the FOI id from the url field, prepares the subject
    and request-body text, and writes the pickled DataFrame next to the input.
    """
    # Read json files from infreemation reporting API, new ones added
    # periodically
    base_path = files_config.raw_data_path
    filename = 'infreemation-dump-'
    # BUG FIX: DataFrame.append was removed in pandas 2.x and was quadratic
    # inside the loop; collect the frames and concat once instead.
    frames = []
    for i in range(1, 6):
        filepath = base_path + filename + str(i) + '.json'
        with open(filepath) as f:
            data = json.loads(f.read())
        frames.append(json_normalize(data['published']['request']))
    df = pd.concat(frames, ignore_index=True)

    # Need to get the FOI ID from the url field
    for i in df.index:
        df.at[i, 'id'] = utils.extract_id(df.iloc[i]['url'])

    # Prepare subject field, which is plain text
    for i in df.index:
        try:
            df.at[i, 'subject_prepared'] = utils.prepare_text(
                df.iloc[i]['subject'])
        except Exception:
            # Best-effort: report the offending subject and continue, but do
            # not swallow KeyboardInterrupt/SystemExit as the bare except did.
            print(df.iloc[i]['subject'])

    # Prepare request body
    # Strip HTML
    for i in df.index:
        df.at[i, 'requestbody_stripped'] = utils.strip_element(
            df.iloc[i]['requestbody'])

    # Remove stopwords, non alpha, etc.
    df['requestbody_prepared'] = df.apply(
        lambda x: prepare_requestbody(x['requestbody_stripped']), axis=1)

    # Store pre-processed data
    filename = files_config.preprocessed_filename
    filepath = base_path + filename
    df.reset_index(drop=True).to_pickle(filepath)
def server_discover(loop, magic=DiscoveryConfig.MAGIC, listen_ip=DiscoveryConfig.IP, port=DiscoveryConfig.PORT,
                    password=DiscoveryConfig.PASSWORD, disable_hidden=False):
    """Run the UDP discovery server on *loop* until interrupted.

    1. loop - asyncio event loop to run the datagram endpoint on (closed on exit)
    2. magic / listen_ip / port / password - discovery protocol settings
    3. disable_hidden - forwarded to DiscoverServerProtocol

    The answer payload sent to clients is a JSON document with the gRPC
    server's ip and port. Blocks until KeyboardInterrupt or an error, then
    closes the transport (if it was created) and the loop.
    """
    server_ip = get_ip_address()
    logger.info('Starting Discover Server at port %s', port)

    if password:
        password = prepare_text(password)

    # Answer payload: where clients should find the gRPC server.
    config = dict()
    config[KEYS.GRPC_SERVER_PORT] = GRPCConfig.PORT
    config[KEYS.GRPC_SERVER_IP] = get_ip_address()
    _answer = json.dumps(config)

    # Setup Protocol — configured via class attributes because the event loop
    # instantiates DiscoverServerProtocol itself.
    DiscoverServerProtocol.magic = magic
    DiscoverServerProtocol.server_ip = server_ip
    DiscoverServerProtocol.password = password
    DiscoverServerProtocol.disable_hidden = disable_hidden
    DiscoverServerProtocol.answer = _answer

    transport = None
    try:
        # Start running the server
        listen = loop.create_datagram_endpoint(DiscoverServerProtocol,
                                               local_addr=(listen_ip, port),
                                               allow_broadcast=True)
        transport, protocol = loop.run_until_complete(listen)
        loop.run_forever()
    except KeyboardInterrupt:
        pass  # normal shutdown request
    except Exception:
        # BUG FIX: the bare ``except: pass`` hid every failure (port already
        # in use, bad listen_ip, ...). Log it so failures are diagnosable.
        logger.exception('Discovery Server terminated with an error')
    finally:
        logger.info('Shutting down Discovery Server')
        if transport is not None:
            transport.close()
        loop.close()
# Interactive rap-TTS script: pick lyrics (samples or crawled), synthesize each
# line with pitch/tempo effects, then mix in a beat and duet lines.
choice = input(
    'Would you like to suggest a topic or should i pick from samples? ')
# BUG FIX: the original test was ``choice in 'sample'`` — a substring check —
# so an empty answer (or fragments like 'amp') silently selected sample mode.
if choice.strip().lower() in ('s', 'sample', 'samples'):
    choice = input('Choose from samples: \n 1-) Eminem \n 2-) 50 Cent')
    if choice == '1':
        lyrics = eminem
    elif choice == '2':
        lyrics = fifty_cent
else:
    limit = input("Set a line limit: ")
    lyrics = crawl_lyrics(choice)[:int(limit)]

ap = AudioProcessing()
for text in lyrics:
    text = prepare_text(text)
    tts = TTS(Voice(Voice.Language.enUS, Voice.Sex.male, "Justin"))
    ap.modify(tts.speak(text), 0.1, -1, 1, mid_part=0.05, mid_pitch=2,
              mid_stretch=1, accel=1.0)
aud, br = ap.insert_beat()
aud, br = ap.add_duet('Drop the beat DJ!', 0, 2)
aud, br = ap.add_duet('Aha. Yeah. Aha!', 3, 0.5)
ap.write(aud, br, name=choice)
def put_stress(self, text: str, stress_symbol: str = '+', accuracy_threshold: float = 0.75,
               replace_similar_symbols: bool = False, lemmatize_words: bool = False,
               use_batch_mode: bool = True) -> str:
    ''' Split the text into words and place stress on them. The source text formatting is preserved.

    If some words already have an stress, it will be saved. The stress is indicated using the "'" or '+' symbol
    after the stressed vowel. The threshold for the accuracy of stress placement allows you to cut off stresses,
    the prediction accuracy of which is lower (<=) than specified. The 0.75 threshold reduces the number of
    incorrectly placed stresses, but increases the number of words that will not be stressed. The 0.0 threshold
    allows you to place stresses in absolutely all words, but not always correctly.

    1. text - string with text
    2. stress_symbol - stress symbol, only "'" and '+' are supported
    3. accuracy_threshold - threshold for the accuracy of stress placement (from 0.0 to 1.0)
    4. replace_similar_symbols - True: replacing similar latin symbols with cyrillic ones
    5. lemmatize_words - True: lemmatize (normalize) each word before searching in exception dictionary
    6. use_batch_mode - True: place stress on words for 1 call to the neural network (speeds up work by 1.5-2 times)
    7. returns text with placed stresses '''

    if stress_symbol != DEF_STRESS_SYMBOL and stress_symbol != ADD_STRESS_SYMBOL:
        raise ValueError(
            "Unsupported stress symbol '{}'! Only \"{}\" and '{}' are supported."
            .format(stress_symbol, DEF_STRESS_SYMBOL, ADD_STRESS_SYMBOL))

    # Three parallel views of the input: normalized words (for prediction),
    # raw tokens (to rebuild the text with its original formatting), and
    # words with attached endings (what the predictor actually consumes).
    words = prepare_text(text, replace_similar_symbols=replace_similar_symbols)
    tokens = tokenize(text, replace_similar_symbols=replace_similar_symbols)
    words_with_endings = add_endings(words)

    # Stress placement
    stressed_words = []
    batch_for_predict = []
    for word in words_with_endings:
        # When using the module after russian_g2p.Accentor, it is possible situation that one of the words
        # passed to the input, contains stress symbol after each letter (for example, 'почем' -> '+п+о+ч+е+м+')
        if word.count(DEF_STRESS_SYMBOL) > 2:
            word = word.replace(DEF_STRESS_SYMBOL, '')
        elif word.count(ADD_STRESS_SYMBOL) > 1:
            word = word.replace(ADD_STRESS_SYMBOL, '')

        # Word already carries a valid stress (symbol right after a vowel): skip it entirely —
        # it is not added to stressed_words, so the transfer loop below leaves its token untouched.
        if word.find(DEF_STRESS_SYMBOL) != -1 and word[
                word.find(DEF_STRESS_SYMBOL) - 1] in VOWELS:
            continue
        elif word.find(ADD_STRESS_SYMBOL) != -1 and word[
                word.find(ADD_STRESS_SYMBOL) - 1] in VOWELS:
            continue
        elif count_number_of_vowels(word) == 1:
            # Exactly one vowel: the stress position is unambiguous, no prediction needed.
            stressed_word = word[:find_vowel_indices(word)[-1] + 1] + stress_symbol + word[
                find_vowel_indices(word)[-1] + 1:]
            stressed_words.append(stressed_word)
        elif SEARCH_TWO_VOWELS_RE.search(
                word) and self.exception_dict_wrapper.is_in_dict(
                    del_endings(word), lemmatize_words):
            # The exception dictionary takes priority over the neural network.
            stressed_word = self.exception_dict_wrapper.put_stress(
                del_endings(word), stress_symbol, lemmatize_words)
            stressed_words.append(stressed_word)
        elif use_batch_mode and SEARCH_TWO_VOWELS_RE.search(word):
            # Defer to one batched prediction; the unstressed word is kept as a
            # placeholder in stressed_words and replaced after the batch call.
            batch_for_predict.append(word)
            stressed_words.append(word)
        elif not use_batch_mode and SEARCH_TWO_VOWELS_RE.search(word):
            stressed_word, accuracity = self.__predict(
                word, stress_symbol)[0]
            # Low-confidence predictions are dropped: the word stays unstressed.
            if accuracity >= accuracy_threshold:
                stressed_words.append(stressed_word)

    # Predict all words in 1 network call
    if use_batch_mode:
        batch_with_stressed_words = self.__predict(batch_for_predict, stress_symbol)
        if len(batch_for_predict) > 0:
            # Replace each placeholder with its prediction when confident enough,
            # or drop it (mirroring the non-batch branch) when below threshold.
            updated_stressed_words = []
            idx_in_batch = 0
            for stressed_word in stressed_words:
                if idx_in_batch >= len(batch_for_predict):
                    # NOTE(review): breaking here discards any stressed_words that
                    # come after the last batch placeholder — confirm intended.
                    break
                if stressed_word == batch_for_predict[
                        idx_in_batch] and batch_with_stressed_words[
                            idx_in_batch][1] >= accuracy_threshold:
                    updated_stressed_words.append(
                        batch_with_stressed_words[idx_in_batch][0])
                    idx_in_batch += 1
                elif stressed_word == batch_for_predict[idx_in_batch]:
                    idx_in_batch += 1
                else:
                    updated_stressed_words.append(stressed_word)
            stressed_words = updated_stressed_words

    # Transferring stresses to the source text
    stressed_text = []
    for token in tokens:
        if count_number_of_vowels(token) == 0:
            # Punctuation/whitespace/consonant-only tokens pass through unchanged.
            stressed_text.append(token)
        else:
            try:
                unstressed_word = stressed_words[0].replace(
                    stress_symbol, '')
            except IndexError:
                # No pending stressed words left; force the mismatch branch below.
                unstressed_word = ''
            if unstressed_word == token.lower():
                stress_position = stressed_words[0].find(stress_symbol)
                # NOTE(review): the "- 1" shifts the symbol one character earlier
                # than the single-vowel branch above (which places it directly
                # after the vowel) — verify this offset is intentional.
                stressed_token = token[:stress_position - 1] + stress_symbol + token[
                    stress_position - 1:]
                stressed_text.append(stressed_token)
                stressed_words = stressed_words[1:]
            else:
                stressed_text.append(token)
    stressed_text = ''.join(stressed_text)
    return stressed_text
def prepare_requestbody(s):
    """Split *s* into sentences, normalize each one, and drop empty results.

    Returns a list of prepared, non-empty sentence strings.
    """
    sentences = sent_tokenize(s)
    prepared = (utils.prepare_text(sentence) for sentence in sentences)
    return [sentence for sentence in prepared if sentence != '']