def align_audio(payload: PayLoad):
    """Force-align an audio file with its transcript and return the sync map.

    Fetches the audio object described by *payload* (bucket/sub-dir/file name),
    prepares the transcript text, runs forced alignment, and cleans the working
    directory before responding.

    Raises:
        HTTPException: 404 when the object cannot be fetched from storage.
    """
    try:
        load_audio(payload.bucket_id, payload.sub_dir, payload.file_name)
    except ClientError as exc:
        # Chain the original ClientError so the cause stays visible in logs
        # while clients still receive a plain 404.
        raise HTTPException(status_code=404, detail="Item not found") from exc
    prepare_text(payload.text)
    sync_map = force_align()
    clean_dir()
    response = Response(alignment=sync_map, file_name=payload.file_name)
    return response
def main():
    """Score every token of each dataset row for lexical ambiguity.

    For each TSV split, scores each non-stopword token of the last column with
    the configured scoring function (stopwords score 0, '[SEP]' is skipped) and
    writes the original row plus the list of (word, score) pairs to a sibling
    file named with ``output_extension``.
    """
    # scoring_fn = wordnet_lesk_ambiguity
    # output_extension = "_wordnet_amb_clean"
    scoring_fn = csi_lesk_ambiguity
    output_extension = "_csi_amb"
    # A set gives O(1) membership tests in the per-token loop below
    # (stopwords.words returns a list).
    sw = set(stopwords.words("english"))
    filepaths = [
        'data/short_jokes/train.tsv',
        'data/short_jokes/dev.tsv',
        'data/short_jokes/test.tsv'
    ]  # change for each dataset
    for filepath in filepaths:
        out_path = filepath.replace(".tsv", f"{output_extension}.tsv")
        # newline='' is required by the csv module: without it the writer
        # emits blank lines on Windows and embedded newlines are mangled.
        with open(filepath, "r", newline="") as in_f, \
                open(out_path, "w", newline="") as out_f:
            reader = csv.reader(in_f)
            writer = csv.writer(out_f)
            for row in reader:
                scores = []
                # The blank marker "_____" is replaced so the text splits cleanly.
                text = row[-1].replace("_____", " ")
                text = utils.prepare_text(text).split()
                for word in text:
                    if word != "[SEP]":
                        word = word.lower()
                        if word in sw:
                            scores.append((word, 0))
                        else:
                            scores.append((word, scoring_fn(text, word)))
                out_row = row + [scores]
                writer.writerow(out_row)
def server_discover(answer, magic="fna349fn", listen_ip="0.0.0.0", port=50000, password=None, disable_hidden=False):
    """Run a blocking UDP discovery server until interrupted.

    1. answer - payload sent back to clients whose discovery request is valid
    2. magic - shared magic string that requests must carry
    3. listen_ip / port - UDP bind address for the datagram endpoint
    4. password - optional shared secret, normalized via prepare_text() before use
    5. disable_hidden - forwarded to DiscoverServerProtocol

    Blocks in loop.run_forever() until KeyboardInterrupt, then closes the
    transport and the event loop.

    NOTE(review): the protocol is configured via class attributes (the event
    loop instantiates DiscoverServerProtocol itself), so two servers in one
    process would clobber each other's settings — confirm single-instance use.
    """
    my_ip = gethostbyname(gethostname())
    log.info("Starting UDP server")

    # Prepare password
    if password:
        password = prepare_text(password)

    # Setup Protocol (shared state via class attributes, see NOTE above)
    DiscoverServerProtocol.magic = magic
    DiscoverServerProtocol.my_ip = my_ip
    DiscoverServerProtocol.password = password
    DiscoverServerProtocol.disable_hidden = disable_hidden
    DiscoverServerProtocol.answer = answer

    # Start running
    loop = asyncio.get_event_loop()
    listen = loop.create_datagram_endpoint(DiscoverServerProtocol, local_addr=(listen_ip, port), allow_broadcast=True)
    transport, protocol = loop.run_until_complete(listen)
    try:
        loop.run_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Always release the socket and the loop, even on unexpected errors.
        log.info("Shutdown server")
        transport.close()
        loop.close()
def predict(original_text, threshold=0.3):
    """Run the model on *original_text* and rebuild the tokenized text.

    1. original_text - raw input string (whitespace-split into words)
    2. threshold - decision boundary: probabilities > threshold become 1,
       the rest become 0 (default 0.3, the previously hard-coded value)
    3. returns the text reassembled by predictions_to_text()
    """
    split_text = original_text.split()
    prepared = prepare_text(split_text, char_to_int)
    predictions = model.predict(prepared)
    # Binarize in place; the '>' pass must run first so the '<=' pass only
    # sees the values that were not already set to 1.
    predictions[predictions > threshold] = 1
    predictions[predictions <= threshold] = 0
    tokenized = predictions_to_text(split_text, predictions)
    return tokenized
def discover(magic="fna349fn", port=50000, password=None, timeout=5):
    """Broadcast a UDP discovery request and wait for the first server reply.

    1. magic - shared magic string prepended to the outgoing message
    2. port - UDP broadcast port
    3. password - optional shared secret used by crypt()/decrypt()
    4. timeout - seconds to wait for a reply

    Returns (ok_details, "ip:port:") on success. Implicitly returns None when
    the reply does not start with the magic, or when an #ERROR# reply matches
    neither known error kind.

    Raises TimeOutException, TimeStampException or PasswordMagicException.
    """
    log.info("Looking for a server discovery")

    # Prepare password
    if password:
        password = prepare_text(password)

    # Build message; the timestamp lets the server reject stale requests
    msg = "%s%s" % (magic, datetime.datetime.now().timestamp())

    # BUG FIX: the socket was never closed (leaked on both the success and
    # the timeout path). The context manager releases it on every path.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:  # create UDP socket
        s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)  # this is a broadcast socket
        s.sendto(crypt(msg, password), ('<broadcast>', port))
        s.settimeout(timeout)
        try:
            data, addr = s.recvfrom(1024)  # wait for a packet
        except socket.timeout:
            log.info("No servers found")
            raise TimeOutException("No servers found")

    msg = decrypt(data, password)

    # Get a correlates response
    if msg.startswith(magic):
        msg_details = msg[len(magic):]
        log.debug("Got service announcement from '%s' with response: %s" % ("%s:%s" % addr, msg_details))
        if msg_details.startswith("#ERROR#"):
            error_details = msg_details[len("#ERROR#"):]
            log.debug("Response from server: %s" % error_details)
            if "timestamp" in error_details:
                raise TimeStampException(error_details)
            elif "password" in error_details:
                raise PasswordMagicException(error_details)
        else:
            # NOTE(review): assumes every non-error reply starts with "#OK#".
            ok_details = msg_details[len("#OK#"):]
            return ok_details, "%s:%s:" % addr
def main(input_data="all"):
    """Read the infreemation API JSON dumps, pre-process them, and pickle the result.

    1. input_data - "all" or "latest" (currently unused by the body; kept for
       interface compatibility — TODO confirm intended use)

    Loads infreemation-dump-1..5.json, normalizes the published requests into
    one DataFrame, derives the FOI id from the url field, prepares the subject
    and request-body text, and writes the pickled DataFrame next to the input.
    """
    # Read json files from infreemation reporting API, new ones added
    # periodically
    base_path = files_config.raw_data_path
    filename = 'infreemation-dump-'
    # BUG FIX: DataFrame.append was removed in pandas 2.x and was quadratic
    # inside the loop; collect the frames and concat once instead.
    frames = []
    for i in range(1, 6):
        filepath = base_path + filename + str(i) + '.json'
        with open(filepath) as f:
            data = json.loads(f.read())
        frames.append(json_normalize(data['published']['request']))
    df = pd.concat(frames, ignore_index=True)

    # Need to get the FOI ID from the url field
    for i in df.index:
        df.at[i, 'id'] = utils.extract_id(df.iloc[i]['url'])

    # Prepare subject field, which is plain text
    for i in df.index:
        try:
            df.at[i, 'subject_prepared'] = utils.prepare_text(
                df.iloc[i]['subject'])
        except Exception:
            # Best-effort: report the offending subject and continue, but do
            # not swallow KeyboardInterrupt/SystemExit as the bare except did.
            print(df.iloc[i]['subject'])

    # Prepare request body
    # Strip HTML
    for i in df.index:
        df.at[i, 'requestbody_stripped'] = utils.strip_element(
            df.iloc[i]['requestbody'])

    # Remove stopwords, non alpha, etc.
    df['requestbody_prepared'] = df.apply(
        lambda x: prepare_requestbody(x['requestbody_stripped']), axis=1)

    # Store pre-processed data
    filename = files_config.preprocessed_filename
    filepath = base_path + filename
    df.reset_index(drop=True).to_pickle(filepath)
def server_discover(loop, magic=DiscoveryConfig.MAGIC, listen_ip=DiscoveryConfig.IP, port=DiscoveryConfig.PORT,
                    password=DiscoveryConfig.PASSWORD, disable_hidden=False):
    """Run the UDP discovery server on *loop* until interrupted.

    1. loop - asyncio event loop to run the datagram endpoint on (closed on exit)
    2. magic / listen_ip / port / password - discovery protocol settings
    3. disable_hidden - forwarded to DiscoverServerProtocol

    The answer payload sent to clients is a JSON document with the gRPC
    server's ip and port. Blocks until KeyboardInterrupt or an error, then
    closes the transport (if it was created) and the loop.
    """
    server_ip = get_ip_address()
    logger.info('Starting Discover Server at port %s', port)

    if password:
        password = prepare_text(password)

    # Answer payload: where clients should find the gRPC server.
    config = dict()
    config[KEYS.GRPC_SERVER_PORT] = GRPCConfig.PORT
    config[KEYS.GRPC_SERVER_IP] = get_ip_address()
    _answer = json.dumps(config)

    # Setup Protocol — configured via class attributes because the event loop
    # instantiates DiscoverServerProtocol itself.
    DiscoverServerProtocol.magic = magic
    DiscoverServerProtocol.server_ip = server_ip
    DiscoverServerProtocol.password = password
    DiscoverServerProtocol.disable_hidden = disable_hidden
    DiscoverServerProtocol.answer = _answer

    transport = None
    try:
        # Start running the server
        listen = loop.create_datagram_endpoint(DiscoverServerProtocol,
                                               local_addr=(listen_ip, port),
                                               allow_broadcast=True)
        transport, protocol = loop.run_until_complete(listen)
        loop.run_forever()
    except KeyboardInterrupt:
        pass  # normal shutdown request
    except Exception:
        # BUG FIX: the bare ``except: pass`` hid every failure (port already
        # in use, bad listen_ip, ...). Log it so failures are diagnosable.
        logger.exception('Discovery Server terminated with an error')
    finally:
        logger.info('Shutting down Discovery Server')
        if transport is not None:
            transport.close()
        loop.close()
# Interactive rap-TTS script: pick lyrics (samples or crawled), synthesize each
# line with pitch/tempo effects, then mix in a beat and duet lines.
choice = input(
    'Would you like to suggest a topic or should i pick from samples? ')
# BUG FIX: the original test was ``choice in 'sample'`` — a substring check —
# so an empty answer (or fragments like 'amp') silently selected sample mode.
if choice.strip().lower() in ('s', 'sample', 'samples'):
    choice = input('Choose from samples: \n 1-) Eminem \n 2-) 50 Cent')
    if choice == '1':
        lyrics = eminem
    elif choice == '2':
        lyrics = fifty_cent
else:
    limit = input("Set a line limit: ")
    lyrics = crawl_lyrics(choice)[:int(limit)]

ap = AudioProcessing()
for text in lyrics:
    text = prepare_text(text)
    tts = TTS(Voice(Voice.Language.enUS, Voice.Sex.male, "Justin"))
    ap.modify(tts.speak(text), 0.1, -1, 1, mid_part=0.05, mid_pitch=2,
              mid_stretch=1, accel=1.0)
aud, br = ap.insert_beat()
aud, br = ap.add_duet('Drop the beat DJ!', 0, 2)
aud, br = ap.add_duet('Aha. Yeah. Aha!', 3, 0.5)
ap.write(aud, br, name=choice)
def put_stress(self, text: str, stress_symbol: str = '+', accuracy_threshold: float = 0.75,
               replace_similar_symbols: bool = False, lemmatize_words: bool = False,
               use_batch_mode: bool = True) -> str:
    ''' Split the text into words and place stress on them. The source text formatting is preserved.

    If some words already have an stress, it will be saved. The stress is indicated using the "'" or '+' symbol
    after the stressed vowel. The threshold for the accuracy of stress placement allows you to cut off stresses,
    the prediction accuracy of which is lower (<=) than specified. The 0.75 threshold reduces the number of
    incorrectly placed stresses, but increases the number of words that will not be stressed. The 0.0 threshold
    allows you to place stresses in absolutely all words, but not always correctly.

    1. text - string with text
    2. stress_symbol - stress symbol, only "'" and '+' are supported
    3. accuracy_threshold - threshold for the accuracy of stress placement (from 0.0 to 1.0)
    4. replace_similar_symbols - True: replacing similar latin symbols with cyrillic ones
    5. lemmatize_words - True: lemmatize (normalize) each word before searching in exception dictionary
    6. use_batch_mode - True: place stress on words for 1 call to the neural network (speeds up work by 1.5-2 times)
    7. returns text with placed stresses '''

    if stress_symbol != DEF_STRESS_SYMBOL and stress_symbol != ADD_STRESS_SYMBOL:
        raise ValueError(
            "Unsupported stress symbol '{}'! Only \"{}\" and '{}' are supported."
            .format(stress_symbol, DEF_STRESS_SYMBOL, ADD_STRESS_SYMBOL))

    # Three parallel views of the input: normalized words (for prediction),
    # raw tokens (to rebuild the text with its original formatting), and
    # words with attached endings (what the predictor actually consumes).
    words = prepare_text(text, replace_similar_symbols=replace_similar_symbols)
    tokens = tokenize(text, replace_similar_symbols=replace_similar_symbols)
    words_with_endings = add_endings(words)

    # Stress placement
    stressed_words = []
    batch_for_predict = []
    for word in words_with_endings:
        # When using the module after russian_g2p.Accentor, it is possible situation that one of the words
        # passed to the input, contains stress symbol after each letter (for example, 'почем' -> '+п+о+ч+е+м+')
        if word.count(DEF_STRESS_SYMBOL) > 2:
            word = word.replace(DEF_STRESS_SYMBOL, '')
        elif word.count(ADD_STRESS_SYMBOL) > 1:
            word = word.replace(ADD_STRESS_SYMBOL, '')

        # Word already carries a valid stress (symbol right after a vowel): skip it entirely —
        # it is not added to stressed_words, so the transfer loop below leaves its token untouched.
        if word.find(DEF_STRESS_SYMBOL) != -1 and word[
                word.find(DEF_STRESS_SYMBOL) - 1] in VOWELS:
            continue
        elif word.find(ADD_STRESS_SYMBOL) != -1 and word[
                word.find(ADD_STRESS_SYMBOL) - 1] in VOWELS:
            continue
        elif count_number_of_vowels(word) == 1:
            # Exactly one vowel: the stress position is unambiguous, no prediction needed.
            stressed_word = word[:find_vowel_indices(word)[-1] + 1] + stress_symbol + word[
                find_vowel_indices(word)[-1] + 1:]
            stressed_words.append(stressed_word)
        elif SEARCH_TWO_VOWELS_RE.search(
                word) and self.exception_dict_wrapper.is_in_dict(
                    del_endings(word), lemmatize_words):
            # The exception dictionary takes priority over the neural network.
            stressed_word = self.exception_dict_wrapper.put_stress(
                del_endings(word), stress_symbol, lemmatize_words)
            stressed_words.append(stressed_word)
        elif use_batch_mode and SEARCH_TWO_VOWELS_RE.search(word):
            # Defer to one batched prediction; the unstressed word is kept as a
            # placeholder in stressed_words and replaced after the batch call.
            batch_for_predict.append(word)
            stressed_words.append(word)
        elif not use_batch_mode and SEARCH_TWO_VOWELS_RE.search(word):
            stressed_word, accuracity = self.__predict(
                word, stress_symbol)[0]
            # Low-confidence predictions are dropped: the word stays unstressed.
            if accuracity >= accuracy_threshold:
                stressed_words.append(stressed_word)

    # Predict all words in 1 network call
    if use_batch_mode:
        batch_with_stressed_words = self.__predict(batch_for_predict, stress_symbol)
        if len(batch_for_predict) > 0:
            # Replace each placeholder with its prediction when confident enough,
            # or drop it (mirroring the non-batch branch) when below threshold.
            updated_stressed_words = []
            idx_in_batch = 0
            for stressed_word in stressed_words:
                if idx_in_batch >= len(batch_for_predict):
                    # NOTE(review): breaking here discards any stressed_words that
                    # come after the last batch placeholder — confirm intended.
                    break
                if stressed_word == batch_for_predict[
                        idx_in_batch] and batch_with_stressed_words[
                            idx_in_batch][1] >= accuracy_threshold:
                    updated_stressed_words.append(
                        batch_with_stressed_words[idx_in_batch][0])
                    idx_in_batch += 1
                elif stressed_word == batch_for_predict[idx_in_batch]:
                    idx_in_batch += 1
                else:
                    updated_stressed_words.append(stressed_word)
            stressed_words = updated_stressed_words

    # Transferring stresses to the source text
    stressed_text = []
    for token in tokens:
        if count_number_of_vowels(token) == 0:
            # Punctuation/whitespace/consonant-only tokens pass through unchanged.
            stressed_text.append(token)
        else:
            try:
                unstressed_word = stressed_words[0].replace(
                    stress_symbol, '')
            except IndexError:
                # No pending stressed words left; force the mismatch branch below.
                unstressed_word = ''
            if unstressed_word == token.lower():
                stress_position = stressed_words[0].find(stress_symbol)
                # NOTE(review): the "- 1" shifts the symbol one character earlier
                # than the single-vowel branch above (which places it directly
                # after the vowel) — verify this offset is intentional.
                stressed_token = token[:stress_position - 1] + stress_symbol + token[
                    stress_position - 1:]
                stressed_text.append(stressed_token)
                stressed_words = stressed_words[1:]
            else:
                stressed_text.append(token)
    stressed_text = ''.join(stressed_text)
    return stressed_text
def prepare_requestbody(s):
    """Split *s* into sentences, normalize each one, and drop empty results.

    Returns a list of prepared, non-empty sentence strings.
    """
    sentences = sent_tokenize(s)
    prepared = (utils.prepare_text(sentence) for sentence in sentences)
    return [sentence for sentence in prepared if sentence != '']