def create_ner_tensor(tokenized_context,
                      entities,
                      ner_textdict,
                      return_in_tensor=True):
    ner_tensor = [
        NONE_NER_POS_TOKEN if return_in_tensor else NONE_NER_POS
        for _ in range(len(tokenized_context))
    ]

    if len(entities) == 0:
        return ner_tensor

    # Walk the context left to right, tracking a running character offset
    # (token length + one separating space). Once the offset passes the
    # current entity's begin_offset, fuzzy-match the entity's tokens against
    # the context tokens and tag the ones that are similar enough.
    pointer_loc = 0
    i = 0  # index into tokenized_context
    j = 0  # index into entities
    k = 0  # index into the current entity's tokens
    entities_name = tokenize(entities[j]['name'])
    while i < len(tokenized_context) and entities_name is not None:
        pointer_loc += len(tokenized_context[i]) + 1
        if entities[j]['begin_offset'] - pointer_loc <= 0:
            similarity = fuzz.partial_ratio(tokenized_context[i],
                                            entities_name[k])
            if similarity >= WORD_SIMILARITY_THRESHOLD:
                ner_tensor[i] = ner_textdict.word2index[entities[j]['type']] if return_in_tensor else \
                    entities[j]['type']
                k += 1
                if k == len(entities_name):
                    # All tokens of this entity matched; move on to the next
                    # entity (or stop once every entity has been processed).
                    j += 1
                    k = 0
                    entities_name = None if j == len(entities) else tokenize(
                        entities[j]['name'])
        # Advance exactly once per token so the character offset stays correct.
        i += 1

    return ner_tensor
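
# Minimal usage sketch for create_ner_tensor. The constants, the tokenizer and
# the entity dicts below are stand-ins for the project's module-level ones
# (the real tokenize and threshold may differ); fuzz is assumed to come from
# fuzzywuzzy, which provides the partial_ratio used above.
from fuzzywuzzy import fuzz

NONE_NER_POS = 'O'
NONE_NER_POS_TOKEN = 0
WORD_SIMILARITY_THRESHOLD = 90

def tokenize(text):
    return text.split()  # stand-in whitespace tokenizer

context = 'Soekarno lahir di Surabaya'
entities = [{'name': 'Soekarno', 'type': 'PERSON', 'begin_offset': 0},
            {'name': 'Surabaya', 'type': 'LOCATION', 'begin_offset': 18}]

tags = create_ner_tensor(tokenize(context), entities,
                         ner_textdict=None, return_in_tensor=False)
print(tags)  # with these stand-ins: ['PERSON', 'O', 'O', 'LOCATION']
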
def sent_tokenize(input_text):
    # Split normalized, tokenized text into sentences at sentence-final
    # punctuation. Note that tokens after the last end punctuation are not
    # returned (no trailing partial sentence is emitted).
    tokenized_sents = []
    tokenized_input = tokenize(normalize_string(input_text))
    sentence = []
    for token in tokenized_input:
        sentence.append(token)
        if is_end_punctuations(token):
            tokenized_sents.append(sentence.copy())
            sentence = []
    return tokenized_sents
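
# Usage sketch for sent_tokenize. normalize_string, tokenize and
# is_end_punctuations are module-level helpers in the original project; the
# stand-ins below only illustrate the contract the function expects
# (normalized text, whitespace tokens, end punctuation as separate tokens).
def normalize_string(text):
    # stand-in: lower-case and pad end punctuation into its own token
    return text.lower().replace('.', ' . ').replace('?', ' ? ')

def tokenize(text):
    return text.split()  # stand-in whitespace tokenizer

def is_end_punctuations(token):
    return token in {'.', '?', '!'}

print(sent_tokenize('Ibu pergi ke pasar. Kapan ayah pulang?'))
# -> [['ibu', 'pergi', 'ke', 'pasar', '.'], ['kapan', 'ayah', 'pulang', '?']]
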
def load_context_and_question(df_squad):
    # Collect every tokenized context and question from a SQuAD-style
    # DataFrame (one row per topic, each holding a list of paragraphs).
    processed_questions = 0
    contexts = []
    questions = []
    start_time = time.time()
    for taken_topic_idx in range(df_squad.shape[0]):
        for paragraph in df_squad.iloc[taken_topic_idx]['paragraphs']:
            contexts.append(tokenize(normalize_string(paragraph['context'])))
            for qa in paragraph['qas']:
                questions.append(tokenize(normalize_string(qa['question'])))
                processed_questions += 1
                if processed_questions % 10000 == 0:
                    print(f'{processed_questions:04d}: {time.time() - start_time}s')
    return contexts, questions
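
# Usage sketch for load_context_and_question. df_squad is assumed to be the
# SQuAD-style 'data' list loaded into a DataFrame (one row per topic with a
# 'paragraphs' column); tokenize / normalize_string / time are module-level
# names in the original project and are stubbed or imported here so the
# sketch is self-contained.
import time
import pandas as pd

def normalize_string(text):
    return text.lower()  # stand-in normalizer

def tokenize(text):
    return text.split()  # stand-in whitespace tokenizer

df_squad = pd.DataFrame([{
    'title': 'Contoh',
    'paragraphs': [{
        'context': 'Monas terletak di Jakarta .',
        'qas': [{'id': 'q1', 'question': 'Di mana Monas terletak ?'}],
    }],
}])

contexts, questions = load_context_and_question(df_squad)
# contexts  -> [['monas', 'terletak', 'di', 'jakarta', '.']]
# questions -> [['di', 'mana', 'monas', 'terletak', '?']]
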
Example #4
def preprocess_data():
    # Relies on module-level globals: a `data` DataFrame with 'question' and
    # 'intent' columns, and an `intent_mapping` dict (intent name -> index).
    data['intent'] = data['intent'].map(intent_mapping)
    count = 0
    for question in data['question']:
        # Replace each raw question in the frame with its tokenized form.
        data.replace(question, tokenize(question), regex=True, inplace=True)
        if count % 50 == 0:
            print("CURRENT COLLECT : ", count)
        count += 1

    encode = []
    decode = []
    for question, intent in data.values:
        encode.append(question)
        decode.append(intent)

    return {'encode': encode, 'decode': decode}
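
# Usage sketch for preprocess_data. `data` and `intent_mapping` are module
# globals in the original project; the toy frame and mapping below are
# hypothetical and only show the expected shape ('question' column first,
# 'intent' column second). The stand-in tokenize returns a string because
# DataFrame.replace substitutes one scalar value for another.
import pandas as pd

def tokenize(text):
    return ' '.join(text.split())  # stand-in; the real tokenizer may differ

intent_mapping = {'weather': 0, 'dust': 1}
data = pd.DataFrame({
    'question': ['how is the weather today', 'is the fine dust bad'],
    'intent': ['weather', 'dust'],
})

dataset = preprocess_data()
# dataset['encode'] -> (tokenized) question strings
# dataset['decode'] -> mapped intent ids, e.g. [0, 1]
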
def prepare_featured_input(input_text,
                           output_file_name='free_input.txt',
                           manual_ne_postag=False,
                           lower=False,
                           seed=42):
    is_answer_sents = []
    is_cased_sents = []
    if manual_ne_postag:
        entities = json.loads(
            input('Enter the named entities (list of dicts):').replace(
                '\'', '"'))
        postags = json.loads(
            input('Enter the postags (list of lists of lists):').replace(
                '\'', '"'))
    else:
        try:
            entities = get_ner(input_text)['entities']
            postags = get_pos_tag(input_text)['postags']
        except TimeoutError as e:
            print(
                'Unable to invoke the NE and/or Pos Tag API. Please check your VPN or your internet connection:',
                e)
            exit(1)
    tokenized_input = tokenize(normalize_string(input_text))
    entities = create_ner_tensor(tokenized_input,
                                 entities,
                                 ner_textdict=None,
                                 return_in_tensor=False)
    postags = create_postags_tensor(tokenized_input,
                                    postags,
                                    postags_textdict=None,
                                    return_in_tensor=False)
    tokenized_sents, entity_sents, postag_sents = sentenize(
        tokenized_input, entities, postags)
    for i in range(len(tokenized_sents)):
        is_answer_sents.append(
            get_random_answer_loc(tokenized_sents[i],
                                  entity_sents[i],
                                  seed=seed))
        # Mark tokens that contain at least one uppercase character.
        is_cased = [
            '1' if any(c.isupper() for c in token) else '0'
            for token in tokenized_sents[i]
        ]
        is_cased_sents.append(is_cased)

    # Append an empty list before converting so NumPy keeps each variable as a
    # 1-D object array of (possibly ragged) token lists instead of stacking
    # them into a 2-D array; the padding element is dropped again with [:-1].
    tokenized_sents = np.array(tokenized_sents + [[]], dtype=object)[:-1]
    is_answer_sents = np.array(is_answer_sents + [[]], dtype=object)[:-1]
    is_cased_sents = np.array(is_cased_sents + [[]], dtype=object)[:-1]
    entity_sents = np.array(entity_sents + [[]], dtype=object)[:-1]
    postag_sents = np.array(postag_sents + [[]], dtype=object)[:-1]

    is_answer_sents = np.expand_dims(is_answer_sents, axis=-1)
    is_cased_sents = np.expand_dims(is_cased_sents, axis=-1)
    entity_sents = np.expand_dims(entity_sents, axis=-1)
    postag_sents = np.expand_dims(postag_sents, axis=-1)

    if lower:
        features = np.concatenate(
            (is_answer_sents, is_cased_sents, entity_sents, postag_sents),
            axis=-1)
    else:
        features = np.concatenate(
            (is_answer_sents, entity_sents, postag_sents), axis=-1)
    with open(output_file_name, 'w', encoding='utf-8') as f_out:
        for i in range(len(tokenized_sents)):
            if lower:
                f_out.write((print_input_along_feature(
                    tokenized_sents[i], features[i]) + '\n').lower())
            else:
                f_out.write((print_input_along_feature(tokenized_sents[i],
                                                       features[i]) + '\n'))
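
# Self-contained illustration of the object-array trick used above: padding
# the list with an empty list before np.array keeps even equal-length
# sentences as a 1-D object array of lists, so features[i] later ends up as
# one row of per-sentence feature lists rather than a per-token matrix.
import numpy as np

sents = [['a', 'b'], ['c', 'd']]                    # equal lengths
stacked = np.array(sents)                           # becomes a (2, 2) array
ragged = np.array(sents + [[]], dtype=object)[:-1]  # (2,) array of lists
print(stacked.shape, ragged.shape)                  # (2, 2) (2,)
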
Example #6
def preprocess(speech) -> str:
    # Clean the raw speech string, tokenize it, then clean the result again.
    speech = fix(speech)
    speech = tokenize(speech)
    speech = fix(speech)
    return speech