Code example #1
File: helpers.py Project: jeremiecanoni/ML-Project2
def get_raw_data(path_g, full):
    """
    Generate raw data without punctuation

    :param path_g: Path on the Google Drive
    :param full: string, 'f' for full dataset and 'nf' for non-full dataset
    """

    if full == 'f':
        path_pos = path_g + 'data/twitter-datasets/train_pos_full.txt'
        path_neg = path_g + 'data/twitter-datasets/train_neg_full.txt'

    elif full == 'nf':
        path_pos = path_g + 'data/twitter-datasets/train_pos.txt'
        path_neg = path_g + 'data/twitter-datasets/train_neg.txt'

    else:
        raise ValueError("Invalid value for full; expected 'f' or 'nf'")

    path_test = path_g + 'data/twitter-datasets/test_data.txt'

    # Read all files
    data_neg = read_file(path_neg)

    data_pos = read_file(path_pos)

    data_test = read_file(path_test)

    df_neg = pd.DataFrame(data_neg)
    df_pos = pd.DataFrame(data_pos)
    df_test = pd.DataFrame(data_test, columns=['tweet'])

    df_neg = pd.DataFrame(pd.unique(df_neg[0]).T, columns=['tweet'])
    df_neg['sentiment'] = 0
    print(df_neg.shape)

    df_pos = pd.DataFrame(pd.unique(df_pos[0]).T, columns=['tweet'])
    df_pos['sentiment'] = 1
    print(df_pos.shape)

    df = pd.concat([df_neg, df_pos])
    text_data = df['tweet'].values
    text_data_test = df_test['tweet'].values

    for idx, tweet in enumerate(text_data):
        text_data[idx] = text_to_word_sequence(
            tweet,
            filters='#"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789',
            lower=True)

    for idx, tweet in enumerate(text_data_test):
        text_data_test[idx] = text_to_word_sequence(
            tweet,
            filters='#"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789',
            lower=True)

    labels = df['sentiment'].values

    return text_data, labels, text_data_test
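For reference, a minimal standalone call showing what the custom filter above does; the sample tweet is invented, not taken from the project's dataset:

from tensorflow.keras.preprocessing.text import text_to_word_sequence

tweet = "I love this movie!!! 10/10 :-)"
print(text_to_word_sequence(
    tweet,
    filters='#"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789',
    lower=True))
# ['i', 'love', 'this', 'movie!!!'] -- digits and most punctuation are removed;
# '!' survives because it is absent from the filter string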
Code example #2
def preprocess_news_data(filename):
    print('Preprocessing news...')
    all_texts = []
    category_map = {}
    titles = []
    abstracts = []
    categories = []

    with open(filename, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip(
                '\n').split('\t')
            title = title.lower()
            abstract = abstract.lower()
            all_texts.append(title + ". " + abstract)
            # map every category to a number
            if category not in category_map:
                category_map[category] = len(category_map)
            # subcategory is parsed above but not used in this example
            titles.append(title)
            abstracts.append(abstract)
            categories.append(category)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    word_index = tokenizer.word_index  # a dict: word_index[word]=index
    print('Found %s unique tokens.' % len(word_index))
    # print(word_index)

    # title
    news_title = np.zeros((len(titles), MAX_TITLE_LENGTH), dtype='int32')
    for i, title in enumerate(titles):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title[i, k] = word_index[word]
                k = k + 1

    # abstract
    news_abstract = np.zeros((len(abstracts), MAX_ABSTRACT_LENGTH),
                             dtype='int32')
    for i, abstract in enumerate(abstracts):
        wordTokens = text_to_word_sequence(abstract)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_ABSTRACT_LENGTH:
                news_abstract[i, k] = word_index[word]
                k = k + 1
    # category & subcategory
    news_category = []
    k = 0
    for category in categories:
        news_category.append(category_map[category])
        k += 1
    news_category = to_categorical(np.asarray(news_category))

    return word_index, category_map, news_category, news_abstract, news_title
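As a quick aside (not part of the project code), this is how the category_map indices above turn into one-hot rows via to_categorical:

import numpy as np
from tensorflow.keras.utils import to_categorical

print(to_categorical(np.asarray([0, 2, 1])))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]  -- one row per item, one column per category id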
Code example #3
def preprocess_news_data(filename, filename_2):
    # only use news title
    print('Preprocessing news...')
    titles = []
    news_index = {}
    with open(filename, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip(
                '\n').split('\t')
            if id not in news_index:
                news_index[id] = len(news_index)
                title = title.lower()
                titles.append(title)
    news_index_test = {}
    titles_test = []
    with open(filename_2, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip(
                '\n').split('\t')
            if id not in news_index:
                news_index[id] = len(news_index)
                title = title.lower()
                titles.append(title)
            if id not in news_index_test:
                news_index_test[id] = len(news_index_test)
                title = title.lower()
                titles_test.append(title)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(titles)
    word_index = tokenizer.word_index  # a dict: word_index[word]=index
    print('Found %s unique news.' % len(news_index))
    print('Found %s unique tokens.' % len(word_index))

    news_title = np.zeros((len(titles), MAX_TITLE_LENGTH), dtype='int32')
    news_title_test = np.zeros((len(titles_test), MAX_TITLE_LENGTH),
                               dtype='int32')
    for i, title in enumerate(titles):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title[i, k] = word_index[word]
                k = k + 1
    for i, title in enumerate(titles_test):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title_test[i, k] = word_index[word]
                k = k + 1

    return news_index, word_index, news_title, news_index_test, news_title_test
Code example #4
def transform_sentence_complete(sentence):
    def camel_case_split(identifier):
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
        return [m.group(0) for m in matches]

    if FLAGS.cs_use_clef_data and 'task1' in FLAGS.cs_raw_clef_train_loc:
        sentence = emoji.get_emoji_regexp().sub(r'', sentence)
        sentence = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', 'url', sentence)
        # sentence = re.sub('^@?(\w){1,15}$', 'user', sentence)
        sentence = ' '.join([' '.join(camel_case_split(x[1:])) if x[0] == '#' or x[0] == '@' else x for x in sentence.split()])
        sentence = ' '.join(['ebola' if any([y in x.lower() for y in ['covid', 'corona']]) else x for x in sentence.split()])
        print(sentence)

    sentence = correct_mistakes(sentence)

    if not FLAGS.cs_custom_preprc:
        return sentence.strip()

    sentence = (process_sentence_ner_spacy(sentence) if FLAGS.cs_ner_spacy else sentence)
    sentence = ' '.join(text_to_word_sequence(sentence))

    sentence = expand_contractions(sentence)
    sentence = remove_possessives(sentence)
    sentence = remove_kill_words(sentence)

    return sentence.strip()
Code example #5
def preprocessing(sentence):
    # split the sentence into words using text_to_word_sequence
    words = set(text_to_word_sequence(sentence))  # remove duplicate words
    vocab_size = len(words)
    # hash each word to an integer id (one_hot uses the hashing trick, so collisions are possible)
    results = one_hot(sentence, round(vocab_size * 1.3))
    return results
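A minimal standalone run of the same idea (the sentence is made up), showing that one_hot returns hashed integer ids rather than a learned vocabulary:

from tensorflow.keras.preprocessing.text import one_hot, text_to_word_sequence

sentence = "the quick brown fox jumps over the lazy dog"
vocab_size = len(set(text_to_word_sequence(sentence)))  # 8 distinct words
print(one_hot(sentence, round(vocab_size * 1.3)))
# nine ids, one per token; 'the' gets the same id both times, and the ids can
# change between runs because the default hash function is Python's hash()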
Code example #6
    def preprocess(w2v,text,language):
        embeddings = []
        if language == 'english':
            text = text.replace("'",'')
            words = text_to_word_sequence(text)
        elif language == 'hindi':
            text = text.replace(",", '')
            text = text.replace("|", ' ')
            words = text.split()
        else:
            raise Exception("Choose language as 'hindi' or 'english'")
        for word in words:
            if word in w2v:
                embeddings.append(w2v[word])

        cur_seq_len = len(embeddings)
        print(words)
        print(cur_seq_len,language)
        # print(text)
        if cur_seq_len < max_len:
            embeddings = np.pad(embeddings, [(0, max_len - cur_seq_len), (0, 0)])
        else:
            embeddings = embeddings[cur_seq_len - max_len:]

        return embeddings
Code example #7
def preprocess(labels, titles, abstracts, texts):
    news = []
    labels = to_categorical(np.asarray(labels))
    for i in texts:
        sentences = tokenize.sent_tokenize(i)
        news.append(sentences)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    data = np.zeros((len(texts), MAX_SENTS, MAX_SEQUENCE_LENGTH),
                    dtype='int32')
    for i, sentences in enumerate(news):
        for j, sent in enumerate(sentences):
            if j < MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < MAX_SEQUENCE_LENGTH and tokenizer.word_index[
                            word] < max_features:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    return word_index, data, labels
Code example #8
def load_data(file, max_features, max_sequence_length):
    data = pd.read_excel(file)

    data = data[['text', 'sentiment']]

    data['text'] = data['text'].apply(lambda x: x.lower())
    data['text'] = data['text'].apply(lambda x: clean_str(x))
    data['text'] = data['text'].apply(
        (lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)))

    stop_words = set(stopwords.words('english'))
    text = []
    for row in data['text'].values:
        word_list = text_to_word_sequence(row)
        no_stop_words = [w for w in word_list if not w in stop_words]
        no_stop_words = " ".join(no_stop_words)
        text.append(no_stop_words)

    tokenizer = Tokenizer(num_words=max_features, split=' ')

    tokenizer.fit_on_texts(text)
    X = tokenizer.texts_to_sequences(text)

    X = pad_sequences(X, maxlen=max_sequence_length)

    word_index = tokenizer.word_index
    Y = pd.get_dummies(data['sentiment']).values
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.20,
                                                        random_state=42)

    return X_train, X_test, Y_train, Y_test, word_index, tokenizer
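For reference, pad_sequences (as used above) pads and truncates at the front by default; a tiny standalone check with invented values:

from tensorflow.keras.preprocessing.sequence import pad_sequences

print(pad_sequences([[1, 2, 3], [4, 5]], maxlen=4))
# [[0 1 2 3]
#  [0 0 4 5]]  -- padding='pre' and truncating='pre' are the defaults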
Code example #9
File: tokensTest.py Project: chungfaith1/tech_review
    def get_collisions(self):
        for i in range(len(self.quotes)):
            words = text_to_word_sequence(self.quotes[i])
            for j in range(len(words)):
                word = words[j]
                num = self.encoded_quotes[i][j]

                if num in self.collisions:
                    if (not word in self.collisions[num]
                        ):  # new word, same hash = collision!
                        self.collisions_count += 1
                        l = self.collisions[num] + [word]
                        self.collisions[num] = l
                else:  # new hash id
                    self.collisions[num] = [word]

        collision_words = 0
        self.max_hashKey = 0
        for num in self.collisions:
            print(str(num) + ": " + str(self.collisions[num]))
            if len(self.collisions[num]) > 1:
                collision_words += 1
            if num > self.max_hashKey:
                self.max_hashKey = num

        print("vocab size: " + str(self.vocab_size))
        print("dictionary size: " + str(len(self.collisions)))
        print("number of hash ids with collisions: " + str(collision_words))
Code example #10
File: tokensTest.py Project: chungfaith1/tech_review
    def hashing_method(self):
        # get vocab size
        motiv = self.flatten(self.motiv_quotes)
        demotiv = self.flatten(self.demotiv_quotes)
        self.vocab = set(text_to_word_sequence(motiv + " " + demotiv))
        self.vocab_size = len(self.vocab)

        # perform hash encoding
        self.quotes = self.motiv_quotes + self.demotiv_quotes
        before = time.time()
        for quote in self.quotes:
            self.encoded_quotes.append(
                hashing_trick(quote,
                              round(self.vocab_size * 1.5),
                              hash_function='md5'))
        after = time.time()
        diff = (after - before) * 1000
        print("hashing trick time: " + str(diff) + " ms")

        # PADDED HASH DATA FOR TRAINING
        self.padded_encoded_quotes = pad_sequences(self.encoded_quotes,
                                                   maxlen=280)
        #print(self.encoded_quotes)
        #print("----------------------------------------")
        print(self.padded_encoded_quotes)
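A minimal standalone hashing_trick call with the same 1.5x bucket heuristic (the quote string is invented, not part of the project above):

from tensorflow.keras.preprocessing.text import hashing_trick, text_to_word_sequence

quote = "stay hungry stay foolish"
vocab_size = len(set(text_to_word_sequence(quote)))  # 3 distinct words
print(hashing_trick(quote, round(vocab_size * 1.5), hash_function='md5'))
# four ids, one per token; 'stay' lands in the same bucket both times,
# and 'md5' keeps the ids stable across runs (unlike the default hash())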
Code example #11
    def prepare_data(self, data):
        data = data[['text', 'sentiment']]
        data['text'] = data['text'].apply(lambda x: x.lower())
        data['text'] = data['text'].apply(lambda x: self.clean_str(x))
        data['text'] = data['text'].apply(
            (lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)))

        stop_words = set(stopwords.words('english'))
        text = []
        for row in data['text'].values:
            word_list = text_to_word_sequence(row)
            no_stop_words = [w for w in word_list if not w in stop_words]
            no_stop_words = " ".join(no_stop_words)
            text.append(no_stop_words)

        tokenizer = Tokenizer(num_words=self.MAX_FEATURES, split=' ')
        tokenizer.fit_on_texts(text)
        X = tokenizer.texts_to_sequences(text)
        X = pad_sequences(X, maxlen=self.MAXLEN)
        word_index = tokenizer.word_index
        Y = pd.get_dummies(data['sentiment']).values
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=self.TEST_DIM, random_state=42)

        return x_train, x_test, y_train, y_test, word_index, tokenizer
Code example #12
def keras_tokenize_wrapper(texts,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                           lower=False,
                           split=' ',
                           **kwargs):
    """
    Tokenize each text with text_to_word_sequence and return a list of
    single-column DataFrames, one row per token.

    Examples
    >>> texts = get_test_data()
    >>> dfs = keras_tokenize_wrapper(texts)
    >>> isinstance(dfs[0], pd.DataFrame)
    True
    """

    # get valid kwargs - only pass on valid arguments
    ttws_kwargs = {k: kwargs[k] for k in kwargs if
                   k in signature(text_to_word_sequence).parameters}

    dfs = [pd.DataFrame(text_to_word_sequence(text,
                                              filters=filters,
                                              lower=lower,
                                              split=split,
                                              **ttws_kwargs),
                        columns=['token']) for text in texts]
    return dfs
Code example #13
File: Text_Preprocessing.py Project: 245charan/Pixir
def text_preprocessing(text):
    # text to word_sequence
    texts = [text_to_word_sequence(word) for texts in text for word in texts]

    # Lemmatization
    n = WordNetLemmatizer()
    words = [n.lemmatize(word, 'v') for text in texts for word in text]

    # Replace the parameter text with texts_lemmatized, built from the lemmas
    texts_lemmatized = [[n.lemmatize(word, 'v') for word in text] for text in texts]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Build tokens from the preprocessed words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(words)
    tokens = tokenizer.index_word

    cap_vector = tokenizer.texts_to_sequences(texts_lemmatized)
    pad_sequences = sequence.pad_sequences(cap_vector, padding='post')
    result = np.array(pad_sequences)
    vocab_size = len(tokenizer.index_word) + 1
    return result, vocab_size
Code example #14
def database_title_token(data_list):
    """
    Read the DB rows loaded via sqlite one by one, unpack each tuple
    element in a for loop, split the title into tokens, and return
    the tokenized rows collected in a list (returned as a tuple).
    """

    user_history_update_data_list_titletoken = []

    for i in data_list:
        """
        output : 
        (13567, 'https://www.youtube.com/', 'YouTube', 36, 1420192312848192)
        """
        # each row comes back as a tuple
        Id, url, title, visit_count, last_visit_time = i
        # split the title text into words
        title = text_to_word_sequence(title)
        # skip titles that are empty
        if len(title) == 0:
            continue

        user_history_update_data_list_titletoken.append(
            (Id, url, title, visit_count, last_visit_time))
    return tuple(user_history_update_data_list_titletoken)
Code example #15
File: dataset.py Project: zftan0709/DL_HW
    def process_data(self):
        pad = self.tokenizer.texts_to_sequences(['<PAD>'])[0]
        id_list = []
        cap_list = []
        cap_length_list = []
        self.feat_data = {}
        with open(self.label_dir) as f:
            raw_data = json.load(f)
        for vid in raw_data:
            vid_id = vid['id']
            self.feat_data[vid_id] = np.load(self.feat_dir + vid_id + '.npy')

            for caption in vid['caption']:
                words = text_to_word_sequence(caption)
                for i in range(len(words)):
                    if words[i] not in self.tokenizer.word_index:
                        words[i] = '<UNK>'
                words.append('<EOS>')
                one_hot = self.tokenizer.texts_to_sequences([words])[0]
                cap_length = len(one_hot)
                one_hot += pad * (self.max_caption_len - cap_length)
                id_list.append(vid_id)
                cap_list.append(one_hot)
                cap_length_list.append(cap_length)

        self.id_list = np.array(id_list)
        self.cap_list = np.array(cap_list)
        self.cap_length_list = np.array(cap_length_list)
        self.data_size = len(self.cap_list)
        self.data_idx = np.arange(self.data_size, dtype=int)  # np.int was removed in NumPy >= 1.24
Code example #16
File: model.py Project: jxrx99/trumpet
def generate_text_sequences(lines, pastWords, vocab):
    X_line = list()
    Y_line = list()
    pastWords = pastWords
    for line in lines:
        # Tokenize line
        lineTokenized = text_to_word_sequence(line.item(),\
                                              filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")
        #Get line length
        lengthLine = len(lineTokenized)
        lineBatch = lengthLine - pastWords
        
        # Substitute words outside vocab with <Unknown>
        for idx in range(0,len(lineTokenized)):
            if lineTokenized[idx] in vocab:
                continue
            else:
                lineTokenized[idx] = '<Unknown>'
        
        #Create sequences of text
        for i in range(0,lineBatch):
            X_sequence = lineTokenized[i:i+pastWords]
            X_line.append(X_sequence)
            Y_sequence = lineTokenized[i+pastWords]
            Y_line.append(Y_sequence)
    
    return(X_line, Y_line)
Code example #17
def tokenizer(data_frame, data_frame_element):    
  # Clean the text so that it is properly normalized:
  # Note: turn these steps into helper functions
  #data_frame[data_frame_element] = data_frame[data_frame_element].apply(lambda x: x.lower())
  #data_frame[data_frame_element] = data_frame[data_frame_element].apply(lambda x: clean_str(x))
  #data_frame[data_frame_element] = data_frame[data_frame_element].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

  nltk.download('stopwords')

  # Build the set of English stopwords provided by NLTK:
  stop_words = set(stopwords.words('english'))

  # Create an empty list to hold the cleaned texts:
  text = []

  # Remove the stopwords from each text:
  for row in data_frame[data_frame_element].values:
    words_clean = sentence_clean(row)
    word_list = text_to_word_sequence(words_clean)
    no_stop_words = [w for w in word_list if not w in stop_words]
    no_stop_words = " ".join(no_stop_words)
    text.append(no_stop_words)

  # Keras' own Tokenizer is used here to turn the words into tokens
  tokenizer = Tokenizer(split=' ')

  # Tokens are created for every word across the texts:
  tokenizer.fit_on_texts(text)

  # Each text is converted into a list of tokens, i.e. its words
  # are mapped from strings to numbers:
  text_tokenized = tokenizer.texts_to_sequences(text)

  return text_tokenized
Code example #18
def Audio_file_Read(filename):
    universal_dict = {}
    cnt = {}
    gantu = [0, 0, 0, 0]
    analysis = {}
    token = Tokenizer()
    recog = Recognizer()
    try:
        audioFile = sr.AudioFile(filename)
        with audioFile as source:
            audio = recog.record(source)
            recognized = recog.recognize_google(audio, language="ko-KR")
            res = text_to_word_sequence(recognized)
            cnt = collections.Counter(res)
            universal_dict = dict(cnt)
            if "어" in universal_dict:
                gantu[0] = universal_dict["어"]
            if "아니" in universal_dict:
                gantu[1] = universal_dict["아니"]
            if "근데" in universal_dict:
                gantu[2] = universal_dict["근데"]
            if "이제" in universal_dict:
                gantu[3] = universal_dict["이제"]
            text = recognized
            analysis['text'] = text
            analysis['data'] = gantu
            return analysis
    except UnknownValueError:
        analysis['text'] = "당신이 말한 문장이 없습니다."
        analysis['data'] = [0, 0, 0, 0]
        return analysis
Code example #19
    def createData(self, data, maxLength, embedding):

        with open(os.path.join(os.getcwd(), 'synonyms.json'), "rb") as f:
            synonyms = json.load(f)
        self.tokenizer = Tokenizer(num_words=None)
        with open(os.path.join(os.getcwd(), 'bureau/models/maxlength.pkl'),
                  "rb") as f:
            self.max_len = pickle.load(f)
        self.x_train = data.train_data_frame['query'].tolist()
        self.y_train = data.train_data_frame['category'].tolist()
        self.tokenizer.fit_on_texts(list(self.x_train))
        for key in synonyms:
            if key in self.tokenizer.word_index:
                for synonym in synonyms[key]:
                    if synonym not in self.tokenizer.word_index:
                        self.tokenizer.word_index[
                            synonym] = self.tokenizer.word_index[key]

        if embedding != "custom":
            for i in range(len(self.x_train)):
                self.x_train[i] = text_to_word_sequence(self.x_train[i])
            self.y_train = to_categorical(self.y_train)

        if embedding == "custom":
            self.x_train = self.tokenizer.texts_to_sequences(self.x_train)
            self.x_train = pad_sequences(self.x_train, maxlen=self.max_len)

            self.y_train = to_categorical(self.y_train)
        self.word_index = self.tokenizer.word_index
        print("Setting Maximum Length to : ")
        print(self.max_len)
Code example #20
def keras_tokenizer(txt):
    from tensorflow.keras.preprocessing.text import text_to_word_sequence

    return text_to_word_sequence(
        txt,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=False,
        split=" ")
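A quick standalone illustration (sample text invented) of what the lower=False choice above changes:

from tensorflow.keras.preprocessing.text import text_to_word_sequence

print(text_to_word_sequence('Hello, World!', lower=False))  # ['Hello', 'World']
print(text_to_word_sequence('Hello, World!'))               # ['hello', 'world']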
Code example #21
def convert_text_to_indices(text, tokenized_dictionary):
    # `text_to_word_sequence` splits the text into a list of word tokens
    # (lowercased, punctuation stripped); words that are not in
    # tokenized_dictionary are simply skipped below.
    word_indices = []
    for word in kpt.text_to_word_sequence(text):
        if word in tokenized_dictionary:
            word_indices.append(tokenized_dictionary[word])
    return word_indices
Code example #22
def convert_text_to_index_array(text, dictionary):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." % word)
    return wordIndices
Code example #23
File: embedding_idf.py Project: LHofstee/asreview
def _get_freq_dict(all_text):
    text_dicts = []
    for text in all_text:
        cur_dict = {}
        word_sequence = text_to_word_sequence(text)
        for word in word_sequence:
            if word in cur_dict:
                cur_dict[word] += 1
            else:
                cur_dict[word] = 1
        text_dicts.append(cur_dict)
    return text_dicts
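Functionally this is the same as running collections.Counter over each word sequence; a minimal equivalent sketch (the function name is hypothetical):

from collections import Counter
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def get_freq_dict_counter(all_text):
    # one word-frequency dict per text, same shape as _get_freq_dict above
    return [dict(Counter(text_to_word_sequence(text))) for text in all_text]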
Code example #24
def get_transcriptions_align(path_to_transcriptions, filename,
                             path_to_transcriptions_to_align):
    f = open(os.path.join(path_to_transcriptions, filename), 'r').read()
    f = np.array(f.split('\n'))
    transcription = {}
    print("path_to_transcriptions:", path_to_transcriptions)
    print("path_to_transcriptions_to_align:", path_to_transcriptions_to_align)
    print("filename:", filename)
    for i in range(len(f) - 1):
        g = f[i]
        i1 = g.find(': ')
        i0 = g.find(' [')
        ind_id = g[:i0]
        ind_ts = g[i1+2:]
        ind_ts = text_to_word_sequence(ind_ts,filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n',
                                       lower=True,split=" ")
        align_t = []
        #print("ind_id_text:", g, flush=True)
        #print("ind_id:", ind_id, flush=True)
        align_file_name = path_to_transcriptions_to_align + filename[:-4] + "/" + str(ind_id) + ".wdseg"
        if os.path.exists(align_file_name):
            f_align = open(align_file_name, 'r').read()
            f_align = np.array(f_align.split('\n'))
            #print("f_align", f_align, flush=True)
            # use a separate index so the outer loop variable i is not reused
            for j in range(2, len(f_align) - 3):
                # print(f'f_align{j}', f_align[j])
                w = f_align[j].split()[3].split("(")[0]
                w = text_to_word_sequence(w,filters='!"#$%&()*+,-./:;=>?@[\\]^`{|}~\t\n',
                                           lower=True,split=" ")[0]
                if w in ind_ts:
                    align_t.append({'word': w,
                                    'SFrm': f_align[j].split()[0],
                                    'Efrm': f_align[j].split()[1]})
            #print("align_t", align_t, flush=True)
            #print("w_list", ind_ts, flush=True)
            assert len(align_t) == len(ind_ts)
            transcription[ind_id] = {'ind_ts': ind_ts,
                                   'align_t': align_t}
    return transcription
Code example #25
def get_array_from_directory(path):
    array = os.listdir(path)
    m = []
    for n in range(len(array)):

        with open(os.path.join(path, array[n]), encoding='utf8') as f:
            data = f.read()
            words = set(text_to_word_sequence(data))
            result = one_hot(data, round(len(words) * 1.3))
            m.append(result)

    m = pad_sequences(m, maxlen=2000)
    return m
Code example #26
    def preprocess(text):
        embeddings = []
        words = text_to_word_sequence(text)
        for word in words:
            if word in w2v:
                embeddings.append(w2v[word])

        cur_seq_len = len(embeddings)
        if cur_seq_len < max_len:
            embeddings = np.pad(embeddings, [(0, max_len - cur_seq_len), (0, 0)])
        else:
            embeddings = embeddings[cur_seq_len - max_len:]

        return embeddings
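As a standalone sketch of the padding/truncation step above (shapes are invented):

import numpy as np

max_len = 5
embeddings = np.ones((3, 4))                                 # 3 word vectors of dimension 4
print(np.pad(embeddings, [(0, max_len - 3), (0, 0)]).shape)  # (5, 4): zero rows appended
print(np.ones((7, 4))[7 - max_len:].shape)                   # (5, 4): keep only the last max_len rows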
Code example #27
def transform_sentence_complete(sentence):
    sentence = correct_mistakes(sentence)

    if not FLAGS.cs_custom_preprc:
        return sentence

    sentence = (process_sentence_ner_spacy(sentence) if FLAGS.cs_ner_spacy else sentence)
    sentence = ' '.join(text_to_word_sequence(sentence))

    sentence = expand_contractions(sentence)
    sentence = remove_possessives(sentence)
    sentence = remove_kill_words(sentence)

    return sentence
Code example #28
def create_L():
    L = []
    with open('reviews.txt', 'r') as f:
        for review in f.readlines():
            words = text_to_word_sequence(review)
            L += list(words)
    W = []
    for word in L:
        if word not in stop_words and word.isalpha() and len(word) > 2:
            W.append(word)

    word_counts = Counter(W)

    return L, W, word_counts
Code example #29
    def token_string(string: str) -> str:
        from tensorflow.keras.preprocessing.text import text_to_word_sequence
        token_list = text_to_word_sequence(string)
        token_except_stop_list = []
        import nltk
        #nltk.download('stopwords')
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        for token in token_list:
            if token not in stop_words:
                token_except_stop_list.append(token)

        #return " ".join(token_list)
        return " ".join(token_except_stop_list)
Code example #30
def tokenize(text,
             filters='\t\n',
             add_whitespace_op=False,
             lower=False,
             **kwargs):
    """
    add_whitespace_op (bool): add whitespace around operator
    """
    from tensorflow.keras.preprocessing.text import text_to_word_sequence

    tokenlist = text_to_word_sequence(text, filters=filters, lower=lower)
    if add_whitespace_op:
        tokenlist = add_whitespace(tokenlist)
    return tokenlist