import re

import numpy as np
from nltk.tokenize import WordPunctTokenizer

# Assumptions: `punc` and `get_id` are defined elsewhere in the original source;
# minimal stand-ins are provided here so the function runs on its own.
punc = re.compile(r'[^\w\s]', re.UNICODE)  # assumed: punctuation to replace with spaces

# Single tokenizer instance, reused across all sentences.
_tokenizer = WordPunctTokenizer()


def get_id(word):
    # Assumed behavior: look up a word's id, falling back to the '<0>' padding id.
    return word2idx_.get(word, word2idx_.get('<0>', 0))


def word2id(sentences, word2idx, seq_length):
    """Convert sentences to an (n, seq_length) id array plus true (unpadded) lengths."""
    idx = []
    all_length = []
    # get_id reads the vocabulary through this module-level global.
    global word2idx_
    word2idx_ = word2idx
    for sentence in sentences:
        try:
            if isinstance(sentence, bytes):  # accept bytes or str input
                sentence = sentence.decode('utf-8')
            sentence = re.sub(punc, u' ', sentence.strip()).strip()
            words = _tokenizer.tokenize(sentence)
        except Exception:
            print(sentence)
            continue  # skip sentences that fail to decode or tokenize
        if len(words) < seq_length:
            all_length.append(len(words))
            # Pad short sentences up to seq_length with the '<0>' token.
            words.extend(['<0>'] * (seq_length - len(words)))
        else:
            # Truncate long sentences; exact-length ones pass through unchanged.
            words = words[:seq_length]
            all_length.append(seq_length)
        ids = [get_id(w) for w in words]  # avoid shadowing the builtin `id`
        idx.append(ids)
    return np.array(idx), np.array(all_length)
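
# A minimal usage sketch (assumed, not from the original source): the vocabulary
# below is hypothetical, with '<0>' taken to be the padding token as above.
if __name__ == '__main__':
    vocab = {'<0>': 0, 'hello': 1, 'world': 2}
    ids, lengths = word2id([b'hello, world!'], vocab, seq_length=5)
    print(ids)      # [[1 2 0 0 0]] -- token ids padded with the '<0>' id
    print(lengths)  # [2]           -- true token count before padding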