import re


def only_text(a):
    """Strip non-letter characters from each element's text."""
    nr = len(a)
    text = []
    for i in range(0, nr):
        # keep letters only; replace everything else with a space
        t = re.sub("[^a-zA-Z]", " ", a[i].get_text())
        # t = t.encode('ascii')
        words = t.split()
        text.append(" ".join(words))
    return text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# requires the NLTK corpora: nltk.download('stopwords') and nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()  # module-level lemmatizer referenced below (instantiation assumed)


def clean_text(a):
    """Lowercase, strip non-letters, drop English stopwords and lemmatize each element's text."""
    nr = len(a)
    text = []
    # build the stopword set once instead of on every iteration
    stops = set(stopwords.words("english"))
    for i in range(0, nr):
        letters = re.sub("[^a-zA-Z]", " ", a[i].get_text())
        words = letters.lower().split()
        meaningful_words = [w for w in words if w not in stops]
        text.append(" ".join(lemmatizer.lemmatize(w) for w in meaningful_words))
    return text
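# Usage sketch (not part of the original source), assuming `a` is a list of
# BeautifulSoup elements; the sample HTML below is purely illustrative.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>The 2 cats were running!</p><p>Dogs bark.</p>", "html.parser")
paragraphs = soup.find_all("p")
print(only_text(paragraphs))   # ['The cats were running', 'Dogs bark']
print(clean_text(paragraphs))  # ['cat running', 'dog bark']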
def create_text_column(df):
    # create copy to modify
    text_df = df.copy()
    # store processed text
    text = []
    # for each file (row) in the df, read in the file
    for row_i in df.index:
        filename = df.iloc[row_i]['response']
        file_text = process_file(str(filename))
        # append processed text to list
        text.append(file_text)
    # add column to the copied dataframe
    text_df['text'] = text
    return text_df
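# `process_file` is called above but not defined in this snippet. A minimal
# placeholder sketch (hypothetical, not the original implementation), assuming
# the 'response' column holds paths to plain-text files:
def process_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        raw = f.read()
    # collapse whitespace so each file becomes one text string
    return " ".join(raw.split())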
import json
import pickle

import numpy as np
from keras.preprocessing import sequence
from keras.utils import to_categorical

imagePath = '/media/zxq/zxq/COCO/resized/'
savePath = './TextImageEmbedding.h5'

# Load caption file (captionPath is assumed to be defined earlier in the original script)
with open(captionPath, 'r') as f:
    data = json.load(f)

text = []
imageName = []
labels = []
trainVector = []

instanceSize = 340000
print('Train model on {} instances'.format(instanceSize))
validationSize = 10000

# Load captions and image names
for i in range(0, instanceSize):
    text.append(data[i]['caption'])
    imageName.append(data[i]['image_name'])
    labels.append(data[i]['label'])

# Load tokenizer from file
with open('./tokenizerV2.pickle', 'rb') as handle:
    Tokenizer = pickle.load(handle)

# Pre-processing of labels and texts
max_caption_length = 400
text = Tokenizer.texts_to_sequences(text)
text = np.asarray(text)
labels = np.asarray(labels)
text = sequence.pad_sequences(text, maxlen=max_caption_length)
labels = to_categorical(labels, num_classes=90)

# Get the train and validation labels
trainLabels = labels[:instanceSize - validationSize]
valLabels = labels[instanceSize - validationSize:]
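# Follow-up sketch (not in the original snippet): split the padded caption
# sequences the same way the labels are split above.
trainText = text[:instanceSize - validationSize]
valText = text[instanceSize - validationSize:]
print(trainText.shape, valText.shape)  # (330000, 400) and (10000, 400)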
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split


def tokenize(max_features, max_len, on='train', train_path='f:/avito/train.csv', test_path=None,
             tokenizer=None, clean_text=False, return_tokenizer=False, return_full_train=False):
    """
    Tokenize text.

    Read train and test data, process the description feature and tokenize it.

    Parameters:
    - max_features: tokenizer parameter;
    - max_len: padding length;
    - on: fit tokenizer on train or train + test;
    - train_path: path to train file;
    - test_path: path to test file;
    - tokenizer: pass a tokenizer with custom parameters or use the default one;
    - clean_text: apply text cleaning or not;
    - return_tokenizer: also return the fitted tokenizer;
    - return_full_train: also return train and validation data concatenated.
    """
    # check that "on" has a correct value
    assert on in ['train', 'all']

    print('Reading train data.')
    train = pd.read_csv(train_path, index_col=0)
    labels = train['deal_probability'].values
    train = train['description'].astype(str).fillna('')
    text = train

    # define tokenizer
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_features)

    # read test data whenever a test_path is given so it can be transformed below;
    # it is only added to the fitting text when on == 'all'
    if test_path:
        print('Reading test data.')
        test = pd.read_csv(test_path, index_col=0)
        test = test['description'].astype(str).fillna('')
        if on == 'all':
            text = pd.concat([text, test])

    # clean text
    if clean_text:
        pass  # print('Cleaning.')

    print('Fitting.')
    tokenizer.fit_on_texts(text)

    # split data
    X_train, X_valid, y_train, y_valid = train_test_split(train, labels, test_size=0.1, random_state=23)

    print('Converting to sequences.')
    X_train = tokenizer.texts_to_sequences(X_train)
    X_valid = tokenizer.texts_to_sequences(X_valid)
    if test_path:
        test = tokenizer.texts_to_sequences(test)

    print('Padding.')
    X_train = sequence.pad_sequences(X_train, maxlen=max_len)
    X_valid = sequence.pad_sequences(X_valid, maxlen=max_len)
    if test_path:
        test = sequence.pad_sequences(test, maxlen=max_len)

    data = {'X_train': X_train, 'X_valid': X_valid, 'y_train': y_train, 'y_valid': y_valid}
    if test_path:
        data['test'] = test
    if return_tokenizer:
        data['tokenizer'] = tokenizer
    if return_full_train:
        data['X'] = np.concatenate([X_train, X_valid])
        data['y'] = np.concatenate([y_train, y_valid])
    return data
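# Usage sketch (not part of the original source); paths and parameter values
# are only examples.
data = tokenize(max_features=100000, max_len=100,
                on='train', train_path='f:/avito/train.csv',
                return_tokenizer=True)
X_train, y_train = data['X_train'], data['y_train']
X_valid, y_valid = data['X_valid'], data['y_valid']
print(X_train.shape, X_valid.shape)  # (n_train, 100) and (n_valid, 100)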