Example #1
import re

def only_text(a):
    """Strip non-letter characters from a list of parsed HTML elements."""
    text = []
    for element in a:
        # keep only ASCII letters, replacing everything else with a space
        t = re.sub("[^a-zA-Z]", " ", element.get_text())
        # collapse repeated whitespace
        text.append(" ".join(t.split()))
    return text
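A minimal usage sketch, assuming the argument is a list of BeautifulSoup elements (the `<review>` markup below is purely illustrative):

from bs4 import BeautifulSoup

html = "<review>Great movie, 10/10!</review><review>Not worth $5...</review>"
soup = BeautifulSoup(html, "html.parser")
print(only_text(soup.find_all("review")))
# ['Great movie', 'Not worth']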
Example #2
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def clean_text(a):
    """Lower-case, strip non-letters, drop English stopwords and lemmatize."""
    text = []
    stops = set(stopwords.words("english"))  # build the stopword set once, outside the loop
    for element in a:
        letters = re.sub("[^a-zA-Z]", " ", element.get_text())
        words = letters.lower().split()
        meaningful_words = [w for w in words if w not in stops]
        text.append(" ".join(lemmatizer.lemmatize(w) for w in meaningful_words))
    return text
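clean_text also needs the NLTK stopword and WordNet data, which must be downloaded once; a hedged usage sketch (the sample HTML is illustrative):

import nltk
from bs4 import BeautifulSoup

nltk.download("stopwords")
nltk.download("wordnet")

soup = BeautifulSoup("<p>The cats were running quickly!</p>", "html.parser")
print(clean_text(soup.find_all("p")))
# ['cat running quickly']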
Example #3
def create_text_column(df):
    """Return a copy of df with a new 'text' column of processed file contents."""
    # create a copy to modify
    text_df = df.copy()

    # store processed text
    text = []

    # for each file (row) in the df, read in and process the file
    for row_i in df.index:
        filename = df.iloc[row_i]['response']   # assumes a default integer index
        file_text = process_file(str(filename))
        # append processed text to list
        text.append(file_text)

    # add column to the copied dataframe
    text_df['text'] = text

    return text_df
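create_text_column depends on a process_file helper that is not shown in this example; a minimal sketch of what it might do, assuming the goal is lower-cased text with punctuation stripped:

import re

def process_file(filename):
    # Hypothetical helper: read a file and return lower-cased text with
    # everything except letters and digits replaced by spaces.
    with open(filename, 'r', encoding='utf-8') as f:
        raw = f.read()
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", raw.lower())
    return " ".join(cleaned.split())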
Example #4
import json
import pickle
import numpy as np
from keras.preprocessing import sequence   # or tensorflow.keras.preprocessing
from keras.utils import to_categorical     # or tensorflow.keras.utils

imagePath = '/media/zxq/zxq/COCO/resized/'
savePath = './TextImageEmbedding.h5'
# captionPath should point to the COCO caption JSON; it is not defined in this snippet

# Load caption file
with open(captionPath, 'r') as f:
    data = json.load(f)

text = []
imageName = []
labels = []
trainVector = []
instanceSize = 340000
print('Train model on {} instances'.format(instanceSize))
validationSize = 10000

# Load captions, image names and labels
for i in range(instanceSize):
    text.append(data[i]['caption'])
    imageName.append(data[i]['image_name'])
    labels.append(data[i]['label'])

# Load the fitted tokenizer from file
with open('./tokenizerV2.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Pre-processing of labels and texts
max_caption_length = 400
text = tokenizer.texts_to_sequences(text)
labels = np.asarray(labels)
text = sequence.pad_sequences(text, maxlen=max_caption_length)
labels = to_categorical(labels, num_classes=90)

# Split the labels into train and validation parts
trainLabels = labels[:instanceSize - validationSize]
valLabels = labels[instanceSize - validationSize:]
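# The snippet above splits only the labels; a matching split of the padded
# caption sequences would presumably look like this (trainText / valText are
# illustrative names, not from the original code):
trainText = text[:instanceSize - validationSize]
valText = text[instanceSize - validationSize:]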
import numpy as np
import pandas as pd
from keras.preprocessing import sequence          # or tensorflow.keras.preprocessing
from keras.preprocessing.text import Tokenizer    # or tensorflow.keras.preprocessing.text
from sklearn.model_selection import train_test_split


def tokenize(max_features,
             max_len,
             on='train',
             train_path='f:/avito/train.csv',
             test_path=None,
             tokenizer=None,
             clean_text=False,
             return_tokenizer=False,
             return_full_train=False):
    """
    Tokenize text.

    Read train and test data, process description feature, tokenize it.
    Parameters:
    - on: fit tokenizer on train or train + test;
    - train_path: path to train file;
    - test_path: past to test file;
    - max_features: tokenizer parameter;
    - max_len: tokenizer parameter;
    - tokenizer: can pass tokenizer with different parameters or use a default one;
    - clean_text: apply text cleaning or not;
    """
    # check that "on" has a correct value.
    assert on in ['train', 'all']

    print('Reading train data.')
    train = pd.read_csv(train_path, index_col=0)
    labels = train['deal_probability'].values
    train = train['description'].astype(str).fillna('')
    text = train

    # define tokenizer
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_features)

    # read test data whenever a test path is given, so sequences can be returned later
    if test_path:
        print('Reading test data.')
        test = pd.read_csv(test_path, index_col=0)
        test = test['description'].astype(str).fillna('')

    if on == 'all':
        # fit the tokenizer on train + test descriptions
        text = pd.concat([text, test])

    # clean text (cleaning is not implemented in this snippet)
    if clean_text:
        pass

    print('Fitting.')
    tokenizer.fit_on_texts(text)

    # split data
    X_train, X_valid, y_train, y_valid = train_test_split(train,
                                                          labels,
                                                          test_size=0.1,
                                                          random_state=23)
    print('Converting to sequences.')
    X_train = tokenizer.texts_to_sequences(X_train)
    X_valid = tokenizer.texts_to_sequences(X_valid)
    if test_path:
        test = tokenizer.texts_to_sequences(test)

    print('Padding.')
    X_train = sequence.pad_sequences(X_train, maxlen=max_len)
    X_valid = sequence.pad_sequences(X_valid, maxlen=max_len)
    if test_path:
        test = sequence.pad_sequences(test, maxlen=max_len)

    data = {}
    data['X_train'] = X_train
    data['X_valid'] = X_valid
    data['y_train'] = y_train
    data['y_valid'] = y_valid
    if test_path:
        data['test'] = test

    if return_tokenizer:
        data['tokenizer'] = tokenizer

    if return_full_train:
        X = np.concatenate([X_train, X_valid])
        y = np.concatenate([y_train, y_valid])
        data['X'] = X
        data['y'] = y

    return data
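A minimal usage sketch, assuming the Avito train CSV exists at the default path and the Keras/scikit-learn imports shown above:

data = tokenize(max_features=100000,
                max_len=100,
                on='train',
                return_tokenizer=True)
X_train, y_train = data['X_train'], data['y_train']
X_valid, y_valid = data['X_valid'], data['y_valid']
fitted_tokenizer = data['tokenizer']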