# Assumed imports for this excerpt. Project-local helpers (Logger, load_data,
# clean_text, Embeds, convert_text2seq, get_embedding_matrix, split_data, Params,
# the model builders, train(), and the crawler components Dict, Requestor,
# Searcher, Parser, DataBase, SearchEngineUrlFetcher, TargetedCrawler) are
# defined elsewhere in the repo and are assumed to be imported there.
import datetime
import json
import logging
import os
import pickle
import re
import string
import sys
import time
from queue import Queue  # assumption: the crawler uses the standard-library Queue

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import KFold
from tensorflow.python.client import device_lib


def main(model,
         auxiliary=True,
         model_label='rcnn',
         rnn_type='gru',
         padding='pre',
         reg='s',
         prefix="crawl",
         embedding_file_type="word2vec",
         train_fname="./data/train.csv",
         test_fname="./data/test.csv",
         embeds_fname="./data/GoogleNews-vectors-negative300.bin",
         logger_fname="./logs/log-aws",
         mode="all",
         wrong_words_fname="./data/correct_words.csv",
         format_embeds="binary",
         config="./config.json",
         output_dir="./out",
         norm_prob=False,
         norm_prob_koef=1,
         gpus=0,
         char_level=False,
         random_seed=2018,
         num_folds=5):
    embedding_type = prefix + "_" + embedding_file_type
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    # swear_words = load_data(swear_words_fname, func=lambda x: set(x.T[0]), header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1] for val in x})

    tokinizer = RegexpTokenizer(r'\S+')
    regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    if model != 'mvcnn':
        embed_dim = 300
        embeds = Embeds(embeds_fname, embedding_file_type, format=format_embeds)

    if mode in ('preprocess', 'all'):
        logger.info('Generating indirect features...')
        # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda
        # Word count in each comment
        train_df['count_word'] = train_df["comment_text"].apply(lambda x: len(str(x).split()))
        test_df['count_word'] = test_df["comment_text"].apply(lambda x: len(str(x).split()))
        # Unique word count
        train_df['count_unique_word'] = train_df["comment_text"].apply(lambda x: len(set(str(x).split())))
        test_df['count_unique_word'] = test_df["comment_text"].apply(lambda x: len(set(str(x).split())))
        # Letter count
        train_df['count_letters'] = train_df["comment_text"].apply(lambda x: len(str(x)))
        test_df['count_letters'] = test_df["comment_text"].apply(lambda x: len(str(x)))
        # Punctuation count
        train_df["count_punctuations"] = train_df["comment_text"].apply(
            lambda x: len([c for c in str(x) if c in string.punctuation]))
        test_df["count_punctuations"] = test_df["comment_text"].apply(
            lambda x: len([c for c in str(x) if c in string.punctuation]))
        # Upper-case word count
        train_df["count_words_upper"] = train_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.isupper()]))
        test_df["count_words_upper"] = test_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.isupper()]))
        # Title-case word count
        train_df["count_words_title"] = train_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.istitle()]))
        test_df["count_words_title"] = test_df["comment_text"].apply(
            lambda x: len([w for w in str(x).split() if w.istitle()]))
        # Unique-word percentage in each comment
        train_df['word_unique_pct'] = train_df['count_unique_word'] * 100 / train_df['count_word']
        test_df['word_unique_pct'] = test_df['count_unique_word'] * 100 / test_df['count_word']
        # Punctuation percentage in each comment
        train_df['punct_pct'] = train_df['count_punctuations'] * 100 / train_df['count_word']
        test_df['punct_pct'] = test_df['count_punctuations'] * 100 / test_df['count_word']
        # Average word length
        train_df["mean_word_len"] = train_df["comment_text"].apply(
            lambda x: np.mean([len(w) for w in str(x).split()]))
        test_df["mean_word_len"] = test_df["comment_text"].apply(
            lambda x: np.mean([len(w) for w in str(x).split()]))
        # Upper-case word percentage
        train_df["words_upper_pct"] = train_df["count_words_upper"] * 100 / train_df['count_word']
        test_df["words_upper_pct"] = test_df["count_words_upper"] * 100 / test_df['count_word']
        # Title-case word percentage
        train_df["words_title_pct"] = train_df["count_words_title"] * 100 / train_df['count_word']
        test_df["words_title_pct"] = test_df["count_words_title"] * 100 / test_df['count_word']
        # Remove the raw count columns (the percentage features are kept)
        count_cols = ['count_word', 'count_unique_word', 'count_punctuations',
                      'count_words_upper', 'count_words_title']
        train_df = train_df.drop(columns=count_cols)
        test_df = test_df.drop(columns=count_cols)

        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'], tokinizer,
                                                    wrong_words_dict, regexps, autocorrect=False)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'], tokinizer,
                                                   wrong_words_dict, regexps, autocorrect=False)
        if reg == 'w':
            # remove all punctuation
            train_df.to_csv(os.path.join(output_dir, 'train_clear_w.csv'), index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear_w.csv'), index=False)
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            # split on \S+ and keep all punctuation
            train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'), index=False)
            test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'), index=False)
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))
        if mode == 'preprocess':
            return

    if mode == 'processed':
        if reg == 'w':
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear_w.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear_w.csv'))
        elif reg == 's':
            train_df = pd.read_csv(os.path.join(output_dir, 'train_clear.csv'))
            test_df = pd.read_csv(os.path.join(output_dir, 'test_clear.csv'))

    logger.info('Calc text length...')
    train_df.fillna('unknown', inplace=True)
    test_df.fillna('unknown', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(lambda words: len(words.split()))
    max_seq_len = np.round(train_df['text_len'].mean() + 3 * train_df['text_len'].std()).astype(int)
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data for the NN====
    logger.info('Converting texts to sequences...')
    max_words = 100000
    if char_level:
        max_seq_len = 1200

    train_df['comment_seq'], test_df['comment_seq'], word_index = convert_text2seq(
        train_df['comment_text_clear'].tolist(),
        test_df['comment_text_clear'].tolist(),
        max_words,
        max_seq_len,
        embeds,
        lower=True,
        char_level=char_level,
        uniq=True,
        use_only_exists_words=True,
        position=padding)
    logger.debug('Dictionary size = {}'.format(len(word_index)))

    logger.info('Preparing embedding matrix...')
    if model != 'mvcnn':
        embedding_matrix, words_not_found = get_embedding_matrix(embed_dim, embeds,
                                                                 max_words, word_index)
        logger.debug('Embedding matrix shape = {}'.format(np.shape(embedding_matrix)))
        logger.debug('Number of null word embeddings = {}'.format(
            np.sum(np.sum(embedding_matrix, axis=1) == 0)))

    # ====Train/test split data====
    # train/val
    x_aux = np.matrix([
        train_df["word_unique_pct"].tolist(),
        train_df["punct_pct"].tolist(),
        train_df["mean_word_len"].tolist(),
        train_df["words_upper_pct"].tolist(),
        train_df["words_title_pct"].tolist()
    ], dtype='float32').transpose((1, 0))
    x = np.array(train_df['comment_seq'].tolist())
    y = np.array(train_df[target_labels].values)
    x_train_nn, x_test_nn, x_aux_train_nn, x_aux_test_nn, y_train_nn, y_test_nn, train_idxs, test_idxs = \
        split_data(x, np.squeeze(np.asarray(x_aux)), y, test_size=0.2, shuffle=True, random_state=2018)

    # test set (auxiliary features must come from test_df, not train_df)
    test_df_seq = np.array(test_df['comment_seq'].tolist())
    test_aux = np.matrix([
        test_df["word_unique_pct"].tolist(),
        test_df["punct_pct"].tolist(),
        test_df["mean_word_len"].tolist(),
        test_df["words_upper_pct"].tolist(),
        test_df["words_title_pct"].tolist()
    ], dtype='float32').transpose((1, 0))
    test_df_seq_aux = np.squeeze(np.asarray(test_aux))
    y_nn = []
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ====Train models====
    params = Params(config)

    if model_label is None:
        logger.warn('Should choose a model to train')
        return
    if model_label == 'dense':
        model = dense(embedding_matrix,
                      num_classes,
                      max_seq_len,
                      dense_dim=params.get('dense').get('dense_dim'),
                      n_layers=params.get('dense').get('n_layers'),
                      concat=params.get('dense').get('concat'),
                      dropout_val=params.get('dense').get('dropout_val'),
                      l2_weight_decay=params.get('dense').get('l2_weight_decay'),
                      pool=params.get('dense').get('pool'),
                      train_embeds=params.get('dense').get('train_embeds'),
                      add_sigmoid=True,
                      gpus=gpus)
    if model_label == 'cnn':
        model = cnn(embedding_matrix,
                    num_classes,
                    max_seq_len,
                    num_filters=params.get('cnn').get('num_filters'),
                    l2_weight_decay=params.get('cnn').get('l2_weight_decay'),
                    dropout_val=params.get('cnn').get('dropout_val'),
                    dense_dim=params.get('cnn').get('dense_dim'),
                    train_embeds=params.get('cnn').get('train_embeds'),
                    n_cnn_layers=params.get('cnn').get('n_cnn_layers'),
                    pool=params.get('cnn').get('pool'),
                    add_embeds=params.get('cnn').get('add_embeds'),
                    auxiliary=auxiliary,
                    add_sigmoid=True,
                    gpus=gpus)
    if model_label == 'cnn2d':
        model = cnn2d(embedding_matrix,
                      num_classes,
                      max_seq_len,
                      num_filters=params.get('cnn2d').get('num_filters'),
                      l2_weight_decay=params.get('cnn2d').get('l2_weight_decay'),
                      dropout_val=params.get('cnn2d').get('dropout_val'),
                      dense_dim=params.get('cnn2d').get('dense_dim'),
                      train_embeds=params.get('cnn2d').get('train_embeds'),
                      add_embeds=params.get('cnn2d').get('add_embeds'),
                      auxiliary=auxiliary,
                      add_sigmoid=True,
                      gpus=gpus)
    if model_label == 'lstm':
        model = rnn(embedding_matrix,
                    num_classes,
                    max_seq_len,
                    l2_weight_decay=params.get('lstm').get('l2_weight_decay'),
                    rnn_dim=params.get('lstm').get('rnn_dim'),
                    dropout_val=params.get('lstm').get('dropout_val'),
                    dense_dim=params.get('lstm').get('dense_dim'),
                    n_branches=params.get('lstm').get('n_branches'),
                    n_rnn_layers=params.get('lstm').get('n_rnn_layers'),
                    n_dense_layers=params.get('lstm').get('n_dense_layers'),
                    train_embeds=params.get('lstm').get('train_embeds'),
                    mask_zero=params.get('lstm').get('mask_zero'),
                    kernel_regularizer=params.get('lstm').get('kernel_regularizer'),
                    recurrent_regularizer=params.get('lstm').get('recurrent_regularizer'),
                    activity_regularizer=params.get('lstm').get('activity_regularizer'),
                    dropout=params.get('lstm').get('dropout'),
                    recurrent_dropout=params.get('lstm').get('recurrent_dropout'),
                    auxiliary=auxiliary,
                    add_sigmoid=True,
                    gpus=gpus,
                    rnn_type='lstm')
    if model_label == 'gru':
        model = rnn(embedding_matrix,
                    num_classes,
                    max_seq_len,
                    l2_weight_decay=params.get('gru').get('l2_weight_decay'),
                    rnn_dim=params.get('gru').get('rnn_dim'),
                    dropout_val=params.get('gru').get('dropout_val'),
                    dense_dim=params.get('gru').get('dense_dim'),
                    n_branches=params.get('gru').get('n_branches'),
                    n_rnn_layers=params.get('gru').get('n_rnn_layers'),
                    n_dense_layers=params.get('gru').get('n_dense_layers'),
                    train_embeds=params.get('gru').get('train_embeds'),
                    mask_zero=params.get('gru').get('mask_zero'),
                    kernel_regularizer=params.get('gru').get('kernel_regularizer'),
                    recurrent_regularizer=params.get('gru').get('recurrent_regularizer'),
                    activity_regularizer=params.get('gru').get('activity_regularizer'),
                    dropout=params.get('gru').get('dropout'),
                    recurrent_dropout=params.get('gru').get('recurrent_dropout'),
                    auxiliary=auxiliary,
                    add_sigmoid=True,
                    gpus=gpus,
                    rnn_type='gru')
    if model_label == 'charrnn':
        model = charrnn(len(word_index),
                        num_classes,
                        max_seq_len,
                        rnn_dim=params.get('charrnn').get('rnn_dim'),
                        dropout_val=params.get('charrnn').get('dropout_val'),
                        auxiliary=auxiliary,
                        dropout=params.get('charrnn').get('dropout'),
                        recurrent_dropout=params.get('charrnn').get('recurrent_dropout'),
                        add_sigmoid=True,
                        gpus=gpus,
                        rnn_type=rnn_type)
    if model_label == 'cnn2rnn':
        model = cnn2rnn(embedding_matrix, num_classes, max_seq_len, rnn_type=rnn_type)
    if model_label == 'dpcnn':
        model = dpcnn(embedding_matrix,
                      num_classes,
                      max_seq_len,
                      num_filters=params.get('dpcnn').get('num_filters'),
                      dense_dim=params.get('dpcnn').get('dense_dim'),
                      add_sigmoid=True,
                      gpus=gpus)
    if model_label == 'rcnn':
        model = rcnn(embedding_matrix,
                     num_classes,
                     max_seq_len,
                     rnn_dim=params.get('rcnn').get('rnn_dim'),
                     dropout_val=params.get('rcnn').get('dropout_val'),
                     dense_dim=params.get('rcnn').get('dense_dim'),
                     train_embeds=params.get('rcnn').get('train_embeds'),
                     auxiliary=auxiliary,
                     dropout=params.get('rcnn').get('dropout'),
                     recurrent_dropout=params.get('rcnn').get('recurrent_dropout'),
                     add_sigmoid=True,
                     gpus=gpus,
                     rnn_type=rnn_type)
    if model_label == 'capsule':
        model = capsule(embedding_matrix,
                        num_classes,
                        max_seq_len,
                        auxiliary=auxiliary,
                        Num_capsule=params.get('capsule').get('Num_capsule'),
                        Routings=params.get('capsule').get('Routing'),
                        add_sigmoid=params.get('capsule').get('add_sigmoid'),
                        mask_zero=params.get('capsule').get('mask_zero'),
                        gpus=gpus,
                        rnn_type='gru')  # lstm may diverge but gru works better
    if model == 'mvcnn':
        embeds_fname1 = "./data/crawl-300d-2M.vec"  # "./data/crawl-300d-2M.vec word2vec-raw.txt
        embeds_fname2 = "./data/glove.840B.300d.txt"
        embeds_fname3 = "./data/GoogleNews-vectors-negative300.bin"
        embed_dim = 300
        embeds1 = Embeds(embeds_fname1, "glove", format='file')
        embeds2 = Embeds(embeds_fname2, "fasttext", format='file')
        embeds3 = Embeds(embeds_fname3, "word2vec", format='binary')
        embedding_matrix1, words_not_found1 = get_embedding_matrix(embed_dim, embeds1, max_words, word_index)
        embedding_matrix2, words_not_found2 = get_embedding_matrix(embed_dim, embeds2, max_words, word_index)
        # embedding_matrix3, words_not_found3 = get_embedding_matrix(embed_dim, embeds3, max_words, word_index)
        model = mvcnn(embedding_matrix1,
                      embedding_matrix2,
                      num_classes,
                      max_seq_len,
                      auxiliary=auxiliary,
                      gpus=gpus)

    # ====k-fold cross-validation split====
    logger.info('Run k-fold cross validation...')
    params = Params(config)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_seed)
    oof_train = np.zeros((x.shape[0], num_classes))
    oof_test_skf = []

    for i, (train_index, test_index) in enumerate(kf.split(x, y)):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train, x_aux_train, x_test, x_aux_test = \
            x[train_index], x_aux[train_index], x[test_index], x_aux[test_index]
        y_train, y_test = y[train_index], y[test_index]
        logger.info('Start training {}-th fold'.format(i))

        if auxiliary:
            inputs = [x_train, x_aux_train]
            inputs_val = [x_test, x_aux_test]
            output = [test_df_seq, test_df_seq_aux]
        else:
            inputs = x_train
            inputs_val = x_test
            output = test_df_seq

        hist = train(
            x_train=inputs,  # [x_train, x_aux_train] when the auxiliary input is enabled
            y_train=y_train,
            x_val=inputs_val,  # [x_test, x_aux_test]
            y_val=y_test,
            model=model,
            batch_size=params.get(model_label).get('batch_size'),
            num_epochs=params.get(model_label).get('num_epochs'),
            learning_rate=params.get(model_label).get('learning_rate'),
            early_stopping_delta=params.get(model_label).get('early_stopping_delta'),
            early_stopping_epochs=params.get(model_label).get('early_stopping_epochs'),
            use_lr_strategy=params.get(model_label).get('use_lr_strategy'),
            lr_drop_koef=params.get(model_label).get('lr_drop_koef'),
            epochs_to_drop=params.get(model_label).get('epochs_to_drop'),
            model_checkpoint_dir=os.path.join('.', 'model_checkpoint', reg, model_label,
                                              embedding_type, padding, str(i)),
            logger=logger)
        model.load_weights(os.path.join('.', 'model_checkpoint', reg, model_label,
                                        embedding_type, padding, str(i), 'weights.h5'))

        oof_train[test_index, :] = model.predict(inputs_val)  # model.predict([x_test, x_aux_test])
        proba = model.predict(output)  # model.predict([test_df_seq, test_df_seq_aux])
        oof_test_skf.append(proba)

        result = pd.read_csv("./data/sample_submission.csv")
        result[target_labels] = proba
        ithfold_path = "./cv/{}/{}/{}/{}/{}".format(reg, model_label, embedding_type, padding, i)
        if not os.path.exists(ithfold_path):
            os.makedirs(ithfold_path)
        result.to_csv(os.path.join(ithfold_path, 'sub.csv'), index=False)
        # model.save(os.path.join(ithfold_path, 'weights.h5'))

    # Dump oof_train and oof_test for later stacking.
    # oof_train:
    oof_train_path = "./cv/{}/{}/{}/{}/oof_train".format(reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_train_path):
        os.makedirs(oof_train_path)
    np.savetxt(os.path.join(oof_train_path, "oof_train.csv"), oof_train, fmt='%.24f', delimiter=' ')

    # oof_test: stacking version
    oof_test = np.array(oof_test_skf).mean(axis=0)
    oof_test_path = "./cv/{}/{}/{}/{}/oof_test".format(reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_test_path):
        os.makedirs(oof_test_path)
    np.savetxt(os.path.join(oof_test_path, "oof_test.csv"), oof_test, fmt='%.24f', delimiter=' ')

    # oof_test: submission version (bagged over folds)
    result[target_labels] = oof_test
    oof_test_bag_path = "./cv/{}/{}/{}/{}/bagged".format(reg, model_label, embedding_type, padding)
    if not os.path.exists(oof_test_bag_path):
        os.makedirs(oof_test_bag_path)
    result.to_csv(os.path.join(oof_test_bag_path, "sub.csv"), index=False)
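
# ---------------------------------------------------------------------------
# Hypothetical CLI wrapper (not part of the original script): a minimal sketch
# showing how main() above could be invoked from the command line. The flag
# names are illustrative and simply forward a few of main()'s keyword
# arguments; data and embedding paths fall back to main()'s defaults.
# ---------------------------------------------------------------------------
def cli():
    import argparse

    parser = argparse.ArgumentParser(description='Train a toxic-comment model with k-fold CV')
    parser.add_argument('--model', default='rcnn',
                        help="architecture name forwarded to main(), e.g. 'rcnn' or 'mvcnn'")
    parser.add_argument('--model-label', default='rcnn',
                        help="which builder to use: dense, cnn, cnn2d, lstm, gru, charrnn, "
                             "cnn2rnn, dpcnn, rcnn or capsule")
    parser.add_argument('--rnn-type', default='gru')
    parser.add_argument('--padding', default='pre')
    parser.add_argument('--mode', default='all', choices=['all', 'preprocess', 'processed'])
    parser.add_argument('--gpus', type=int, default=0)
    parser.add_argument('--num-folds', type=int, default=5)
    args = parser.parse_args()

    main(args.model,
         model_label=args.model_label,
         rnn_type=args.rnn_type,
         padding=args.padding,
         mode=args.mode,
         gpus=args.gpus,
         num_folds=args.num_folds)
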
class TopicalCrawler:
    def __init__(self, name, config, topics, pages):
        self.name = name
        self.config = config
        self.topics = topics
        self.site_alias = {
            target['host']: target.get('alias', '')
            for target in self.config.targets
        }
        self.url_queries = Queue()
        self.current_info = Dict()
        self.queue_dict = {
            target['host']: Queue()
            for target in self.config.targets
        }
        self.current_info_dict = {
            target['host']: Dict()
            for target in self.config.targets
        }
        self.save_breakpoint_path = os.path.join(self.config.save.tmp, f'{self.name}.breakpoint')
        if os.path.isfile(self.save_breakpoint_path):
            self._load_breakpoint()
        self.request = Requestor(self.config)
        # self.searcher = Searcher(self.config)
        self.searcher = Searcher('baidu')
        self.parser = Parser()
        self.database = DataBase(**self.config.save.database.login)
        # self.reset_database()
        self.table = 'news'
        self.logger = Logger(filename=self.config.save.log)
        self.url_fetcher = SearchEngineUrlFetcher(
            request=self.request,
            searcher=self.searcher,
            queries=self.url_queries,
            current_info=self.current_info,
            hosts=[target['host'] for target in self.config.targets],
            topics=self.topics,
            pages=pages,
            queue_dict=self.queue_dict,
        )
        # one thread per host
        self.threads = [
            TargetedCrawler(
                self.request,
                self.parser,
                self.url_fetcher,
                self.database,
                self.table,
                self.logger,
                target,
                self.queue_dict[target['host']],
                self.current_info_dict[target['host']],
            ) for target in config.targets
        ]

    def __call__(self, string, option='url'):
        if option == 'url':
            self.parse(string)
        elif option == 'keyword':
            urls = self.search(string)
            data = []
            for url in urls:
                soup, url, host = self.request(url)
                data.append(self.parse(soup, host))
            return data
        else:
            raise ValueError("option should be 'url' or 'keyword'")

    def test_search(self, keyword, host=None, page=1):
        query = self.searcher.construct_query(keyword, host, page)
        soup, _ = self.request(query)
        urls = self.searcher.parse(soup)
        return urls

    def test_parse(self, url, remain_query_key=None):
        soup, url = self.request(url, remain_query_key=remain_query_key, no_headers=True)
        data = self.parser(soup, url['netloc'], url['tld'])
        next_page_url = data.pop('next', None)
        # Follow pagination links and concatenate the article text.
        while next_page_url:
            try:
                _soup, _url = self.request(next_page_url)
                _data = self.parser(_soup, _url['netloc'], _url['tld'])
                data['text'] += _data['text']
                next_page_url = _data.pop('next', None)
            except Exception:
                break
        return {
            **data,
            'url': url['url'],
            'site': self.site_alias.get(url['tld']) or self.site_alias.get(url['netloc'], '')
        }

    def _load_breakpoint(self):
        with open(self.save_breakpoint_path, 'rb') as f:
            temp = pickle.load(f)
        for q in temp['url_queries']:
            self.url_queries.put(q)
        for host, queue_ in temp['queue_dict'].items():
            for q in queue_:
                self.queue_dict[host].put(q)

    def _save_breakpoint(self):
        temp = {
            'url_queries': ([self.current_info.query] if self.current_info.get('query') else []) +
                           [self.url_queries.get() for _ in range(self.url_queries.qsize())],
            'queue_dict': {
                target: ([self.current_info_dict[target].query]
                         if self.current_info_dict[target].get('query') else []) +
                        [queue_.get() for _ in range(queue_.qsize())]
                for target, queue_ in self.queue_dict.items()
            }
        }
        with open(self.save_breakpoint_path, 'wb') as f:
            pickle.dump(temp, f)

    def _cleanup(self):
        try:
            self.database.close()
        except Exception:
            pass

    def run(self):
        self._init_topics()
        self.url_fetcher.start()
        for t in self.threads:
            t.start()
        try:
            # Poll until the URL fetcher and every per-host crawler thread have finished.
            while True:
                time.sleep(1)
                if self.url_fetcher.is_alive():
                    continue
                for t in self.threads:
                    if t.is_alive():
                        break
                else:
                    break
        except KeyboardInterrupt:
            self.logger.info('keyboard interrupt by user')
            self._save_breakpoint()
            self._cleanup()
        except Exception:
            self.logger.error(exc_info=sys.exc_info())
            self._save_breakpoint()
            self._cleanup()

    def _init_topics(self):
        for topic in self.topics:
            if not isinstance(topic['keywords'], list):
                topic['keywords'] = [topic['keywords']]
            temp_keywords_str = json.dumps(topic['keywords'])
            if not self.database.select('topic', name=topic['name']):
                self.database.insert('topic',
                                     name=topic['name'],
                                     keywords=temp_keywords_str,
                                     entry_time=datetime.datetime.now(),
                                     remark=topic.get('remark', ''))
            else:
                self.database.update(
                    {'table': 'topic', 'constraints': {'name': topic['name']}},
                    remark=topic.get('remark', ''),
                    keywords=temp_keywords_str)
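
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original source): shows how
# TopicalCrawler could be wired up. The config object only needs the attributes
# the class actually reads (.targets, .save.tmp, .save.log, .save.database.login);
# the host, paths and database credentials below are placeholders.
# ---------------------------------------------------------------------------
def demo_crawler():
    from types import SimpleNamespace

    config = SimpleNamespace(
        targets=[{'host': 'news.example.com', 'alias': 'Example News'}],
        save=SimpleNamespace(
            tmp='./tmp',
            log='./logs/crawler.log',
            database=SimpleNamespace(login={'host': 'localhost', 'user': 'crawler',
                                            'password': 'changeme', 'db': 'news'}),
        ),
    )
    topics = [{'name': 'ai', 'keywords': ['artificial intelligence'], 'remark': 'demo topic'}]
    crawler = TopicalCrawler('demo', config, topics, pages=3)
    # crawler.test_search('artificial intelligence', host='news.example.com')  # inspect search hits first
    crawler.run()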