import argparse
import csv
import os

from utils import get_cleaned_text


def main():
    argparser = argparse.ArgumentParser(
        description=("Clean the Quora dataset by removing "
                     "newlines in the data."))
    argparser.add_argument("dataset_input_path", type=str,
                           help=("The path to the raw Quora "
                                 "dataset to clean."))
    argparser.add_argument("dataset_output_path", type=str,
                           help=("The *folder* to write the cleaned file "
                                 "to. The name will just have _cleaned "
                                 "appended to it, before the extension."))
    config = argparser.parse_args()

    fieldnames = ['id', 'qid1', 'qid2', 'question1', 'question2',
                  'is_duplicate']
    clean_rows = []
    with open(config.dataset_input_path, 'r') as f:
        reader = csv.DictReader(f, fieldnames=fieldnames)
        for i, row in enumerate(reader):
            # Skip the first row: because fieldnames are passed explicitly,
            # the file's header row is read back as data.
            if i > 0:
                question1 = get_cleaned_text(row['question1'])
                question2 = get_cleaned_text(row['question2'])
                clean_rows.append({
                    'id': row['id'],
                    'qid1': row['qid1'],
                    'qid2': row['qid2'],
                    'question1': question1,
                    'question2': question2,
                    'is_duplicate': row['is_duplicate']
                })

    input_filename_full = os.path.basename(config.dataset_input_path)
    input_filename, input_ext = os.path.splitext(input_filename_full)
    out_path = os.path.join(config.dataset_output_path,
                            input_filename + "_cleaned" + input_ext)
    with open(out_path, "w", newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in clean_rows:
            writer.writerow(row)
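Every snippet here leans on a get_cleaned_text helper that is never defined in this section. For the Quora script above, whose stated job is removing newlines, a minimal sketch might look like the following; the whitespace-collapsing regex is an assumption, not the project's actual implementation:

import re

def get_cleaned_text(text):
    # Hypothetical sketch: collapse newlines and other runs of whitespace
    # into single spaces and strip the ends. The real helper evidently
    # accepts more options (see the emote/stopword arguments used in the
    # prediction snippet below).
    return re.sub(r'\s+', ' ', text).strip()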
def __iter__(self):
    """Iterator for the document set stored in the DB."""
    for pageid in self.get_page_ids():
        page_id = pageid[0]
        page_content = self.get_page_by_id(page_id)
        clean_text = utils.get_cleaned_text(page_content).split()
        yield clean_text
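Because __iter__ re-queries the database on every pass, the object can be handed to gensim as a streaming corpus without loading all pages into memory at once. A usage sketch, where PageStore is a hypothetical name for the class this method belongs to:

from gensim import corpora

store = PageStore()  # hypothetical class exposing the __iter__ above

# gensim builds the vocabulary by streaming one tokenized page at a time.
dictionary = corpora.Dictionary(store)
bow_corpus = [dictionary.doc2bow(page) for page in store]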
def get_cleaned_pages(self):
    pages = []
    for pageid in self.get_page_ids():
        page_id = pageid[0]
        page_content = self.get_page_by_id(page_id)
        clean_text = utils.get_cleaned_text(page_content).split()
        pages.append(clean_text)
    return pages
def get_cleaned_pages(self):
    # Variant of the method above that looks pages up by hash; the hash
    # rows come back as one-element tuples.
    pages = []
    for page_hash in self.get_page_hashes():
        page_content = self.get_page_by_hash(page_hash[0])
        clean_text = utils.get_cleaned_text(page_content).split()
        pages.append(clean_text)
    return pages
def do_content_prediction(test_data, num_classes, max_feature_length,
                          ckpt_dir=None):
    # Relies on module-level FLAGS, emotes, build_model, get_cleaned_text,
    # and get_comment_ids, as in the original module.

    # Load testing data.
    comments = []
    comments_ids = []
    with open(test_data, 'r') as f:
        for comment in f:
            comments.append(comment)
            comment = get_cleaned_text(comment, emotes,
                                       remove_stopwords=True,
                                       streamer=FLAGS.streamer,
                                       remove_emotes_or_words=False,
                                       digit_to_string=False)
            comments_ids.append(get_comment_ids(comment, max_feature_length))
    X = np.asarray(comments_ids, dtype='int32')

    ckpt_dir = (os.path.join(ckpt_dir, 'content') if ckpt_dir is not None
                else os.path.join('checkpoints', 'content'))

    # `num_filters` was an undefined name in the original; it is assumed
    # here to come from FLAGS like the other hyperparameters.
    model = build_model(num_filters=FLAGS.num_filters,
                        num_classes=num_classes,
                        sequence_max_length=max_feature_length,
                        embedding_size=FLAGS.embedding_size,
                        learning_rate=FLAGS.learning_rate,
                        load_pretrained_model=True,
                        ckpt_dir=ckpt_dir)

    predictions = model.predict(X, batch_size=FLAGS.batch_size, verbose=0)
    predictions = np.argmax(predictions, axis=1)  # most likely class per row
    return predictions
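get_comment_ids is called above but not shown; it evidently turns a cleaned comment into a fixed-length integer vector, since the results are stacked into an int32 matrix. A plausible token-level sketch, where the vocabulary lookup, the unknown-token id, and the zero padding are all assumptions:

def get_comment_ids(comment, max_feature_length, vocabulary=None,
                    pad_id=0, unk_id=1):
    # Hypothetical sketch: map tokens to ids, then truncate or right-pad so
    # every comment yields exactly max_feature_length ids.
    vocabulary = vocabulary or {}
    ids = [vocabulary.get(token, unk_id) for token in comment.split()]
    ids = ids[:max_feature_length]
    return ids + [pad_id] * (max_feature_length - len(ids))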
should_rebuild = False

# ### Dictionary ###
dictionary = utils.build_dictionary(content, should_rebuild, cfg.DICT_BACKUP)

# ### Corpus ###
corpus = utils.build_corpus(dictionary, content, should_rebuild,
                            cfg.CORPUS_BACKUP)

# ### LDA Model ###
bow = dictionary.doc2bow(utils.get_cleaned_text(query.lower()).split())
model = utils.build_model(dictionary, corpus, should_rebuild)
q_vec = model[bow]  # topic distribution ("query vector") for the query

# Report the word distribution of the query's most probable topic.
topic_details = model.print_topic(max(q_vec, key=lambda item: item[1])[0])

print('Dictionary Size = {}'.format(len(dictionary)))
print('Corpus Size = {}'.format(len(corpus)))
print('Topic Details: ')
print(topic_details)
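The should_rebuild flag and the commented-out load calls in the original suggest utils.build_dictionary follows a load-from-backup-unless-rebuilding pattern. A minimal sketch under that assumption; the signature mirrors the call above, but the body is otherwise a guess:

import os

from gensim import corpora

def build_dictionary(content, should_rebuild, backup_name):
    # Hypothetical sketch: reuse the saved dictionary when allowed,
    # otherwise build it from the cleaned documents and save a backup.
    dict_file = get_file_path(backup_name)  # same helper used above
    if not should_rebuild and os.path.exists(dict_file):
        return corpora.Dictionary.load(dict_file)
    dictionary = corpora.Dictionary(
        get_cleaned_text(doc).split() for doc in content)
    dictionary.save(dict_file)
    return dictionary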